Compare commits

...

108 Commits
0.17 ... 0.18

Author SHA1 Message Date
Paul Masurel
f0a2b1cc44 Bumped tantivy and subcrate versions. 2022-05-25 22:50:33 +09:00
Paul Masurel
fcfdc44c61 Bumped tantivy-grammar version 2022-05-25 21:52:46 +09:00
Paul Masurel
3171f0b9ba Added ZSTD support in CHANGELOG 2022-05-25 21:51:46 +09:00
PSeitz
89e19f14b5 Merge pull request #1374 from kryesh/main
Add Zstd compression support, Make block size configurable via IndexSettings
2022-05-25 07:39:46 +02:00
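The two features merged here can be exercised from user code roughly as follows. This is a hedged sketch: it assumes the `zstd-compression` cargo feature is enabled and that `IndexSettings` gained `docstore_compression` and `docstore_blocksize` fields with a `Compressor::Zstd` variant, as suggested by the PR title; the released API may differ in detail.

```rust
// Hedged sketch: configure zstd doc-store compression and a larger block size
// through IndexSettings. Field and variant names are assumed from this PR.
use tantivy::schema::{Schema, TEXT};
use tantivy::store::Compressor;
use tantivy::{Index, IndexSettings};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    let settings = IndexSettings {
        // Compress stored documents with zstd instead of the default codec.
        docstore_compression: Compressor::Zstd,
        // Larger blocks can yield better compression ratios on large documents.
        docstore_blocksize: 32_768,
        ..IndexSettings::default()
    };
    let index = Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;
    let _ = index;
    Ok(())
}
```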
PSeitz
1a6a1396cd Merge pull request #1376 from saroh/json-example
Add examples to explain default field handling in the json example
2022-05-24 07:09:37 +02:00
saroh
e766375700 remove useless example 2022-05-23 19:49:31 +02:00
PSeitz
496b4a4fdb Update examples/json_field.rs 2022-05-23 12:24:36 +02:00
PSeitz
93cc8498b3 Update examples/json_field.rs 2022-05-23 11:59:42 +02:00
PSeitz
0aa3d63a9f Update examples/json_field.rs 2022-05-23 11:39:45 +02:00
PSeitz
4e2a053b69 Update examples/json_field.rs 2022-05-23 11:27:05 +02:00
Paul Masurel
71c4393ec4 Clippy 2022-05-23 10:20:37 +09:00
saroh
b2e97e266a more examples to explain default field handling 2022-05-21 17:36:39 +02:00
Antoine G
9ee4772140 Fix deps for unicode regex compiling (#1373)
* lint doc warning

* fix regex build
2022-05-20 10:18:44 +09:00
Kryesh
c95013b11e Add zstd-compression feature to github workflow tests 2022-05-19 22:15:18 +10:00
Kryesh
fc045e6bf9 Cleanup imports, remove unneeded error mapping 2022-05-19 10:34:02 +10:00
Kryesh
6837a4d468 Fix bench 2022-05-18 20:35:29 +10:00
Kryesh
0759bf9448 Cleanup zstd structure and serialise to u32 in line with lz4 2022-05-18 20:31:22 +10:00
Kryesh
152e8238d7 Fix silly errors from running tests without feature flag 2022-05-18 19:49:10 +10:00
Kryesh
d4e5b48437 Apply feedback - standardise on u64 and fix correct compression bounds 2022-05-18 19:37:28 +10:00
Kryesh
03040ed81d Add Zstd compression support 2022-05-18 14:04:43 +10:00
Kryesh
aaa22ad225 Make block size configurable to allow for better compression ratios on large documents 2022-05-18 11:13:15 +10:00
Antoine G
3223bdf254 Refactorize PhraseScorer::compute_phrase_match (#1364)
* Refactorize PhraseScorer::compute_phrase_match
* implem optim for slop
2022-05-13 09:57:21 +09:00
dependabot[bot]
cbd06ab189 Update pprof requirement from 0.8.0 to 0.9.0 (#1365)
Updates the requirements on [pprof](https://github.com/tikv/pprof-rs) to permit the latest version.
- [Release notes](https://github.com/tikv/pprof-rs/releases)
- [Changelog](https://github.com/tikv/pprof-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/tikv/pprof-rs/commits)

---
updated-dependencies:
- dependency-name: pprof
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2022-05-11 11:42:04 +09:00
Paul Masurel
749395bbb8 Added rustdoc for MultiFruit extract function (#1369) 2022-05-11 11:41:39 +09:00
Paul Masurel
617ba1f0c0 Bugfix in the document deserialization. (#1368)
Deserializing a json field no longer expects the
end of the document to follow.

This behavior is well documented in serde_json.
https://docs.serde.rs/serde_json/fn.from_reader.html

Closes #1366
2022-05-11 11:38:10 +09:00
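For context, the serde_json behavior this bugfix works around can be shown with a standalone sketch (plain serde_json, not the tantivy code that was changed): `serde_json::from_str`/`from_reader` insist on consuming the whole input, while driving a `Deserializer` manually parses one value and leaves the rest untouched.

```rust
// Standalone illustration of the serde_json behavior referenced above;
// this is not the tantivy code itself.
use serde::Deserialize;
use serde_json::{Deserializer, Value};

fn main() {
    let input = r#"{"title": "of mice and men"} trailing garbage"#;

    // Fails with a "trailing characters" error: the document must end here.
    assert!(serde_json::from_str::<Value>(input).is_err());

    // Succeeds: only the first JSON value is consumed.
    let mut de = Deserializer::from_str(input);
    let doc = Value::deserialize(&mut de).unwrap();
    assert_eq!(doc["title"], "of mice and men");
}
```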
Paul Masurel
2f1cd7e7f0 Bugfix in the document deserialization. (#1367)
Deserializing a json field no longer expects the
end of the document to follow.

This behavior is well documented in serde_json.
https://docs.serde.rs/serde_json/fn.from_reader.html

Closes #1366
2022-05-11 11:27:04 +09:00
PSeitz
58c0cb5fc4 Merge pull request #1357 from saroh/1302-json-term-writer-API
Expose helpers to generate json field writer terms
2022-05-10 11:02:05 +08:00
PSeitz
7f45a6ac96 allow setting tokenizer manager on index (#1362)
handle json in tokenizer_for_field
2022-05-09 18:15:45 +09:00
saroh
0ade871126 rename constructor to be more explicit 2022-05-06 13:29:07 +02:00
PSeitz
aab65490c9 Merge pull request #1358 from quickwit-oss/fix_docs
add alias shard_size to split_size for quickwit
2022-05-06 18:41:34 +08:00
Pascal Seitz
d77e8de36a flip alias variable name 2022-05-06 17:52:36 +08:00
Pascal Seitz
d11a8cce26 minor docs fix 2022-05-06 17:52:36 +08:00
Pascal Seitz
bc607a921b add alias shard_size to split_size for quickwit
improve some docs
2022-05-06 17:52:36 +08:00
Paul Masurel
1273f33338 Fixed comment. 2022-05-06 18:35:25 +09:00
Paul Masurel
e30449743c Shortens blocks' last_key in the SSTable block index. (#1361)
Right now we store the last key in the blocks of the SSTable index.
This PR replaces the last key with a shorter string that is greater
than or equal to it and still less than the next key.
This property is sufficient to ensure the block index
works properly.

Related to quickwit#1366
2022-05-06 16:29:06 +08:00
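The key-shortening idea can be sketched independently of the SSTable code. This is a minimal illustration of the property described above (a separator `s` with `last_key <= s < next_key`), not tantivy's actual implementation.

```rust
// Minimal sketch: given the last key of a block and the first key of the next
// block (with last_key < next_key), return a string that is >= last_key,
// < next_key, and usually shorter than last_key.
fn shortened_separator(last_key: &[u8], next_key: &[u8]) -> Vec<u8> {
    debug_assert!(last_key < next_key);
    // Length of the common prefix of both keys.
    let common = last_key
        .iter()
        .zip(next_key)
        .take_while(|(a, b)| a == b)
        .count();
    if common + 1 < next_key.len() {
        // This prefix diverges upward from last_key at position `common`,
        // so it is >= last_key, and as a strict prefix of next_key it is
        // still < next_key.
        next_key[..common + 1].to_vec()
    } else {
        // The candidate would equal next_key; fall back to the full last key.
        last_key.to_vec()
    }
}

fn main() {
    assert_eq!(shortened_separator(b"automobile", b"autumn"), b"autu".to_vec());
    assert_eq!(shortened_separator(b"abc", b"abd"), b"abc".to_vec());
}
```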
Paul Masurel
ed26552296 Minor changes in query parsing for quickwit#1334. (#1356)
Quickwit still relies heavily on generating field names
containing a '.' for nested objects, yet allows
user-defined field names to contain a dot.

In order to reuse the tantivy query parser, we will end up
using quickwit field names directly in tantivy.
Only '.' will be escaped.

This PR makes minor changes to how the tantivy query parser parses
a field name and resolves it to a field.
Some of the new edge case behavior is hacky.

Closes #1355
2022-05-06 13:20:10 +09:00
Saroh
65d129afbd better function names 2022-05-05 10:12:28 +02:00
Antoine G
386ffab76c Fix documentation regression (#1359)
This was breaking the docs on docs.rs, as the type alias seems to shadow the struct: https://docs.rs/tantivy/latest/tantivy/termdict/type.TermDictionary.html
It was introduced by #1293, which may not have been up to date with what was done in #1242
2022-05-05 14:59:25 +09:00
Pasha Podolsky
57a8d0359c Make FruitHandle and MultiFruit public (#1360)
* Make `FruitHandle` and `MultiFruit` public

* Add docs for `MultiFruit` and `FruitHandle`
2022-05-05 14:58:33 +09:00
Saroh
14cb66ee00 move helper to indexer module 2022-05-04 18:01:57 +02:00
Saroh
9e38343352 expose helpers for json field writer manipulation
closes #1302
2022-05-04 18:01:45 +02:00
PSeitz
944302ae2f Merge pull request #1350 from quickwit-oss/update_edition
update edition
2022-05-04 11:02:52 +02:00
Paul Masurel
be70804d17 Removed AtomicUsize. 2022-05-04 16:45:24 +09:00
PSeitz
a1afc80600 Update src/core/executor.rs
Co-authored-by: Paul Masurel <paul@quickwit.io>
2022-05-04 08:39:44 +02:00
Paul Masurel
02e24fda52 Clippy fix 2022-05-04 12:24:07 +09:00
PSeitz
7e3c0c5392 Merge pull request #1353 from quickwit-oss/fix_docs
minor docs fixes
2022-05-02 07:48:25 +02:00
Pascal Seitz
fdb2524f9e minor docs fixes 2022-05-02 12:26:12 +08:00
Pascal Seitz
4db655ae82 update dependencies, update edition 2022-04-28 22:50:55 +08:00
Pascal Seitz
bb44cc84c4 update dependencies 2022-04-28 20:55:36 +08:00
PSeitz
8c1e1cf1ad Merge pull request #1349 from quickwit-oss/fix_error_message
print whole query on syntax error
2022-04-28 09:31:45 +02:00
Pascal Seitz
b5b16948b0 print whole query on syntax error 2022-04-27 12:48:30 +08:00
PSeitz
c305d3a2a2 Merge pull request #1346 from quickwit-oss/term_agg
term agg
2022-04-26 07:08:07 +02:00
PSeitz
038d234ff1 Merge pull request #1347 from quickwit-oss/query_parser_error
fix query parser error field not found
2022-04-26 07:01:48 +02:00
Pascal Seitz
c45eb9a9fa improve readability, add json test 2022-04-26 11:22:34 +08:00
Pascal Seitz
824d6f96fe return query on parse error 2022-04-22 16:11:36 +08:00
Pascal Seitz
7cf821bac0 fix query parser error field not found 2022-04-22 12:40:00 +08:00
PSeitz
ae83fc8298 bump uuid to 1.0 (#1345) 2022-04-22 10:02:24 +09:00
dependabot[bot]
a7bc361145 Update pprof requirement from 0.7 to 0.8 (#1343)
Updates the requirements on [pprof](https://github.com/tikv/pprof-rs) to permit the latest version.
- [Release notes](https://github.com/tikv/pprof-rs/releases)
- [Changelog](https://github.com/tikv/pprof-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/tikv/pprof-rs/commits)

---
updated-dependencies:
- dependency-name: pprof
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2022-04-21 09:35:13 +09:00
Pascal Seitz
2805291400 minor fixes 2022-04-20 14:22:44 +08:00
Pascal Seitz
6614a2cba0 fix is_fast for bytes field 2022-04-20 12:02:38 +08:00
Pascal Seitz
6f4d203d1b return error on missing sub aggregation 2022-04-20 11:19:36 +08:00
Pascal Seitz
1be6c6111c support order property on term aggregations
order can be by doc_count, key, or a metric sub_aggregation
2022-04-20 00:34:38 +08:00
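A hedged sketch of what such a request looks like on the user side, assuming the Elasticsearch-flavoured JSON accepted by tantivy's aggregation module. The field and aggregation names ("genre", "score", "genres", "avg_score") are made up for illustration; the exact JSON spelling of the `order` entry mentioned in the commit is not reproduced here.

```rust
// Hedged sketch: deserializing an Elasticsearch-style terms aggregation
// request with a metric sub-aggregation into tantivy's request type.
// Per the commit, an additional "order" entry (by doc_count, key, or a metric
// sub-aggregation) can also be placed inside the "terms" object.
use tantivy::aggregation::agg_req::Aggregations;

fn main() -> serde_json::Result<()> {
    let request: Aggregations = serde_json::from_str(
        r#"{
            "genres": {
                "terms": { "field": "genre" },
                "aggs": {
                    "avg_score": { "avg": { "field": "score" } }
                }
            }
        }"#,
    )?;
    println!("parsed {} top-level aggregation(s)", request.len());
    Ok(())
}
```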
PSeitz
c7c3eab256 Merge pull request #1340 from PSeitz/term_agg
fix collecting term_dict field names
2022-04-18 08:21:27 +02:00
Pascal Seitz
ec69875d15 fix collecting term_dict field names
fix collecting term_dict field names for sub_aggregations, minor refactoring
2022-04-15 17:49:20 +08:00
PSeitz
d832cfcfd8 Merge pull request #1329 from quickwit-oss/term_agg
add term aggregation
2022-04-14 14:45:21 +08:00
Pascal Seitz
ab6b532cc4 add comments 2022-04-14 12:06:36 +08:00
Pascal Seitz
4b6047f7d7 return Option from as_ methods 2022-04-14 10:48:36 +08:00
Pascal Seitz
5ca04beb94 add min_doc_count test 2022-04-13 19:51:18 +08:00
Pascal Seitz
902d05ebec refactor getffreader function 2022-04-13 19:51:18 +08:00
Pascal Seitz
f1b298642a remove unnecessary benchmarks 2022-04-13 19:51:18 +08:00
Pascal Seitz
dd13dedaeb forward errors, remove unwrap 2022-04-13 19:51:18 +08:00
Pascal Seitz
46724b4a05 add segment_size, add get term dict fields, add tests 2022-04-13 19:51:18 +08:00
Pascal Seitz
24432bf523 add term aggregation 2022-04-13 19:51:18 +08:00
PSeitz
31d3bcfff2 Merge pull request #1334 from PSeitz/minor_fixes
fix DateTime naming, fix docs, cleanup
2022-04-13 13:13:57 +08:00
Pascal Seitz
706fbd6886 fix DateTime naming, fix docs, cleanup 2022-04-13 13:01:00 +08:00
PSeitz
8a8a048015 fix coverage (#1335) 2022-04-13 13:47:47 +09:00
dependabot[bot]
c72549cb9a Bump codecov/codecov-action from 2 to 3 (#1328)
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 2 to 3.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/master/CHANGELOG.md)
- [Commits](https://github.com/codecov/codecov-action/compare/v2...v3)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2022-04-11 21:26:52 +09:00
PSeitz
d6f803212c Merge pull request #1325 from quickwit-oss/term_agg
fast field on string
2022-04-04 15:34:31 +08:00
Pascal Seitz
dac73537d2 update changelog 2022-04-04 14:15:40 +08:00
Pascal Seitz
bb5254de12 always serialize, use enum as param 2022-04-04 13:50:23 +08:00
Maxim Kraynyuchenko
be5218c2f6 Company Logos were not visible in Dark Theme. (#1326) 2022-04-04 11:53:31 +09:00
Pascal Seitz
ec9478830a add text test
move get multiple values to test code
remove sorting term ids per doc id for non-facets
2022-03-30 11:31:33 +08:00
Pascal Seitz
8807bfd13d fast field on string
enables FAST on string fields, which creates a fastfield containing the term ordinals
2022-03-29 12:40:10 +08:00
Maxim Kraynyuchenko
447811c111 Update the following README sections: features, benchmark illustration & FAQ. (#1318)
* Updated features, benchmark illustration & FAQ.
* Updated README: Feat,Graph,Non-Feat,Companies,FAQ
2022-03-23 10:02:09 +09:00
PSeitz
f29acf5d8c fix clippy (#1321) 2022-03-22 12:48:23 +09:00
Uwe Klotz
125707dbe0 Replace chrono with time (#1307)
For date values `chrono` has been replaced with `time` 
- The `time` crate is re-exported as `tantivy::time` instead of `tantivy::chrono`.
- The type alias `tantivy::DateTime` has been removed.
- `Value::Date` wraps `time::PrimitiveDateTime` without time zone information.
- Internally date/time values are stored as seconds since UNIX epoch in UTC.
- Converting a `time::OffsetDateTime` to `Value::Date` implicitly converts the value into UTC.
If this is not desired, do the time zone conversion yourself and use `time::PrimitiveDateTime`
directly instead.

Closes #1304
2022-03-21 10:50:19 +09:00
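A hedged sketch of what date handling looks like after this change, assuming the re-export and the `Value::Date(PrimitiveDateTime)` shape described above:

```rust
// Hedged sketch of the 0.18 date API described above: `time` is re-exported
// as `tantivy::time`, and `Value::Date` wraps a `time::PrimitiveDateTime`
// interpreted as UTC.
use tantivy::schema::Value;
use tantivy::time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};

fn main() {
    // Start from any `OffsetDateTime`; here we just take the current instant.
    let odt: OffsetDateTime = OffsetDateTime::now_utc();
    // Do the time zone conversion explicitly (this mirrors what the implicit
    // conversion to `Value::Date` is documented to do), then drop the offset.
    let utc = odt.to_offset(UtcOffset::UTC);
    let value = Value::Date(PrimitiveDateTime::new(utc.date(), utc.time()));
    println!("{:?}", value);
}
```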
Paul Masurel
46d5de920d Removes all usage of block_on, and use a oneshot channel instead. (#1315)
* Removes all usage of block_on, and use a oneshot channel instead.

Calling `block_on` panics in certain contexts.
For instance, it panics when it is called in the context of another
blocking call.

Using it in tantivy is unnecessary. We replace it with a thin wrapper
around a oneshot channel that supports both async and sync callers.

* Removing needless uses of async in the API.

Co-authored-by: PSeitz <PSeitz@users.noreply.github.com>
2022-03-18 16:54:58 +09:00
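The pattern described here can be sketched with the `oneshot` crate that this changeset adds as a dependency. This is a minimal illustration of the idea, not tantivy's actual wrapper type.

```rust
// Minimal sketch of the sync/async pattern described above, using the
// `oneshot` crate added as a dependency in this changeset.
fn spawn_computation() -> oneshot::Receiver<u64> {
    let (sender, receiver) = oneshot::channel();
    std::thread::spawn(move || {
        // Some expensive work running off the calling thread.
        let result: u64 = (1..=10u64).product();
        let _ = sender.send(result);
    });
    receiver
}

fn main() {
    // A synchronous caller blocks on the result without any executor.
    let value = spawn_computation().recv().expect("sender dropped");
    assert_eq!(value, 3_628_800);

    // An async caller could instead `.await` the receiver inside an async fn,
    // since `oneshot::Receiver` also implements `Future`; no `block_on` needed.
}
```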
PSeitz
d2a7bcf217 fix fmt (#1317) 2022-03-18 15:53:27 +09:00
PSeitz
141b9aa245 Merge pull request #1306 from PSeitz/histogram
add Histogram aggregation
2022-03-18 05:03:46 +01:00
PSeitz
c5a6282fa8 Update src/aggregation/bucket/histogram/histogram.rs
Co-authored-by: Paul Masurel <paul@quickwit.io>
2022-03-18 04:55:31 +01:00
PSeitz
c0f524e1a3 Update src/aggregation/bucket/histogram/histogram.rs
Co-authored-by: Paul Masurel <paul@quickwit.io>
2022-03-18 04:55:25 +01:00
Paul Masurel
958b2bee08 Clippy comments (#1316) 2022-03-17 18:57:55 +09:00
Pascal Seitz
f619658e2c rename 2022-03-17 16:37:57 +08:00
Pascal Seitz
aa391bf843 refactor parameters 2022-03-17 16:28:37 +08:00
Pascal Seitz
47dcbdbeae handle empty results, empty indices, add tests 2022-03-17 10:24:34 +08:00
Pascal Seitz
691245bf20 make code more concise 2022-03-16 14:21:58 +08:00
Pascal Seitz
90798d4b39 address comments, add single bucket test 2022-03-16 13:58:13 +08:00
Pascal Seitz
0b6d9f90cf improve docs 2022-03-16 12:39:26 +08:00
PSeitz
8a5a12d961 add setter to json object options (#1311) 2022-03-16 10:36:30 +09:00
Pascal Seitz
e73542e2e8 Elasticsearch behaviour on hard/extended_bounds 2022-03-15 16:46:45 +08:00
Pascal Seitz
0262e44bbd merge_fruits pass by value 2022-03-15 12:59:22 +08:00
Pascal Seitz
613aad7a8a vec optional, improve performance 2022-03-14 21:29:07 +08:00
Pascal Seitz
1aa88b0c51 improve performance 2022-03-14 20:28:08 +08:00
Pascal Seitz
564fa38085 move sub_aggregations to own vec, use itertools minmax 2022-03-14 16:20:26 +08:00
dependabot[bot]
59ec21479f Update pprof requirement from 0.6 to 0.7 (#1305)
Updates the requirements on [pprof](https://github.com/tikv/pprof-rs) to permit the latest version.
- [Release notes](https://github.com/tikv/pprof-rs/releases)
- [Changelog](https://github.com/tikv/pprof-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/tikv/pprof-rs/commits)

---
updated-dependencies:
- dependency-name: pprof
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2022-03-14 13:57:22 +09:00
PSeitz
42283f9e91 fix error message UnknownTokenizer (#1308)
closes #1303
2022-03-14 13:54:47 +09:00
PSeitz
b105bf72e1 use defaults in meta.json (#1310)
This change allows fields to be unset in meta.json and fall back to their defaults.
Currently it is required to explicitly set e.g. fieldnorms: false
2022-03-14 13:54:06 +09:00
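The serde pattern behind this change can be illustrated with a hypothetical struct (not tantivy's actual meta.json types): fields carrying a `#[serde(default = ...)]` attribute may be omitted from the JSON and fall back to a default instead of failing deserialization.

```rust
// Generic illustration of the "fall back to defaults" serde pattern.
// `TextOptionsExample` and `default_fieldnorms` are made up for this sketch.
use serde::Deserialize;

fn default_fieldnorms() -> bool {
    true
}

#[derive(Debug, Deserialize)]
struct TextOptionsExample {
    tokenizer: String,
    #[serde(default = "default_fieldnorms")]
    fieldnorms: bool,
}

fn main() -> serde_json::Result<()> {
    // `fieldnorms` is missing from the JSON, so the default (true) is used.
    let opts: TextOptionsExample = serde_json::from_str(r#"{ "tokenizer": "default" }"#)?;
    assert!(opts.fieldnorms);
    println!("{:?}", opts);
    Ok(())
}
```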
Pascal Seitz
226f577803 Add Histogram aggregation 2022-03-11 21:52:07 +08:00
98 changed files with 6336 additions and 1164 deletions

View File

@@ -13,12 +13,11 @@ jobs:
- uses: actions/checkout@v3
- name: Install Rust
run: rustup toolchain install nightly --component llvm-tools-preview
- name: Install cargo-llvm-cov
run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
run: cargo +nightly llvm-cov --all-features --workspace --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v2
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
files: lcov.info

View File

@@ -33,7 +33,7 @@ jobs:
components: rustfmt, clippy
- name: Run tests
run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace
run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints --verbose --workspace
- name: Run tests quickwit feature
run: cargo +stable test --features mmap,quickwit,failpoints --verbose --workspace

View File

@@ -1,3 +1,18 @@
Tantivy 0.18
================================
- For date values `chrono` has been replaced with `time` (@uklotzde) #1304 :
- The `time` crate is re-exported as `tantivy::time` instead of `tantivy::chrono`.
- The type alias `tantivy::DateTime` has been removed.
- `Value::Date` wraps `time::PrimitiveDateTime` without time zone information.
- Internally date/time values are stored as seconds since UNIX epoch in UTC.
- Converting a `time::OffsetDateTime` to `Value::Date` implicitly converts the value into UTC.
If this is not desired, do the time zone conversion yourself and use `time::PrimitiveDateTime`
directly instead.
- Add [histogram](https://github.com/quickwit-oss/tantivy/pull/1306) aggregation (@PSeitz)
- Add support for fastfield on text fields (@PSeitz)
- Add terms aggregation (@PSeitz)
- Add support for zstd compression (@kryesh)
Tantivy 0.17
================================
- LogMergePolicy now triggers merges if the ratio of deleted documents reaches a threshold (@shikhar @fulmicoton) [#115](https://github.com/quickwit-oss/tantivy/issues/115)
@@ -8,7 +23,7 @@ Tantivy 0.17
- Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-oss/tantivy/issues/922)
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-oss/tantivy/issues/1225)
- Fix opening bytes index with dynamic codec (@PSeitz) [#1278](https://github.com/quickwit-oss/tantivy/issues/1278)
- Added an aggregation collector compatible with Elasticsearch (@PSeitz)
- Added an aggregation collector for range, average and stats compatible with Elasticsearch. (@PSeitz)
- Added a JSON schema type @fulmicoton [#1251](https://github.com/quickwit-oss/tantivy/issues/1251)
- Added support for slop in phrase queries @halvorboe [#1068](https://github.com/quickwit-oss/tantivy/issues/1068)

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.17.0"
version = "0.18.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -10,70 +10,72 @@ homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2018"
edition = "2021"
[dependencies]
base64 = "0.13"
oneshot = "0.1.3"
base64 = "0.13.0"
byteorder = "1.4.3"
crc32fast = "1.2.1"
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
tantivy-fst = "0.3"
memmap2 = {version = "0.5", optional=true}
lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3", optional = true }
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
tantivy-fst = "0.3.0"
memmap2 = { version = "0.5.3", optional = true }
lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.11", optional = true }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.2", optional = true }
log = "0.4.14"
serde = { version = "1.0.126", features = ["derive"] }
serde_json = "1.0.64"
num_cpus = "1.13"
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"
num_cpus = "1.13.1"
fs2={ version = "0.4.3", optional = true }
levenshtein_automata = "0.2"
uuid = { version = "0.8.2", features = ["v4", "serde"] }
crossbeam = "0.8.1"
futures = { version = "0.3.15", features = ["thread-pool"] }
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
common = { version = "0.2", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.2", path="./ownedbytes" }
stable_deref_trait = "1.2"
rust-stemmers = "1.2"
downcast-rs = "1.2"
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
tantivy-query-grammar = { version="0.18.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.2", path="./bitpacker" }
common = { version = "0.3", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.3", path="./ownedbytes" }
stable_deref_trait = "1.2.0"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
census = "0.4"
census = "0.4.0"
fnv = "1.0.7"
thiserror = "1.0.24"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = "0.5"
murmurhash32 = "0.2"
chrono = "0.4.19"
smallvec = "1.6.1"
rayon = "1.5"
lru = "0.7.0"
fastdivide = "0.4"
itertools = "0.10.0"
measure_time = "0.8.0"
pretty_assertions = "1.1.0"
serde_cbor = {version="0.11", optional=true}
async-trait = "0.1"
fail = "0.5.0"
murmurhash32 = "0.2.0"
time = { version = "0.3.9", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.7.5"
fastdivide = "0.4.0"
itertools = "0.10.3"
measure_time = "0.8.2"
pretty_assertions = "1.2.1"
serde_cbor = { version = "0.11.2", optional = true }
async-trait = "0.1.53"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
[dev-dependencies]
rand = "0.8.3"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.8"
proptest = "1.0"
matches = "0.1.9"
proptest = "1.0.0"
criterion = "0.3.5"
test-log = "0.2.8"
test-log = "0.2.10"
env_logger = "0.9.0"
pprof = {version= "0.6", features=["flamegraph", "criterion"]}
pprof = { version = "0.9.0", features = ["flamegraph", "criterion"] }
futures = "0.3.21"
[dev-dependencies.fail]
version = "0.5"
version = "0.5.0"
features = ["failpoints"]
[profile.release]
@@ -92,6 +94,7 @@ mmap = ["fs2", "tempfile", "memmap2"]
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.

View File

@@ -5,9 +5,10 @@
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)
**Tantivy** is a **full text search engine library** written in Rust.
**Tantivy** is a **full-text search engine library** written in Rust.
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
@@ -15,19 +16,23 @@ to build such a search engine.
Tantivy is, in fact, strongly inspired by Lucene's design.
If you are looking for an alternative to Elasticsearch or Apache Solr, check out [Quickwit](https://github.com/quickwit-oss/quickwit), our search engine built on top of Tantivy.
# Benchmark
The following [benchmark](https://tantivy-search.github.io/bench/) break downs
performance for different type of queries / collection.
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
performance for different types of queries/collections.
Your mileage WILL vary depending on the nature of queries and their load.
<img src="doc/assets/images/searchbenchmark.png">
# Features
- Full-text search
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command line tools
- Tiny startup time (<10ms), perfect for command-line tools
- BM25 scoring (the same as Lucene)
- Natural query language (e.g. `(michael AND jackson) OR "king of pop"`)
- Phrase queries search (e.g. `"michael jackson"`)
@@ -42,23 +47,25 @@ Your mileage WILL vary depending on the nature of queries and their load.
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
- JSON Field
- Aggregation Collector: range buckets, average, and stats metrics
- LogMergePolicy with deletes
- Searcher Warmer API
- Cheesy logo with a horse
## Non-features
- Distributed search is out of the scope of Tantivy. That being said, Tantivy is a
library upon which one could build a distributed search. Serializable/mergeable collector state for instance,
are within the scope of Tantivy.
Distributed search is out of the scope of Tantivy, but if you are looking for this feature, check out [Quickwit](https://github.com/quickwit-oss/quickwit/).
# Getting started
Tantivy works on stable Rust (>= 1.27) and supports Linux, MacOS, and Windows.
Tantivy works on stable Rust (>= 1.27) and supports Linux, macOS, and Windows.
- [Tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
- [tantivy-cli and its tutorial](https://github.com/quickwit-oss/tantivy-cli) - `tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
- [tantivy-cli and its tutorial](https://github.com/quickwit-oss/tantivy-cli) - `tantivy-cli` is an actual command-line interface that makes it easy for you to create a search engine,
index documents, and search via the CLI or a small server with a REST API.
It walks you through getting a wikipedia search engine up and running in a few minutes.
It walks you through getting a Wikipedia search engine up and running in a few minutes.
- [Reference doc for the last released version](https://docs.rs/tantivy/)
# How can I support this project?
@@ -118,3 +125,31 @@ By default, `rustc` compiles everything in the `examples/` directory in debug mo
rust-gdb target/debug/examples/$EXAMPLE_NAME
$ gdb run
```
# Companies Using Tantivy
<p align="left">
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />
<img align="center" src="doc/assets/images/nuclia-dark-theme.png#gh-dark-mode-only" alt="Nuclia" height="35" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/humanfirst.ai-dark-theme.png#gh-dark-mode-only" alt="Humanfirst.ai" height="25" width="auto" />&nbsp; &nbsp;
<img align="center" src="doc/assets/images/element-dark-theme.png#gh-dark-mode-only" alt="Element.io" height="25" width="auto" />
</p>
# FAQ
### Can I use Tantivy in other languages?
- Python → [tantivy-py](https://github.com/quickwit-oss/tantivy-py)
- Ruby → [tantiny](https://github.com/baygeldin/tantiny)
You can also find other bindings on [GitHub](https://github.com/search?q=tantivy) but they may be less maintained.
### What are some examples of Tantivy use?
- [seshat](https://github.com/matrix-org/seshat/): A matrix message database/indexer
- [tantiny](https://github.com/baygeldin/tantiny): Tiny full-text search for Ruby
- [lnx](https://github.com/lnx-search/lnx): adaptable, typo tolerant search engine with a REST API
- and [more](https://github.com/search?q=tantivy)!
### On average, how much faster is Tantivy compared to Lucene?
- According to our [search latency benchmark](https://tantivy-search.github.io/bench/), Tantivy is approximately 2x faster than Lucene.

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.1.1"
version = "0.2.0"
edition = "2018"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-common"
version = "0.2.0"
version = "0.3.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
@@ -10,7 +10,7 @@ description = "common traits and utility functions used by multiple tantivy subc
[dependencies]
byteorder = "1.4.3"
ownedbytes = { version="0.2", path="../ownedbytes" }
ownedbytes = { version="0.3", path="../ownedbytes" }
[dev-dependencies]
proptest = "1.0.0"

Binary image file added (3.1 KiB).

Binary image file added (56 KiB).

View File

@@ -0,0 +1,8 @@
<svg width="518" height="112" viewBox="0 0 518 112" fill="none" xmlns="http://www.w3.org/2000/svg">
<path fill-rule="evenodd" clip-rule="evenodd" d="M56 112C86.9279 112 112 86.9279 112 56C112 25.0721 86.9279 0 56 0C25.0721 0 0 25.0721 0 56C0 86.9279 25.0721 112 56 112Z" fill="#0DBD8B"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M45.7615 26.093C45.7615 23.8325 47.5977 22.0001 49.8629 22.0001C65.2154 22.0001 77.6611 34.4199 77.6611 49.7406C77.6611 52.001 75.8248 53.8335 73.5597 53.8335C71.2945 53.8335 69.4583 52.001 69.4583 49.7406C69.4583 38.9408 60.6851 30.1859 49.8629 30.1859C47.5977 30.1859 45.7615 28.3534 45.7615 26.093Z" fill="white"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M85.8986 45.6477C88.1637 45.6477 89.9999 47.4801 89.9999 49.7406C89.9999 65.0612 77.5543 77.4811 62.2017 77.4811C59.9366 77.4811 58.1003 75.6486 58.1003 73.3882C58.1003 71.1277 59.9366 69.2953 62.2017 69.2953C73.024 69.2953 81.7972 60.5403 81.7972 49.7406C81.7972 47.4801 83.6334 45.6477 85.8986 45.6477Z" fill="white"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M66.3031 85.907C66.3031 88.1675 64.4668 89.9999 62.2017 89.9999C46.8492 89.9999 34.4035 77.58 34.4035 62.2594C34.4035 59.9989 36.2398 58.1665 38.5049 58.1665C40.77 58.1665 42.6063 59.9989 42.6063 62.2594C42.6063 73.0592 51.3795 81.8141 62.2017 81.8141C64.4668 81.8141 66.3031 83.6466 66.3031 85.907Z" fill="white"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M26.1014 66.3523C23.8363 66.3523 22.0001 64.5199 22.0001 62.2594C22 46.9388 34.4457 34.5189 49.7983 34.5189C52.0634 34.5189 53.8997 36.3514 53.8997 38.6118C53.8997 40.8723 52.0634 42.7047 49.7983 42.7047C38.976 42.7047 30.2028 51.4597 30.2028 62.2594C30.2028 64.5199 28.3666 66.3523 26.1014 66.3523Z" fill="white"/>
<path d="M197 63.5H157.5C157.967 67.6333 159.467 70.9333 162 73.4C164.533 75.8 167.867 77 172 77C174.733 77 177.2 76.3333 179.4 75C181.6 73.6667 183.167 71.8667 184.1 69.6H196.1C194.5 74.8667 191.5 79.1333 187.1 82.4C182.767 85.6 177.633 87.2 171.7 87.2C163.967 87.2 157.7 84.6333 152.9 79.5C148.167 74.3667 145.8 67.8667 145.8 60C145.8 52.3333 148.2 45.9 153 40.7C157.8 35.5 164 32.9 171.6 32.9C179.2 32.9 185.333 35.4667 190 40.6C194.733 45.6667 197.1 52.0667 197.1 59.8L197 63.5ZM171.6 42.6C167.867 42.6 164.767 43.7 162.3 45.9C159.833 48.1 158.3 51.0333 157.7 54.7H185.3C184.767 51.0333 183.3 48.1 180.9 45.9C178.5 43.7 175.4 42.6 171.6 42.6ZM205.289 70.5V11H217.189V70.7C217.189 73.3667 218.656 74.7 221.589 74.7L223.689 74.6V85.9C222.556 86.1 221.356 86.2 220.089 86.2C214.956 86.2 211.189 84.9 208.789 82.3C206.456 79.7 205.289 75.7667 205.289 70.5ZM279.109 63.5H239.609C240.076 67.6333 241.576 70.9333 244.109 73.4C246.643 75.8 249.976 77 254.109 77C256.843 77 259.309 76.3333 261.509 75C263.709 73.6667 265.276 71.8667 266.209 69.6H278.209C276.609 74.8667 273.609 79.1333 269.209 82.4C264.876 85.6 259.743 87.2 253.809 87.2C246.076 87.2 239.809 84.6333 235.009 79.5C230.276 74.3667 227.909 67.8667 227.909 60C227.909 52.3333 230.309 45.9 235.109 40.7C239.909 35.5 246.109 32.9 253.709 32.9C261.309 32.9 267.443 35.4667 272.109 40.6C276.843 45.6667 279.209 52.0667 279.209 59.8L279.109 63.5ZM253.709 42.6C249.976 42.6 246.876 43.7 244.409 45.9C241.943 48.1 240.409 51.0333 239.809 54.7H267.409C266.876 51.0333 265.409 48.1 263.009 45.9C260.609 43.7 257.509 42.6 253.709 42.6ZM332.798 56.2V86H320.898V54.9C320.898 47.0333 317.632 43.1 311.098 43.1C307.565 43.1 304.732 44.2333 302.598 46.5C300.532 48.7667 299.498 51.8667 299.498 55.8V86H287.598V34.1H298.598V41C299.865 38.6667 301.798 36.7333 304.398 35.2C306.998 33.6667 310.232 32.9 314.098 32.9C321.298 32.9 326.498 35.6333 329.698 41.1C334.098 35.6333 339.965 32.9 347.298 32.9C353.365 32.9 358.032 34.8 361.298 38.6C364.565 42.3333 366.198 47.2667 366.198 53.4V86H354.298V54.9C354.298 47.0333 351.032 43.1 344.498 43.1C340.898 43.1 338.032 44.2667 335.898 46.6C333.832 48.8667 332.798 52.0667 332.798 56.2ZM425.379 63.5H385.879C386.346 67.6333 387.846 70.9333 390.379 73.4C392.912 75.8 396.246 77 400.379 77C403.112 77 405.579 76.3333 407.779 75C409.979 73.6667 411.546 71.8667 412.479 69.6H424.479C422.879 74.8667 419.879 79.1333 415.479 82.4C411.146 85.6 406.012 87.2 400.079 87.2C392.346 87.2 386.079 84.6333 381.279 79.5C376.546 74.3667 374.179 67.8667 374.179 60C374.179 52.3333 376.579 45.9 381.379 40.7C386.179 35.5 392.379 32.9 399.979 32.9C407.579 32.9 413.712 35.4667 418.379 40.6C423.112 45.6667 425.479 52.0667 425.479 59.8L425.379 63.5ZM399.979 42.6C396.246 42.6 393.146 43.7 390.679 45.9C388.212 48.1 386.679 51.0333 386.079 54.7H413.679C413.146 51.0333 411.679 48.1 409.279 45.9C406.879 43.7 403.779 42.6 399.979 42.6ZM444.868 34.1V41C446.068 38.7333 448.035 36.8333 450.768 35.3C453.568 33.7 456.935 32.9 460.868 32.9C467.001 32.9 471.735 34.7667 475.068 38.5C478.468 42.2333 480.168 47.2 480.168 53.4V86H468.268V54.9C468.268 51.2333 467.401 48.3667 465.668 46.3C464.001 44.1667 461.435 43.1 457.968 43.1C454.168 43.1 451.168 44.2333 448.968 46.5C446.835 48.7667 445.768 51.9 445.768 55.9V86H433.868V34.1H444.868ZM514.922 75.4V85.7C513.455 86.1 511.389 86.3 508.722 86.3C498.589 86.3 493.522 81.2 493.522 71V43.6H485.622V34.1H493.522V20.6H505.422V34.1H515.122V43.6H505.422V69.8C505.422 73.8667 507.355 75.9 511.222 75.9L514.922 75.4Z" fill="black"/>
</svg>

Rendered SVG image (5.2 KiB).

Binary image file added (23 KiB).

Binary image file added (102 KiB).

Binary image file added (7.8 KiB).

Binary image file added (653 KiB).

View File

@@ -20,9 +20,7 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
@@ -124,7 +122,7 @@ fn main() -> tantivy::Result<()> {
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
let res: Value = serde_json::to_value(&agg_res)?;
println!("{}", serde_json::to_string_pretty(&res)?);
Ok(())

View File

@@ -1,7 +1,8 @@
// # Json field example
//
// This example shows how the json field can be used
// to make tantivy partially schemaless.
// to make tantivy partially schemaless by setting it as a
// default query parser field.
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
@@ -10,10 +11,6 @@ use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
//
// We need two fields:
// - a timestamp
// - a json object field
let mut schema_builder = Schema::builder();
schema_builder.add_date_field("timestamp", FAST | STORED);
let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
@@ -43,7 +40,8 @@ fn main() -> tantivy::Result<()> {
"attributes": {
"target": "submit-button",
"cart": {"product_id": 133},
"description": "das keyboard"
"description": "das keyboard",
"event_type": "holiday-sale"
}
}"#,
)?;
@@ -53,6 +51,9 @@ fn main() -> tantivy::Result<()> {
let reader = index.reader()?;
let searcher = reader.searcher();
// # Default fields: event_type and attributes
// Setting attributes as a default field allows omitting the attributes prefix, e.g. "target"
// instead of "attributes.target"
let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
{
let query = query_parser.parse_query("target:submit-button")?;
@@ -70,10 +71,34 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count_docs, 1);
}
{
let query = query_parser
.parse_query("event_type:click AND cart.product_id:133")
.unwrap();
let hits = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
let query = query_parser.parse_query("click AND cart.product_id:133")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 1);
}
{
// The sub-fields in the json field marked as default field still need to be explicitly
// addressed
let query = query_parser.parse_query("click AND 133")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 0);
}
{
// Default json fields are ignored if they collide with the schema
let query = query_parser.parse_query("event_type:holiday-sale")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 0);
}
// # Query via full attribute path
{
// This only searches in our schema's `event_type` field
let query = query_parser.parse_query("event_type:click")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 2);
}
{
// Default json fields can still be accessed by full path
let query = query_parser.parse_query("attributes.event_type:holiday-sale")?;
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
assert_eq!(hits.len(), 1);
}
Ok(())

View File

@@ -1,6 +1,6 @@
[package]
name = "fastfield_codecs"
version = "0.1.0"
version = "0.2.0"
authors = ["Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
@@ -9,8 +9,8 @@ description = "Fast field codecs used by tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
common = { version = "0.2", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
prettytable-rs = {version="0.8.0", optional= true}
rand = {version="0.8.3", optional= true}

View File

@@ -1,7 +1,7 @@
[package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.2.0"
version = "0.3.0"
edition = "2018"
description = "Expose data as static slice"
license = "MIT"

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.15.0"
version = "0.18.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

View File

@@ -18,7 +18,7 @@ use crate::Occur;
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
];
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*| )"#;
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*|\s)"#;
/// Parses a field_name
/// A field name must have at least one character and be followed by a colon.
@@ -34,7 +34,8 @@ fn field_name<'a>() -> impl Parser<&'a str, Output = String> {
take_while(|c| !SPECIAL_CHARS.contains(&c)),
),
'\\',
satisfy(|c| SPECIAL_CHARS.contains(&c)),
satisfy(|_| true), /* if the next character is not a special char, the \ will be treated
* as the \ character. */
))
.skip(char(':'))
.map(|s| ESCAPED_SPECIAL_CHARS_RE.replace_all(&s, "$1").to_string())
@@ -67,7 +68,7 @@ fn word<'a>() -> impl Parser<&'a str, Output = String> {
///
/// NOTE: also accepts 999999-99-99T99:99:99.266051969+99:99
/// We delegate rejecting such invalid dates to the logical AST computation code
/// which invokes chrono::DateTime::parse_from_rfc3339 on the value to actually parse
/// which invokes time::OffsetDateTime::parse(..., &Rfc3339) on the value to actually parse
/// it (instead of merely extracting the datetime value as string as done here).
fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
let two_digits = || recognize::<String, _, _>((digit(), digit()));
@@ -516,15 +517,27 @@ mod test {
}
#[test]
fn test_field_name() -> TestParseResult {
fn test_field_name() {
assert_eq!(
super::field_name().parse(".my.field.name:a"),
Ok((".my.field.name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"my\ field:a"#),
Ok(("my field".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"にんじん:a"#),
Ok(("にんじん".to_string(), "a"))
);
assert_eq!(
super::field_name().parse("my\\ field\\ name:a"),
Ok(("my field name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse(r#"my\field:a"#),
Ok((r#"my\field"#.to_string(), "a"))
);
assert!(super::field_name().parse("my field:a").is_err());
assert_eq!(
super::field_name().parse("\\(1\\+1\\):2"),
@@ -534,14 +547,21 @@ mod test {
super::field_name().parse("my_field_name:a"),
Ok(("my_field_name".to_string(), "a"))
);
assert_eq!(
super::field_name().parse("myfield.b:hello").unwrap(),
("myfield.b".to_string(), "hello")
);
assert_eq!(
super::field_name().parse(r#"myfield\.b:hello"#).unwrap(),
(r#"myfield\.b"#.to_string(), "hello")
);
assert!(super::field_name().parse("my_field_name").is_err());
assert!(super::field_name().parse(":a").is_err());
assert!(super::field_name().parse("-my_field:a").is_err());
assert_eq!(
super::field_name().parse("_my_field:a")?,
("_my_field".to_string(), "a")
super::field_name().parse("_my_field:a"),
Ok(("_my_field".to_string(), "a"))
);
Ok(())
}
#[test]

View File

@@ -49,7 +49,9 @@ use std::collections::{HashMap, HashSet};
use serde::{Deserialize, Serialize};
pub use super::bucket::RangeAggregation;
use super::bucket::{HistogramAggregation, TermsAggregation};
use super::metric::{AverageAggregation, StatsAggregation};
use super::VecWithNames;
/// The top-level aggregation request structure, which contains [Aggregation] and their user defined
/// names. It is also used in [buckets](BucketAggregation) to define sub-aggregations.
@@ -57,6 +59,70 @@ use super::metric::{AverageAggregation, StatsAggregation};
/// The key is the user defined name of the aggregation.
pub type Aggregations = HashMap<String, Aggregation>;
/// Like Aggregations, but optimized to work with the aggregation result
#[derive(Clone, Debug)]
pub(crate) struct AggregationsInternal {
pub(crate) metrics: VecWithNames<MetricAggregation>,
pub(crate) buckets: VecWithNames<BucketAggregationInternal>,
}
impl From<Aggregations> for AggregationsInternal {
fn from(aggs: Aggregations) -> Self {
let mut metrics = vec![];
let mut buckets = vec![];
for (key, agg) in aggs {
match agg {
Aggregation::Bucket(bucket) => buckets.push((
key,
BucketAggregationInternal {
bucket_agg: bucket.bucket_agg,
sub_aggregation: bucket.sub_aggregation.into(),
},
)),
Aggregation::Metric(metric) => metrics.push((key, metric)),
}
}
Self {
metrics: VecWithNames::from_entries(metrics),
buckets: VecWithNames::from_entries(buckets),
}
}
}
#[derive(Clone, Debug)]
// Like BucketAggregation, but optimized to work with the result
pub(crate) struct BucketAggregationInternal {
/// Bucket aggregation strategy to group documents.
pub bucket_agg: BucketAggregationType,
/// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
/// bucket.
pub sub_aggregation: AggregationsInternal,
}
impl BucketAggregationInternal {
pub(crate) fn as_histogram(&self) -> Option<&HistogramAggregation> {
match &self.bucket_agg {
BucketAggregationType::Histogram(histogram) => Some(histogram),
_ => None,
}
}
pub(crate) fn as_term(&self) -> Option<&TermsAggregation> {
match &self.bucket_agg {
BucketAggregationType::Terms(terms) => Some(terms),
_ => None,
}
}
}
/// Extract all fields where the term dictionary is used in the tree.
pub fn get_term_dict_field_names(aggs: &Aggregations) -> HashSet<String> {
let mut term_dict_field_names = Default::default();
for el in aggs.values() {
el.get_term_dict_field_names(&mut term_dict_field_names)
}
term_dict_field_names
}
/// Extract all fast field names used in the tree.
pub fn get_fast_field_names(aggs: &Aggregations) -> HashSet<String> {
let mut fast_field_names = Default::default();
@@ -79,6 +145,12 @@ pub enum Aggregation {
}
impl Aggregation {
fn get_term_dict_field_names(&self, term_field_names: &mut HashSet<String>) {
if let Aggregation::Bucket(bucket) = self {
bucket.get_term_dict_field_names(term_field_names)
}
}
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
match self {
Aggregation::Bucket(bucket) => bucket.get_fast_field_names(fast_field_names),
@@ -111,6 +183,12 @@ pub struct BucketAggregation {
}
impl BucketAggregation {
fn get_term_dict_field_names(&self, term_dict_field_names: &mut HashSet<String>) {
if let BucketAggregationType::Terms(terms) = &self.bucket_agg {
term_dict_field_names.insert(terms.field.to_string());
}
term_dict_field_names.extend(get_term_dict_field_names(&self.sub_aggregation));
}
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
self.bucket_agg.get_fast_field_names(fast_field_names);
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
@@ -123,12 +201,22 @@ pub enum BucketAggregationType {
/// Put data into buckets of user-defined ranges.
#[serde(rename = "range")]
Range(RangeAggregation),
/// Put data into buckets of fixed-size intervals (histogram).
#[serde(rename = "histogram")]
Histogram(HistogramAggregation),
/// Put data into buckets of terms.
#[serde(rename = "terms")]
Terms(TermsAggregation),
}
impl BucketAggregationType {
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
match self {
BucketAggregationType::Terms(terms) => fast_field_names.insert(terms.field.to_string()),
BucketAggregationType::Range(range) => fast_field_names.insert(range.field.to_string()),
BucketAggregationType::Histogram(histogram) => {
fast_field_names.insert(histogram.field.to_string())
}
};
}
}

View File

@@ -1,12 +1,16 @@
//! This will enhance the request tree with access to the fastfield and metadata.
use std::sync::Arc;
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
use super::bucket::RangeAggregation;
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
use super::metric::{AverageAggregation, StatsAggregation};
use super::VecWithNames;
use crate::fastfield::{type_and_cardinality, DynamicFastFieldReader, FastType};
use crate::fastfield::{
type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
};
use crate::schema::{Cardinality, Type};
use crate::{SegmentReader, TantivyError};
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
#[derive(Clone, Default)]
pub(crate) struct AggregationsWithAccessor {
@@ -27,11 +31,32 @@ impl AggregationsWithAccessor {
}
}
#[derive(Clone)]
pub(crate) enum FastFieldAccessor {
Multi(MultiValuedFastFieldReader<u64>),
Single(DynamicFastFieldReader<u64>),
}
impl FastFieldAccessor {
pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> {
match self {
FastFieldAccessor::Multi(_) => None,
FastFieldAccessor::Single(reader) => Some(reader),
}
}
pub fn as_multi(&self) -> Option<&MultiValuedFastFieldReader<u64>> {
match self {
FastFieldAccessor::Multi(reader) => Some(reader),
FastFieldAccessor::Single(_) => None,
}
}
}
#[derive(Clone)]
pub struct BucketAggregationWithAccessor {
/// In general there can be buckets without fast field access, e.g. buckets that are created
/// based on search terms. So eventually this needs to be Option or moved.
pub(crate) accessor: DynamicFastFieldReader<u64>,
pub(crate) accessor: FastFieldAccessor,
pub(crate) inverted_index: Option<Arc<InvertedIndexReader>>,
pub(crate) field_type: Type,
pub(crate) bucket_agg: BucketAggregationType,
pub(crate) sub_aggregation: AggregationsWithAccessor,
@@ -43,11 +68,25 @@ impl BucketAggregationWithAccessor {
sub_aggregation: &Aggregations,
reader: &SegmentReader,
) -> crate::Result<BucketAggregationWithAccessor> {
let mut inverted_index = None;
let (accessor, field_type) = match &bucket {
BucketAggregationType::Range(RangeAggregation {
field: field_name,
ranges: _,
}) => get_ff_reader_and_validate(reader, field_name)?,
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
BucketAggregationType::Histogram(HistogramAggregation {
field: field_name, ..
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
BucketAggregationType::Terms(TermsAggregation {
field: field_name, ..
}) => {
let field = reader
.schema()
.get_field(field_name)
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
inverted_index = Some(reader.inverted_index(field)?);
get_ff_reader_and_validate(reader, field_name, Cardinality::MultiValues)?
}
};
let sub_aggregation = sub_aggregation.clone();
Ok(BucketAggregationWithAccessor {
@@ -55,6 +94,7 @@ impl BucketAggregationWithAccessor {
field_type,
sub_aggregation: get_aggs_with_accessor_and_validate(&sub_aggregation, reader)?,
bucket_agg: bucket.clone(),
inverted_index,
})
}
}
@@ -75,10 +115,14 @@ impl MetricAggregationWithAccessor {
match &metric {
MetricAggregation::Average(AverageAggregation { field: field_name })
| MetricAggregation::Stats(StatsAggregation { field: field_name }) => {
let (accessor, field_type) = get_ff_reader_and_validate(reader, field_name)?;
let (accessor, field_type) =
get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?;
Ok(MetricAggregationWithAccessor {
accessor,
accessor: accessor
.as_single()
.expect("unexpected fast field cardinality")
.clone(),
field_type,
metric: metric.clone(),
})
@@ -115,32 +159,45 @@ pub(crate) fn get_aggs_with_accessor_and_validate(
))
}
/// Get the fast field reader with the given cardinality.
fn get_ff_reader_and_validate(
reader: &SegmentReader,
field_name: &str,
) -> crate::Result<(DynamicFastFieldReader<u64>, Type)> {
cardinality: Cardinality,
) -> crate::Result<(FastFieldAccessor, Type)> {
let field = reader
.schema()
.get_field(field_name)
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
let field_type = reader.schema().get_field_entry(field).field_type();
if let Some((ff_type, cardinality)) = type_and_cardinality(field_type) {
if cardinality == Cardinality::MultiValues || ff_type == FastType::Date {
if let Some((ff_type, field_cardinality)) = type_and_cardinality(field_type) {
if ff_type == FastType::Date {
return Err(TantivyError::InvalidArgument(
"Unsupported field type date in aggregation".to_string(),
));
}
if cardinality != field_cardinality {
return Err(TantivyError::InvalidArgument(format!(
"Invalid field type in aggregation {:?}, only Cardinality::SingleValue supported",
field_type.value_type()
"Invalid field cardinality on field {} expected {:?}, but got {:?}",
field_name, cardinality, field_cardinality
)));
}
} else {
return Err(TantivyError::InvalidArgument(format!(
"Only single value fast fields of type f64, u64, i64 are supported, but got {:?} ",
"Only fast fields of type f64, u64, i64 are supported, but got {:?} ",
field_type.value_type()
)));
};
let ff_fields = reader.fast_fields();
ff_fields
.u64_lenient(field)
.map(|field| (field, field_type.value_type()))
match cardinality {
Cardinality::SingleValue => ff_fields
.u64_lenient(field)
.map(|field| (FastFieldAccessor::Single(field), field_type.value_type())),
Cardinality::MultiValues => ff_fields
.u64s_lenient(field)
.map(|field| (FastFieldAccessor::Multi(field), field_type.value_type())),
}
}

View File

@@ -7,29 +7,132 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use super::agg_req::{
Aggregations, AggregationsInternal, BucketAggregationInternal, MetricAggregation,
};
use super::bucket::{intermediate_buckets_to_final_buckets, GetDocCount};
use super::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
IntermediateMetricResult, IntermediateRangeBucketEntry,
};
use super::metric::{SingleMetricResult, Stats};
use super::Key;
use super::{Key, VecWithNames};
use crate::TantivyError;
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
/// The final aggregation result.
pub struct AggregationResults(pub HashMap<String, AggregationResult>);
impl From<IntermediateAggregationResults> for AggregationResults {
fn from(tree: IntermediateAggregationResults) -> Self {
Self(
tree.0
.into_iter()
.map(|(key, agg)| (key, agg.into()))
.collect(),
)
impl AggregationResults {
pub(crate) fn get_value_from_aggregation(
&self,
name: &str,
agg_property: &str,
) -> crate::Result<Option<f64>> {
if let Some(agg) = self.0.get(name) {
agg.get_value_from_aggregation(name, agg_property)
} else {
// Validation is done during request parsing, so we can't reach this state.
Err(TantivyError::InternalError(format!(
"Can't find aggregation {:?} in sub_aggregations",
name
)))
}
}
/// Convert an intermediate result and its aggregation request to the final result
pub fn from_intermediate_and_req(
results: IntermediateAggregationResults,
agg: Aggregations,
) -> crate::Result<Self> {
AggregationResults::from_intermediate_and_req_internal(results, &(agg.into()))
}
/// Convert an intermediate result and its aggregation request to the final result
///
/// Internal function; AggregationsInternal is used instead of Aggregations, as it is optimized
/// for internal processing by splitting metrics and buckets into separate groups.
pub(crate) fn from_intermediate_and_req_internal(
intermediate_results: IntermediateAggregationResults,
req: &AggregationsInternal,
) -> crate::Result<Self> {
// Important assumption:
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
// request
let mut results: HashMap<String, AggregationResult> = HashMap::new();
if let Some(buckets) = intermediate_results.buckets {
add_coverted_final_buckets_to_result(&mut results, buckets, &req.buckets)?
} else {
// When there are no buckets, we create empty buckets, so that the serialized json
// format is constant
add_empty_final_buckets_to_result(&mut results, &req.buckets)?
};
if let Some(metrics) = intermediate_results.metrics {
add_converted_final_metrics_to_result(&mut results, metrics);
} else {
// When there are no metrics, we create empty metric results, so that the serialized
// json format is constant
add_empty_final_metrics_to_result(&mut results, &req.metrics)?;
}
Ok(Self(results))
}
}
fn add_converted_final_metrics_to_result(
results: &mut HashMap<String, AggregationResult>,
metrics: VecWithNames<IntermediateMetricResult>,
) {
results.extend(
metrics
.into_iter()
.map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))),
);
}
fn add_empty_final_metrics_to_result(
results: &mut HashMap<String, AggregationResult>,
req_metrics: &VecWithNames<MetricAggregation>,
) -> crate::Result<()> {
results.extend(req_metrics.iter().map(|(key, req)| {
let empty_bucket = IntermediateMetricResult::empty_from_req(req);
(
key.to_string(),
AggregationResult::MetricResult(empty_bucket.into()),
)
}));
Ok(())
}
fn add_empty_final_buckets_to_result(
results: &mut HashMap<String, AggregationResult>,
req_buckets: &VecWithNames<BucketAggregationInternal>,
) -> crate::Result<()> {
let requested_buckets = req_buckets.iter();
for (key, req) in requested_buckets {
let empty_bucket = AggregationResult::BucketResult(BucketResult::empty_from_req(req)?);
results.insert(key.to_string(), empty_bucket);
}
Ok(())
}
fn add_coverted_final_buckets_to_result(
results: &mut HashMap<String, AggregationResult>,
buckets: VecWithNames<IntermediateBucketResult>,
req_buckets: &VecWithNames<BucketAggregationInternal>,
) -> crate::Result<()> {
assert_eq!(buckets.len(), req_buckets.len());
let buckets_with_request = buckets.into_iter().zip(req_buckets.values());
for ((key, bucket), req) in buckets_with_request {
let result =
AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(bucket, req)?);
results.insert(key, result);
}
Ok(())
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -41,15 +144,20 @@ pub enum AggregationResult {
/// Metric result variant.
MetricResult(MetricResult),
}
impl From<IntermediateAggregationResult> for AggregationResult {
fn from(tree: IntermediateAggregationResult) -> Self {
match tree {
IntermediateAggregationResult::Bucket(bucket) => {
AggregationResult::BucketResult(bucket.into())
}
IntermediateAggregationResult::Metric(metric) => {
AggregationResult::MetricResult(metric.into())
}
impl AggregationResult {
pub(crate) fn get_value_from_aggregation(
&self,
_name: &str,
agg_property: &str,
) -> crate::Result<Option<f64>> {
match self {
AggregationResult::BucketResult(_bucket) => Err(TantivyError::InternalError(
"Tried to retrieve value from bucket aggregation. This is not supported and \
should not happen during collection, but should be caught during validation"
.to_string(),
)),
AggregationResult::MetricResult(metric) => metric.get_value(agg_property),
}
}
}
@@ -64,6 +172,14 @@ pub enum MetricResult {
Stats(Stats),
}
impl MetricResult {
fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
match self {
MetricResult::Average(avg) => Ok(avg.value),
MetricResult::Stats(stats) => stats.get_value(agg_property),
}
}
}
impl From<IntermediateMetricResult> for MetricResult {
fn from(metric: IntermediateMetricResult) -> Self {
match metric {
@@ -81,35 +197,147 @@ impl From<IntermediateMetricResult> for MetricResult {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum BucketResult {
/// This is the default entry for a bucket, which contains a key, count, and optionally
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
/// sub_aggregations.
Range {
/// The range buckets sorted by range.
buckets: Vec<RangeBucketEntry>,
},
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
Histogram {
/// The buckets.
///
/// Whether there are holes depends on the request; if min_doc_count is 0, there are no
/// holes between the first and last bucket.
/// See [HistogramAggregation](super::bucket::HistogramAggregation)
buckets: Vec<BucketEntry>,
},
/// This is the term result
Terms {
/// The buckets.
///
/// See [TermsAggregation](super::bucket::TermsAggregation)
buckets: Vec<BucketEntry>,
/// The number of documents that didn't make it into the top N due to shard_size or size
sum_other_doc_count: u64,
#[serde(skip_serializing_if = "Option::is_none")]
/// The upper bound error for the doc count of each term.
doc_count_error_upper_bound: Option<u64>,
},
}
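
Because the enum is untagged, consuming results from Rust means matching on the variant. A small sketch, assuming agg_res is an AggregationResults that contains a histogram aggregation named "my_histogram" (the name is only an example):

// Sketch only: pull the histogram buckets out of a final result.
if let Some(AggregationResult::BucketResult(BucketResult::Histogram { buckets })) =
    agg_res.0.get("my_histogram")
{
    for bucket in buckets {
        println!("key {:?} -> {} docs", bucket.key, bucket.doc_count);
    }
}
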
impl From<IntermediateBucketResult> for BucketResult {
fn from(result: IntermediateBucketResult) -> Self {
match result {
IntermediateBucketResult::Range(range_map) => {
let mut buckets: Vec<RangeBucketEntry> = range_map
.into_iter()
.map(|(_, bucket)| bucket.into())
.collect_vec();
impl BucketResult {
pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result<Self> {
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
BucketResult::from_intermediate_and_req(empty_bucket, req)
}
buckets.sort_by(|a, b| {
a.from
fn from_intermediate_and_req(
bucket_result: IntermediateBucketResult,
req: &BucketAggregationInternal,
) -> crate::Result<Self> {
match bucket_result {
IntermediateBucketResult::Range(range_res) => {
let mut buckets: Vec<RangeBucketEntry> = range_res
.buckets
.into_iter()
.map(|(_, bucket)| {
RangeBucketEntry::from_intermediate_and_req(bucket, &req.sub_aggregation)
})
.collect::<crate::Result<Vec<_>>>()?;
buckets.sort_by(|left, right| {
// TODO use total_cmp in the next stable Rust release
left.from
.unwrap_or(f64::MIN)
.partial_cmp(&b.from.unwrap_or(f64::MIN))
.partial_cmp(&right.from.unwrap_or(f64::MIN))
.unwrap_or(Ordering::Equal)
});
BucketResult::Range { buckets }
Ok(BucketResult::Range { buckets })
}
IntermediateBucketResult::Histogram { buckets } => {
let buckets = intermediate_buckets_to_final_buckets(
buckets,
req.as_histogram()
.expect("unexpected aggregation, expected histogram aggregation"),
&req.sub_aggregation,
)?;
Ok(BucketResult::Histogram { buckets })
}
IntermediateBucketResult::Terms(terms) => terms.into_final_result(
req.as_term()
.expect("unexpected aggregation, expected term aggregation"),
&req.sub_aggregation,
),
}
}
}
/// This is the default entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
///
/// # JSON Format
/// ```json
/// {
/// ...
/// "my_histogram": {
/// "buckets": [
/// {
/// "key": "2.0",
/// "doc_count": 5
/// },
/// {
/// "key": "4.0",
/// "doc_count": 2
/// },
/// {
/// "key": "6.0",
/// "doc_count": 3
/// }
/// ]
/// }
/// ...
/// }
/// ```
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct BucketEntry {
/// The identifier of the bucket.
pub key: Key,
/// Number of documents in the bucket.
pub doc_count: u64,
#[serde(flatten)]
/// Sub-aggregations in this bucket.
pub sub_aggregation: AggregationResults,
}
impl BucketEntry {
pub(crate) fn from_intermediate_and_req(
entry: IntermediateHistogramBucketEntry,
req: &AggregationsInternal,
) -> crate::Result<Self> {
Ok(BucketEntry {
key: Key::F64(entry.key),
doc_count: entry.doc_count,
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
entry.sub_aggregation,
req,
)?,
})
}
}
impl GetDocCount for &BucketEntry {
fn doc_count(&self) -> u64 {
self.doc_count
}
}
impl GetDocCount for BucketEntry {
fn doc_count(&self) -> u64 {
self.doc_count
}
}
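
To see how BucketEntry maps onto the JSON shape documented above, here is a hedged sketch (not part of the diff) that serializes a hand-built entry; it assumes the empty, flattened sub_aggregation contributes no extra fields and that Key::F64 serializes as a plain number:

// Sketch only: serialize one histogram bucket entry.
let entry = BucketEntry {
    key: Key::F64(2.0),
    doc_count: 5,
    sub_aggregation: Default::default(),
};
// Expected to print something close to {"key":2.0,"doc_count":5}.
println!("{}", serde_json::to_string(&entry).unwrap());
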
/// This is the range entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
///
@@ -139,7 +367,7 @@ impl From<IntermediateBucketResult> for BucketResult {
/// }
/// ...
/// }
/// ```
/// ```
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RangeBucketEntry {
/// The identifier of the bucket.
@@ -157,14 +385,20 @@ pub struct RangeBucketEntry {
pub to: Option<f64>,
}
impl From<IntermediateRangeBucketEntry> for RangeBucketEntry {
fn from(entry: IntermediateRangeBucketEntry) -> Self {
RangeBucketEntry {
impl RangeBucketEntry {
fn from_intermediate_and_req(
entry: IntermediateRangeBucketEntry,
req: &AggregationsInternal,
) -> crate::Result<Self> {
Ok(RangeBucketEntry {
key: entry.key,
doc_count: entry.doc_count,
sub_aggregation: entry.sub_aggregation.into(),
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
entry.sub_aggregation,
req,
)?,
to: entry.to,
from: entry.from,
}
})
}
}

File diff suppressed because it is too large.

@@ -0,0 +1,2 @@
mod histogram;
pub use histogram::*;


@@ -7,7 +7,134 @@
//! Results of intermediate buckets are
//! [IntermediateBucketResult](super::intermediate_agg_result::IntermediateBucketResult)
mod histogram;
mod range;
mod term_agg;
use std::collections::HashMap;
pub(crate) use histogram::SegmentHistogramCollector;
pub use histogram::*;
pub(crate) use range::SegmentRangeCollector;
pub use range::*;
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
pub use term_agg::*;
/// Order for buckets in a bucket aggregation.
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
pub enum Order {
/// Asc order
#[serde(rename = "asc")]
Asc,
/// Desc order
#[serde(rename = "desc")]
Desc,
}
impl Default for Order {
fn default() -> Self {
Order::Desc
}
}
#[derive(Clone, Debug, PartialEq)]
/// Order property by which to apply the order
pub enum OrderTarget {
/// The key of the bucket
Key,
/// The doc count of the bucket
Count,
/// Order by the value of the sub-aggregation metric identified by the given `String`.
///
/// Only single value metrics are supported currently
SubAggregation(String),
}
impl Default for OrderTarget {
fn default() -> Self {
OrderTarget::Count
}
}
impl From<&str> for OrderTarget {
fn from(val: &str) -> Self {
match val {
"_key" => OrderTarget::Key,
"_count" => OrderTarget::Count,
_ => OrderTarget::SubAggregation(val.to_string()),
}
}
}
impl ToString for OrderTarget {
fn to_string(&self) -> String {
match self {
OrderTarget::Key => "_key".to_string(),
OrderTarget::Count => "_count".to_string(),
OrderTarget::SubAggregation(agg) => agg.to_string(),
}
}
}
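
The two impls above define the string mapping used by the order parameter; a quick sketch of the expected behaviour, assuming the types are in scope:

// "_key" and "_count" are reserved names; anything else names a metric sub-aggregation.
assert_eq!(OrderTarget::from("_key").to_string(), "_key");
assert_eq!(OrderTarget::from("_count").to_string(), "_count");
assert!(matches!(
    OrderTarget::from("average_price"),
    OrderTarget::SubAggregation(_)
));
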
/// Set the order. target is either "_count", "_key", or the name of
/// a metric sub_aggregation.
///
/// De/Serializes to elasticsearch compatible JSON.
///
/// Examples in JSON format:
/// { "_count": "asc" }
/// { "_key": "asc" }
/// { "average_price": "asc" }
#[derive(Clone, Default, Debug, PartialEq)]
pub struct CustomOrder {
/// The target property to sort by
pub target: OrderTarget,
/// The order asc or desc
pub order: Order,
}
impl Serialize for CustomOrder {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer {
let map: HashMap<String, Order> =
std::iter::once((self.target.to_string(), self.order)).collect();
map.serialize(serializer)
}
}
impl<'de> Deserialize<'de> for CustomOrder {
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
where D: Deserializer<'de> {
HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
if let Some((key, value)) = map.into_iter().next() {
Ok(CustomOrder {
target: key.as_str().into(),
order: value,
})
} else {
Err(de::Error::custom(
"unexpected empty map in order".to_string(),
))
}
})
}
}
#[test]
fn custom_order_serde_test() {
let order = CustomOrder {
target: OrderTarget::Key,
order: Order::Desc,
};
let order_str = serde_json::to_string(&order).unwrap();
assert_eq!(order_str, "{\"_key\":\"desc\"}");
let order_deser = serde_json::from_str(&order_str).unwrap();
assert_eq!(order, order_deser);
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
assert!(order_deser.is_err());
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
assert!(order_deser.is_err());
}


@@ -1,3 +1,4 @@
use std::fmt::Debug;
use std::ops::Range;
use serde::{Deserialize, Serialize};
@@ -5,10 +6,10 @@ use serde::{Deserialize, Serialize};
use crate::aggregation::agg_req_with_accessor::{
AggregationsWithAccessor, BucketAggregationWithAccessor,
};
use crate::aggregation::intermediate_agg_result::IntermediateBucketResult;
use crate::aggregation::segment_agg_result::{
SegmentAggregationResultsCollector, SegmentRangeBucketEntry,
use crate::aggregation::intermediate_agg_result::{
IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key};
use crate::fastfield::FastFieldReader;
use crate::schema::Type;
@@ -30,20 +31,25 @@ use crate::{DocId, TantivyError};
/// [crate::aggregation::intermediate_agg_result::IntermediateRangeBucketEntry] on the
/// DistributedAggregationCollector.
///
/// # Limitations/Compatibility
/// Overlapping ranges are not yet supported.
///
/// The keyed parameter (elasticsearch) is not yet supported.
///
/// # Request JSON Format
/// ```json
/// {
/// "range": {
/// "my_ranges": {
/// "field": "score",
/// "ranges": [
/// { "to": 3.0 },
/// { "from": 3.0, "to": 7.0 },
/// { "from": 7.0, "to": 20.0 }
/// { "from": 7.0, "to": 20.0 },
/// { "from": 20.0 }
/// ]
/// }
/// }
/// ```
/// }
/// ```
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RangeAggregation {
/// The field to aggregate on.
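
The request JSON documented above deserializes into the usual Aggregations map. A hedged sketch, where the aggregation name "my_ranges" is only an example:

// Sketch only: parse an elasticsearch-compatible range request.
let req = r#"{
    "my_ranges": {
        "range": {
            "field": "score",
            "ranges": [
                { "to": 3.0 },
                { "from": 3.0, "to": 7.0 },
                { "from": 7.0, "to": 20.0 },
                { "from": 20.0 }
            ]
        }
    }
}"#;
let agg_req: Aggregations = serde_json::from_str(req).unwrap();
assert_eq!(agg_req.len(), 1);
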
@@ -97,22 +103,72 @@ pub struct SegmentRangeCollector {
field_type: Type,
}
#[derive(Clone, PartialEq)]
pub(crate) struct SegmentRangeBucketEntry {
pub key: Key,
pub doc_count: u64,
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
/// The from range of the bucket. Equals f64::MIN when None.
pub from: Option<f64>,
/// The to range of the bucket. Equals f64::MAX when None. Open interval, `to` is not
/// inclusive.
pub to: Option<f64>,
}
impl Debug for SegmentRangeBucketEntry {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentRangeBucketEntry")
.field("key", &self.key)
.field("doc_count", &self.doc_count)
.field("from", &self.from)
.field("to", &self.to)
.finish()
}
}
impl SegmentRangeBucketEntry {
pub(crate) fn into_intermediate_bucket_entry(
self,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateRangeBucketEntry> {
let sub_aggregation = if let Some(sub_aggregation) = self.sub_aggregation {
sub_aggregation.into_intermediate_aggregations_result(agg_with_accessor)?
} else {
Default::default()
};
Ok(IntermediateRangeBucketEntry {
key: self.key,
doc_count: self.doc_count,
sub_aggregation,
from: self.from,
to: self.to,
})
}
}
impl SegmentRangeCollector {
pub fn into_intermediate_bucket_result(self) -> IntermediateBucketResult {
pub fn into_intermediate_bucket_result(
self,
agg_with_accessor: &BucketAggregationWithAccessor,
) -> crate::Result<IntermediateBucketResult> {
let field_type = self.field_type;
let buckets = self
.buckets
.into_iter()
.map(move |range_bucket| {
(
Ok((
range_to_string(&range_bucket.range, &field_type),
range_bucket.bucket.into(),
)
range_bucket
.bucket
.into_intermediate_bucket_entry(&agg_with_accessor.sub_aggregation)?,
))
})
.collect();
.collect::<crate::Result<_>>()?;
IntermediateBucketResult::Range(buckets)
Ok(IntermediateBucketResult::Range(
IntermediateRangeBucketResult { buckets },
))
}
pub(crate) fn from_req_and_validate(
@@ -170,11 +226,15 @@ impl SegmentRangeCollector {
force_flush: bool,
) {
let mut iter = doc.chunks_exact(4);
let accessor = bucket_with_accessor
.accessor
.as_single()
.expect("unexpected fast field cardinatility");
for docs in iter.by_ref() {
let val1 = bucket_with_accessor.accessor.get(docs[0]);
let val2 = bucket_with_accessor.accessor.get(docs[1]);
let val3 = bucket_with_accessor.accessor.get(docs[2]);
let val4 = bucket_with_accessor.accessor.get(docs[3]);
let val1 = accessor.get(docs[0]);
let val2 = accessor.get(docs[1]);
let val3 = accessor.get(docs[2]);
let val4 = accessor.get(docs[3]);
let bucket_pos1 = self.get_bucket_pos(val1);
let bucket_pos2 = self.get_bucket_pos(val2);
let bucket_pos3 = self.get_bucket_pos(val3);
@@ -186,7 +246,7 @@ impl SegmentRangeCollector {
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation);
}
for doc in iter.remainder() {
let val = bucket_with_accessor.accessor.get(*doc);
let val = accessor.get(*doc);
let bucket_pos = self.get_bucket_pos(val);
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation);
}
@@ -341,7 +401,8 @@ mod tests {
ranges,
};
SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type).unwrap()
SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type)
.expect("unexpected error")
}
#[test]
@@ -482,11 +543,7 @@ mod tests {
#[test]
fn range_binary_search_test_f64() {
let ranges = vec![
//(f64::MIN..10.0).into(),
(10.0..100.0).into(),
//(100.0..f64::MAX).into(),
];
let ranges = vec![(10.0..100.0).into()];
let collector = get_collector_from_ranges(ranges, Type::F64);
let search = |val: u64| collector.get_bucket_pos(val);

File diff suppressed because it is too large.

@@ -5,7 +5,7 @@ use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::SegmentAggregationResultsCollector;
use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_validate;
use crate::collector::{Collector, SegmentCollector};
use crate::{SegmentReader, TantivyError};
use crate::SegmentReader;
/// Collector for aggregations.
///
@@ -75,13 +75,7 @@ impl Collector for AggregationCollector {
_segment_local_id: crate::SegmentOrdinal,
reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let aggs_with_accessor = get_aggs_with_accessor_and_validate(&self.agg, reader)?;
let result =
SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?;
Ok(AggregationSegmentCollector {
aggs: aggs_with_accessor,
result,
})
AggregationSegmentCollector::from_agg_req_and_reader(&self.agg, reader)
}
fn requires_scoring(&self) -> bool {
@@ -92,28 +86,28 @@ impl Collector for AggregationCollector {
&self,
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
) -> crate::Result<Self::Fruit> {
merge_fruits(segment_fruits).map(|res| res.into())
let res = merge_fruits(segment_fruits)?;
AggregationResults::from_intermediate_and_req(res, self.agg.clone())
}
}
fn merge_fruits(
mut segment_fruits: Vec<IntermediateAggregationResults>,
mut segment_fruits: Vec<crate::Result<IntermediateAggregationResults>>,
) -> crate::Result<IntermediateAggregationResults> {
if let Some(mut fruit) = segment_fruits.pop() {
if let Some(fruit) = segment_fruits.pop() {
let mut fruit = fruit?;
for next_fruit in segment_fruits {
fruit.merge_fruits(&next_fruit);
fruit.merge_fruits(next_fruit?);
}
Ok(fruit)
} else {
Err(TantivyError::InvalidArgument(
"no fruits provided in merge_fruits".to_string(),
))
Ok(IntermediateAggregationResults::default())
}
}
/// AggregationSegmentCollector does the aggregation collection on a segment.
pub struct AggregationSegmentCollector {
aggs: AggregationsWithAccessor,
aggs_with_accessor: AggregationsWithAccessor,
result: SegmentAggregationResultsCollector,
}
@@ -128,22 +122,24 @@ impl AggregationSegmentCollector {
let result =
SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?;
Ok(AggregationSegmentCollector {
aggs: aggs_with_accessor,
aggs_with_accessor,
result,
})
}
}
impl SegmentCollector for AggregationSegmentCollector {
type Fruit = IntermediateAggregationResults;
type Fruit = crate::Result<IntermediateAggregationResults>;
#[inline]
fn collect(&mut self, doc: crate::DocId, _score: crate::Score) {
self.result.collect(doc, &self.aggs);
self.result.collect(doc, &self.aggs_with_accessor);
}
fn harvest(mut self) -> Self::Fruit {
self.result.flush_staged_docs(&self.aggs, true);
self.result.into()
self.result
.flush_staged_docs(&self.aggs_with_accessor, true);
self.result
.into_intermediate_aggregations_result(&self.aggs_with_accessor)
}
}
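
For reference, the intended usage after this change, as a sketch that assumes searcher, query and agg_req already exist: AggregationCollector returns the final results directly, while DistributedAggregationCollector returns intermediate results that are converted together with the request.

// Sketch only: single-node vs. distributed collection.
let final_res: AggregationResults =
    searcher.search(&query, &AggregationCollector::from_aggs(agg_req.clone()))?;

let intermediate: IntermediateAggregationResults =
    searcher.search(&query, &DistributedAggregationCollector::from_aggs(agg_req.clone()))?;
let merged_res = AggregationResults::from_intermediate_and_req(intermediate, agg_req)?;
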


@@ -2,43 +2,90 @@
//! Intermediate aggregation results can be used to merge results between segments or between
//! indices.
use std::collections::HashMap;
use std::cmp::Ordering;
use fnv::FnvHashMap;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use super::metric::{IntermediateAverage, IntermediateStats};
use super::segment_agg_result::{
SegmentAggregationResultsCollector, SegmentBucketResultCollector, SegmentMetricResultCollector,
SegmentRangeBucketEntry,
use super::agg_req::{AggregationsInternal, BucketAggregationType, MetricAggregation};
use super::agg_result::BucketResult;
use super::bucket::{
cut_off_buckets, get_agg_name_and_property, GetDocCount, Order, OrderTarget,
SegmentHistogramBucketEntry, TermsAggregation,
};
use super::metric::{IntermediateAverage, IntermediateStats};
use super::segment_agg_result::SegmentMetricResultCollector;
use super::{Key, SerializedKey, VecWithNames};
use crate::aggregation::agg_result::{AggregationResults, BucketEntry};
use crate::aggregation::bucket::TermsAggregationInternal;
/// Contains the intermediate aggregation result, which is optimized to be merged with other
/// intermediate results.
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateAggregationResults(pub(crate) VecWithNames<IntermediateAggregationResult>);
impl From<SegmentAggregationResultsCollector> for IntermediateAggregationResults {
fn from(tree: SegmentAggregationResultsCollector) -> Self {
let mut data = vec![];
for (key, bucket) in tree.buckets.into_iter() {
data.push((key, IntermediateAggregationResult::Bucket(bucket.into())));
}
for (key, metric) in tree.metrics.into_iter() {
data.push((key, IntermediateAggregationResult::Metric(metric.into())));
}
Self(VecWithNames::from_entries(data))
}
pub struct IntermediateAggregationResults {
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) metrics: Option<VecWithNames<IntermediateMetricResult>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) buckets: Option<VecWithNames<IntermediateBucketResult>>,
}
impl IntermediateAggregationResults {
pub(crate) fn empty_from_req(req: &AggregationsInternal) -> Self {
let metrics = if req.metrics.is_empty() {
None
} else {
let metrics = req
.metrics
.iter()
.map(|(key, req)| {
(
key.to_string(),
IntermediateMetricResult::empty_from_req(req),
)
})
.collect();
Some(VecWithNames::from_entries(metrics))
};
let buckets = if req.buckets.is_empty() {
None
} else {
let buckets = req
.buckets
.iter()
.map(|(key, req)| {
(
key.to_string(),
IntermediateBucketResult::empty_from_req(&req.bucket_agg),
)
})
.collect();
Some(VecWithNames::from_entries(buckets))
};
Self { metrics, buckets }
}
/// Merge another intermediate aggregation result into this result.
///
/// The order of the values needs to be the same in both results. This is ensured when the
/// same keys are present in the underlying VecWithNames struct.
pub fn merge_fruits(&mut self, other: &IntermediateAggregationResults) {
for (tree_left, tree_right) in self.0.values_mut().zip(other.0.values()) {
tree_left.merge_fruits(tree_right);
pub fn merge_fruits(&mut self, other: IntermediateAggregationResults) {
if let (Some(buckets_left), Some(buckets_right)) = (&mut self.buckets, other.buckets) {
for (bucket_left, bucket_right) in
buckets_left.values_mut().zip(buckets_right.into_values())
{
bucket_left.merge_fruits(bucket_right);
}
}
if let (Some(metrics_left), Some(metrics_right)) = (&mut self.metrics, other.metrics) {
for (metric_left, metric_right) in
metrics_left.values_mut().zip(metrics_right.into_values())
{
metric_left.merge_fruits(metric_right);
}
}
}
}
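
Because the intermediate type is Serialize/Deserialize, fruits can be shipped between nodes and merged on the receiving side. A small sketch mirroring the roundtrip used in the tests; it assumes a mutable local_fruit: IntermediateAggregationResults and a remote_json string received from another node:

// Sketch only: deserialize a fruit received from another node and fold it in.
let remote: IntermediateAggregationResults =
    serde_json::from_str(&remote_json).expect("valid intermediate result JSON");
local_fruit.merge_fruits(remote);
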
@@ -52,28 +99,6 @@ pub enum IntermediateAggregationResult {
Metric(IntermediateMetricResult),
}
impl IntermediateAggregationResult {
fn merge_fruits(&mut self, other: &IntermediateAggregationResult) {
match (self, other) {
(
IntermediateAggregationResult::Bucket(res_left),
IntermediateAggregationResult::Bucket(res_right),
) => {
res_left.merge_fruits(res_right);
}
(
IntermediateAggregationResult::Metric(res_left),
IntermediateAggregationResult::Metric(res_right),
) => {
res_left.merge_fruits(res_right);
}
_ => {
panic!("incompatible types in aggregation tree on merge fruits");
}
}
}
}
/// Holds the intermediate data for metric results
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum IntermediateMetricResult {
@@ -97,7 +122,17 @@ impl From<SegmentMetricResultCollector> for IntermediateMetricResult {
}
impl IntermediateMetricResult {
fn merge_fruits(&mut self, other: &IntermediateMetricResult) {
pub(crate) fn empty_from_req(req: &MetricAggregation) -> Self {
match req {
MetricAggregation::Average(_) => {
IntermediateMetricResult::Average(IntermediateAverage::default())
}
MetricAggregation::Stats(_) => {
IntermediateMetricResult::Stats(IntermediateStats::default())
}
}
}
fn merge_fruits(&mut self, other: IntermediateMetricResult) {
match (self, other) {
(
IntermediateMetricResult::Average(avg_data_left),
@@ -112,7 +147,7 @@ impl IntermediateMetricResult {
stats_left.merge_fruits(stats_right);
}
_ => {
panic!("incompatible fruit types in tree {:?}", other);
panic!("incompatible fruit types in tree");
}
}
}
@@ -124,36 +159,227 @@ impl IntermediateMetricResult {
pub enum IntermediateBucketResult {
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
/// sub_aggregations.
Range(HashMap<SerializedKey, IntermediateRangeBucketEntry>),
Range(IntermediateRangeBucketResult),
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
Histogram {
/// The buckets
buckets: Vec<IntermediateHistogramBucketEntry>,
},
/// Term aggregation
Terms(IntermediateTermBucketResult),
}
impl From<SegmentBucketResultCollector> for IntermediateBucketResult {
fn from(collector: SegmentBucketResultCollector) -> Self {
match collector {
SegmentBucketResultCollector::Range(range) => range.into_intermediate_bucket_result(),
impl IntermediateBucketResult {
pub(crate) fn empty_from_req(req: &BucketAggregationType) -> Self {
match req {
BucketAggregationType::Terms(_) => IntermediateBucketResult::Terms(Default::default()),
BucketAggregationType::Range(_) => IntermediateBucketResult::Range(Default::default()),
BucketAggregationType::Histogram(_) => {
IntermediateBucketResult::Histogram { buckets: vec![] }
}
}
}
fn merge_fruits(&mut self, other: IntermediateBucketResult) {
match (self, other) {
(
IntermediateBucketResult::Terms(term_res_left),
IntermediateBucketResult::Terms(term_res_right),
) => {
merge_maps(&mut term_res_left.entries, term_res_right.entries);
term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count;
term_res_left.doc_count_error_upper_bound +=
term_res_right.doc_count_error_upper_bound;
}
(
IntermediateBucketResult::Range(range_res_left),
IntermediateBucketResult::Range(range_res_right),
) => {
merge_maps(&mut range_res_left.buckets, range_res_right.buckets);
}
(
IntermediateBucketResult::Histogram {
buckets: buckets_left,
..
},
IntermediateBucketResult::Histogram {
buckets: buckets_right,
..
},
) => {
let buckets = buckets_left
.drain(..)
.merge_join_by(buckets_right.into_iter(), |left, right| {
left.key.partial_cmp(&right.key).unwrap_or(Ordering::Equal)
})
.map(|either| match either {
itertools::EitherOrBoth::Both(mut left, right) => {
left.merge_fruits(right);
left
}
itertools::EitherOrBoth::Left(left) => left,
itertools::EitherOrBoth::Right(right) => right,
})
.collect();
*buckets_left = buckets;
}
(IntermediateBucketResult::Range(_), _) => {
panic!("try merge on different types")
}
(IntermediateBucketResult::Histogram { .. }, _) => {
panic!("try merge on different types")
}
(IntermediateBucketResult::Terms { .. }, _) => {
panic!("try merge on different types")
}
}
}
}
impl IntermediateBucketResult {
fn merge_fruits(&mut self, other: &IntermediateBucketResult) {
match (self, other) {
(
IntermediateBucketResult::Range(entries_left),
IntermediateBucketResult::Range(entries_right),
) => {
for (name, entry_left) in entries_left.iter_mut() {
if let Some(entry_right) = entries_right.get(name) {
entry_left.merge_fruits(entry_right);
}
}
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Intermediate result of a range aggregation
pub struct IntermediateRangeBucketResult {
pub(crate) buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>,
}
for (key, res) in entries_right.iter() {
if !entries_left.contains_key(key) {
entries_left.insert(key.clone(), res.clone());
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Term aggregation including error counts
pub struct IntermediateTermBucketResult {
pub(crate) entries: FnvHashMap<String, IntermediateTermBucketEntry>,
pub(crate) sum_other_doc_count: u64,
pub(crate) doc_count_error_upper_bound: u64,
}
impl IntermediateTermBucketResult {
pub(crate) fn into_final_result(
self,
req: &TermsAggregation,
sub_aggregation_req: &AggregationsInternal,
) -> crate::Result<BucketResult> {
let req = TermsAggregationInternal::from_req(req);
let mut buckets: Vec<BucketEntry> = self
.entries
.into_iter()
.filter(|bucket| bucket.1.doc_count >= req.min_doc_count)
.map(|(key, entry)| {
Ok(BucketEntry {
key: Key::Str(key),
doc_count: entry.doc_count,
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
entry.sub_aggregation,
sub_aggregation_req,
)?,
})
})
.collect::<crate::Result<_>>()?;
let order = req.order.order;
match req.order.target {
OrderTarget::Key => {
buckets.sort_by(|left, right| {
if req.order.order == Order::Desc {
left.key.partial_cmp(&right.key)
} else {
right.key.partial_cmp(&left.key)
}
.expect("expected type string, which is always sortable")
});
}
OrderTarget::Count => {
if req.order.order == Order::Desc {
buckets.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.doc_count()));
} else {
buckets.sort_unstable_by_key(|bucket| bucket.doc_count());
}
}
OrderTarget::SubAggregation(name) => {
let (agg_name, agg_property) = get_agg_name_and_property(&name);
let mut buckets_with_val = buckets
.into_iter()
.map(|bucket| {
let val = bucket
.sub_aggregation
.get_value_from_aggregation(agg_name, agg_property)?
.unwrap_or(f64::NAN);
Ok((bucket, val))
})
.collect::<crate::Result<Vec<_>>>()?;
buckets_with_val.sort_by(|(_, val1), (_, val2)| {
// TODO use total_cmp in the next stable Rust release
match &order {
Order::Desc => val2.partial_cmp(val1).unwrap_or(std::cmp::Ordering::Equal),
Order::Asc => val1.partial_cmp(val2).unwrap_or(std::cmp::Ordering::Equal),
}
});
buckets = buckets_with_val
.into_iter()
.map(|(bucket, _val)| bucket)
.collect_vec();
}
}
// We ignore _term_doc_count_before_cutoff here, because it increases the upper bound error
// only for terms that didn't make it into the top N.
//
// This can be interesting as a measure of the quality of the results, but it is not suitable
// for checking the actual error count of the returned terms.
let (_term_doc_count_before_cutoff, sum_other_doc_count) =
cut_off_buckets(&mut buckets, req.size as usize);
let doc_count_error_upper_bound = if req.show_term_doc_count_error {
Some(self.doc_count_error_upper_bound)
} else {
None
};
Ok(BucketResult::Terms {
buckets,
sum_other_doc_count: self.sum_other_doc_count + sum_other_doc_count,
doc_count_error_upper_bound,
})
}
}
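
The finalization above is driven by a TermsAggregation request; building one programmatically follows the same pattern as the tests further down (a sketch, with all optional parameters left at their defaults and "my_terms" as an example name):

// Sketch only: a terms bucket aggregation on the "string_id" fast field.
let terms_agg = Aggregation::Bucket(BucketAggregation {
    bucket_agg: BucketAggregationType::Terms(TermsAggregation {
        field: "string_id".to_string(),
        ..Default::default()
    }),
    sub_aggregation: Default::default(),
});
let agg_req: Aggregations = vec![("my_terms".to_string(), terms_agg)].into_iter().collect();
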
trait MergeFruits {
fn merge_fruits(&mut self, other: Self);
}
fn merge_maps<V: MergeFruits + Clone>(
entries_left: &mut FnvHashMap<SerializedKey, V>,
mut entries_right: FnvHashMap<SerializedKey, V>,
) {
for (name, entry_left) in entries_left.iter_mut() {
if let Some(entry_right) = entries_right.remove(name) {
entry_left.merge_fruits(entry_right);
}
}
for (key, res) in entries_right.into_iter() {
entries_left.entry(key).or_insert(res);
}
}
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateHistogramBucketEntry {
/// The unique key by which the bucket is identified.
pub key: f64,
/// The number of documents in the bucket.
pub doc_count: u64,
/// The sub_aggregation in this bucket.
pub sub_aggregation: IntermediateAggregationResults,
}
impl From<SegmentHistogramBucketEntry> for IntermediateHistogramBucketEntry {
fn from(entry: SegmentHistogramBucketEntry) -> Self {
IntermediateHistogramBucketEntry {
key: entry.key,
doc_count: entry.doc_count,
sub_aggregation: Default::default(),
}
}
}
@@ -166,7 +392,6 @@ pub struct IntermediateRangeBucketEntry {
pub key: Key,
/// The number of documents in the bucket.
pub doc_count: u64,
pub(crate) values: Option<Vec<u64>>,
/// The sub_aggregation in this bucket.
pub sub_aggregation: IntermediateAggregationResults,
/// The from range of the bucket. Equals f64::MIN when None.
@@ -177,49 +402,54 @@ pub struct IntermediateRangeBucketEntry {
pub to: Option<f64>,
}
impl From<SegmentRangeBucketEntry> for IntermediateRangeBucketEntry {
fn from(entry: SegmentRangeBucketEntry) -> Self {
let sub_aggregation = if let Some(sub_aggregation) = entry.sub_aggregation {
sub_aggregation.into()
} else {
Default::default()
};
// let sub_aggregation = entry.sub_aggregation.into();
/// This is the term entry for a bucket, which contains a count, and optionally
/// sub_aggregations.
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateTermBucketEntry {
/// The number of documents in the bucket.
pub doc_count: u64,
/// The sub_aggregation in this bucket.
pub sub_aggregation: IntermediateAggregationResults,
}
IntermediateRangeBucketEntry {
key: entry.key,
doc_count: entry.doc_count,
values: None,
sub_aggregation,
to: entry.to,
from: entry.from,
}
impl MergeFruits for IntermediateTermBucketEntry {
fn merge_fruits(&mut self, other: IntermediateTermBucketEntry) {
self.doc_count += other.doc_count;
self.sub_aggregation.merge_fruits(other.sub_aggregation);
}
}
impl IntermediateRangeBucketEntry {
fn merge_fruits(&mut self, other: &IntermediateRangeBucketEntry) {
impl MergeFruits for IntermediateRangeBucketEntry {
fn merge_fruits(&mut self, other: IntermediateRangeBucketEntry) {
self.doc_count += other.doc_count;
self.sub_aggregation.merge_fruits(&other.sub_aggregation);
self.sub_aggregation.merge_fruits(other.sub_aggregation);
}
}
impl MergeFruits for IntermediateHistogramBucketEntry {
fn merge_fruits(&mut self, other: IntermediateHistogramBucketEntry) {
self.doc_count += other.doc_count;
self.sub_aggregation.merge_fruits(other.sub_aggregation);
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use pretty_assertions::assert_eq;
use super::*;
fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults {
let mut map = HashMap::new();
let mut buckets = HashMap::new();
let mut buckets = FnvHashMap::default();
for (key, doc_count) in data {
buckets.insert(
key.to_string(),
IntermediateRangeBucketEntry {
key: Key::Str(key.to_string()),
doc_count: *doc_count,
values: None,
sub_aggregation: Default::default(),
from: None,
to: None,
@@ -228,21 +458,25 @@ mod tests {
}
map.insert(
"my_agg_level2".to_string(),
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(buckets)),
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
);
IntermediateAggregationResults(VecWithNames::from_entries(map.into_iter().collect()))
IntermediateAggregationResults {
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
metrics: Default::default(),
}
}
fn get_test_tree(data: &[(String, u64, String, u64)]) -> IntermediateAggregationResults {
fn get_intermediat_tree_with_ranges(
data: &[(String, u64, String, u64)],
) -> IntermediateAggregationResults {
let mut map = HashMap::new();
let mut buckets = HashMap::new();
let mut buckets: FnvHashMap<_, _> = Default::default();
for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data {
buckets.insert(
key.to_string(),
IntermediateRangeBucketEntry {
key: Key::Str(key.to_string()),
doc_count: *doc_count,
values: None,
from: None,
to: None,
sub_aggregation: get_sub_test_tree(&[(
@@ -254,25 +488,28 @@ mod tests {
}
map.insert(
"my_agg_level1".to_string(),
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(buckets)),
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
);
IntermediateAggregationResults(VecWithNames::from_entries(map.into_iter().collect()))
IntermediateAggregationResults {
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
metrics: Default::default(),
}
}
#[test]
fn test_merge_fruits_tree_1() {
let mut tree_left = get_test_tree(&[
let mut tree_left = get_intermediat_tree_with_ranges(&[
("red".to_string(), 50, "1900".to_string(), 25),
("blue".to_string(), 30, "1900".to_string(), 30),
]);
let tree_right = get_test_tree(&[
let tree_right = get_intermediat_tree_with_ranges(&[
("red".to_string(), 60, "1900".to_string(), 30),
("blue".to_string(), 25, "1900".to_string(), 50),
]);
tree_left.merge_fruits(&tree_right);
tree_left.merge_fruits(tree_right);
let tree_expected = get_test_tree(&[
let tree_expected = get_intermediat_tree_with_ranges(&[
("red".to_string(), 110, "1900".to_string(), 55),
("blue".to_string(), 55, "1900".to_string(), 80),
]);
@@ -282,18 +519,18 @@ mod tests {
#[test]
fn test_merge_fruits_tree_2() {
let mut tree_left = get_test_tree(&[
let mut tree_left = get_intermediat_tree_with_ranges(&[
("red".to_string(), 50, "1900".to_string(), 25),
("blue".to_string(), 30, "1900".to_string(), 30),
]);
let tree_right = get_test_tree(&[
let tree_right = get_intermediat_tree_with_ranges(&[
("red".to_string(), 60, "1900".to_string(), 30),
("green".to_string(), 25, "1900".to_string(), 50),
]);
tree_left.merge_fruits(&tree_right);
tree_left.merge_fruits(tree_right);
let tree_expected = get_test_tree(&[
let tree_expected = get_intermediat_tree_with_ranges(&[
("red".to_string(), 110, "1900".to_string(), 55),
("blue".to_string(), 30, "1900".to_string(), 30),
("green".to_string(), 25, "1900".to_string(), 50),
@@ -301,4 +538,18 @@ mod tests {
assert_eq!(tree_left, tree_expected);
}
#[test]
fn test_merge_fruits_tree_empty() {
let mut tree_left = get_intermediat_tree_with_ranges(&[
("red".to_string(), 50, "1900".to_string(), 25),
("blue".to_string(), 30, "1900".to_string(), 30),
]);
let orig = tree_left.clone();
tree_left.merge_fruits(IntermediateAggregationResults::default());
assert_eq!(tree_left, orig);
}
}


@@ -19,8 +19,8 @@ use crate::DocId;
/// "avg": {
/// "field": "score",
/// }
/// }
/// ```
/// }
/// ```
pub struct AverageAggregation {
/// The field name to compute the stats on.
pub field: String,
@@ -94,7 +94,7 @@ impl IntermediateAverage {
}
/// Merge average data into this instance.
pub fn merge_fruits(&mut self, other: &IntermediateAverage) {
pub fn merge_fruits(&mut self, other: IntermediateAverage) {
self.sum += other.sum;
self.doc_count += other.doc_count;
}


@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
use crate::aggregation::f64_from_fastfield_u64;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::schema::Type;
use crate::DocId;
use crate::{DocId, TantivyError};
/// A multi-value metric aggregation that computes stats of numeric values that are
/// extracted from the aggregated documents.
@@ -17,7 +17,7 @@ use crate::DocId;
/// "field": "score",
/// }
/// }
/// ```
/// ```
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct StatsAggregation {
@@ -53,6 +53,23 @@ pub struct Stats {
pub avg: Option<f64>,
}
impl Stats {
pub(crate) fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
match agg_property {
"count" => Ok(Some(self.count as f64)),
"sum" => Ok(Some(self.sum)),
"standard_deviation" => Ok(self.standard_deviation),
"min" => Ok(self.min),
"max" => Ok(self.max),
"avg" => Ok(self.avg),
_ => Err(TantivyError::InvalidArgument(format!(
"unknown property {} on stats metric aggregation",
agg_property
))),
}
}
}
/// IntermediateStats contains the mergeable version for stats.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateStats {
@@ -62,9 +79,8 @@ pub struct IntermediateStats {
min: f64,
max: f64,
}
impl IntermediateStats {
fn new() -> Self {
impl Default for IntermediateStats {
fn default() -> Self {
Self {
count: 0,
sum: 0.0,
@@ -73,7 +89,9 @@ impl IntermediateStats {
max: f64::MIN,
}
}
}
impl IntermediateStats {
pub(crate) fn avg(&self) -> Option<f64> {
if self.count == 0 {
None
@@ -92,7 +110,7 @@ impl IntermediateStats {
}
/// Merge data from other stats into this instance.
pub fn merge_fruits(&mut self, other: &IntermediateStats) {
pub fn merge_fruits(&mut self, other: IntermediateStats) {
self.count += other.count;
self.sum += other.sum;
self.squared_sum += other.squared_sum;
@@ -142,7 +160,7 @@ impl SegmentStatsCollector {
pub fn from_req(field_type: Type) -> Self {
Self {
field_type,
stats: IntermediateStats::new(),
stats: IntermediateStats::default(),
}
}
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
@@ -182,12 +200,50 @@ mod tests {
};
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::metric::StatsAggregation;
use crate::aggregation::tests::get_test_index_2_segments;
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values};
use crate::aggregation::AggregationCollector;
use crate::query::TermQuery;
use crate::query::{AllQuery, TermQuery};
use crate::schema::IndexRecordOption;
use crate::Term;
#[test]
fn test_aggregation_stats_empty_index() -> crate::Result<()> {
// test index without segments
let values = vec![];
let index = get_test_index_from_values(false, &values)?;
let agg_req_1: Aggregations = vec![(
"stats".to_string(),
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
"score".to_string(),
))),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1);
let reader = index.reader()?;
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(
res["stats"],
json!({
"avg": Value::Null,
"count": 0,
"max": Value::Null,
"min": Value::Null,
"standard_deviation": Value::Null,
"sum": 0.0
})
);
Ok(())
}
#[test]
fn test_aggregation_stats() -> crate::Result<()> {
let index = get_test_index_2_segments(false)?;


@@ -18,6 +18,11 @@
//! Create an [AggregationCollector] from this request. AggregationCollector implements the
//! `Collector` trait and can be passed as collector into `searcher.search()`.
//!
//! #### Limitations
//!
//! Currently aggregations work only on single value fast fields of type u64, f64 and i64,
//! and on text fast fields.
//!
//! # JSON Format
//! Aggregations request and result structures de/serialize into elasticsearch compatible JSON.
//!
@@ -28,9 +33,15 @@
//! let agg_res = searcher.search(&term_query, &collector).unwrap_err();
//! let json_response_string: String = &serde_json::to_string(&agg_res)?;
//! ```
//! # Limitations
//!
//! Currently aggregations work only on single value fast fields of type u64, f64 and i64.
//! # Supported Aggregations
//! - [Bucket](bucket)
//! - [Histogram](bucket::HistogramAggregation)
//! - [Range](bucket::RangeAggregation)
//! - [Terms](bucket::TermsAggregation)
//! - [Metric](metric)
//! - [Average](metric::AverageAggregation)
//! - [Stats](metric::StatsAggregation)
//!
//! # Example
//! Compute the average metric, by building [agg_req::Aggregations], which is built from an (String,
@@ -90,7 +101,8 @@
//! }
//! }
//! "#;
//! let agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
//! let agg_req: Aggregations =
//! serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
//! ```
//! # Code Organization
//!
@@ -101,8 +113,7 @@
//! Buckets can contain sub-aggregations. In this example we create buckets with the range
//! aggregation and then calculate the average on each bucket.
//! ```
//! use tantivy::aggregation::agg_req::{Aggregations, Aggregation, BucketAggregation,
//! MetricAggregation, BucketAggregationType};
//! use tantivy::aggregation::agg_req::*;
//! use tantivy::aggregation::metric::AverageAggregation;
//! use tantivy::aggregation::bucket::RangeAggregation;
//! let sub_agg_req_1: Aggregations = vec![(
@@ -138,7 +149,8 @@
//! IntermediateAggregationResults provides the
//! [merge_fruits](intermediate_agg_result::IntermediateAggregationResults::merge_fruits) method to
//! merge multiple results. The merged result can then be converted into
//! [agg_result::AggregationResults] via the [Into] trait.
//! [agg_result::AggregationResults] via the
//! [agg_result::AggregationResults::from_intermediate_and_req] method.
pub mod agg_req;
mod agg_req_with_accessor;
@@ -167,6 +179,7 @@ pub(crate) struct VecWithNames<T: Clone> {
values: Vec<T>,
keys: Vec<String>,
}
impl<T: Clone> Default for VecWithNames<T> {
fn default() -> Self {
Self {
@@ -189,6 +202,14 @@ impl<T: Clone> From<HashMap<String, T>> for VecWithNames<T> {
}
impl<T: Clone> VecWithNames<T> {
fn from_other<K: Clone + Into<T>>(entries: VecWithNames<K>) -> Self {
let values = entries.values.into_iter().map(Into::into).collect();
Self {
keys: entries.keys,
values,
}
}
fn from_entries(mut entries: Vec<(String, T)>) -> Self {
// Sort to ensure order of elements match across multiple instances
entries.sort_by(|left, right| left.0.cmp(&right.0));
@@ -212,6 +233,9 @@ impl<T: Clone> VecWithNames<T> {
fn keys(&self) -> impl Iterator<Item = &str> + '_ {
self.keys.iter().map(|key| key.as_str())
}
fn into_values(self) -> impl Iterator<Item = T> {
self.values.into_iter()
}
fn values(&self) -> impl Iterator<Item = &T> + '_ {
self.values.iter()
}
@@ -224,6 +248,14 @@ impl<T: Clone> VecWithNames<T> {
fn is_empty(&self) -> bool {
self.keys.is_empty()
}
fn len(&self) -> usize {
self.keys.len()
}
fn get(&self, name: &str) -> Option<&T> {
self.keys()
.position(|key| key == name)
.map(|pos| &self.values[pos])
}
}
/// The serialized key is used in a HashMap.
@@ -284,21 +316,22 @@ pub(crate) fn f64_to_fastfield_u64(val: f64, field_type: &Type) -> Option<u64> {
#[cfg(test)]
mod tests {
use futures::executor::block_on;
use serde_json::Value;
use super::agg_req::{Aggregation, Aggregations, BucketAggregation};
use super::bucket::RangeAggregation;
use super::collector::AggregationCollector;
use super::metric::AverageAggregation;
use crate::aggregation::agg_req::{BucketAggregationType, MetricAggregation};
use crate::aggregation::agg_req::{
get_term_dict_field_names, BucketAggregationType, MetricAggregation,
};
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::bucket::TermsAggregation;
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::segment_agg_result::DOC_BLOCK_SIZE;
use crate::aggregation::DistributedAggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{Index, Term};
fn get_avg_req(field_name: &str) -> Aggregation {
@@ -310,16 +343,87 @@ mod tests {
pub fn get_test_index_with_num_docs(
merge_segments: bool,
num_docs: usize,
) -> crate::Result<Index> {
get_test_index_from_values(
merge_segments,
&(0..num_docs).map(|el| el as f64).collect::<Vec<f64>>(),
)
}
pub fn exec_request(agg_req: Aggregations, index: &Index) -> crate::Result<Value> {
exec_request_with_query(agg_req, index, None)
}
pub fn exec_request_with_query(
agg_req: Aggregations,
index: &Index,
query: Option<(&str, &str)>,
) -> crate::Result<Value> {
let collector = AggregationCollector::from_aggs(agg_req);
let reader = index.reader()?;
let searcher = reader.searcher();
let agg_res = if let Some((field, term)) = query {
let text_field = reader.searcher().schema().get_field(field).unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, term),
IndexRecordOption::Basic,
);
searcher.search(&term_query, &collector)?
} else {
searcher.search(&AllQuery, &collector)?
};
// Test serialization/deserialization roundtrip
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
Ok(res)
}
pub fn get_test_index_from_values(
merge_segments: bool,
values: &[f64],
) -> crate::Result<Index> {
// Every value gets its own segment
let mut segment_and_values = vec![];
for value in values {
segment_and_values.push(vec![(*value, value.to_string())]);
}
get_test_index_from_values_and_terms(merge_segments, &segment_and_values)
}
pub fn get_test_index_from_terms(
merge_segments: bool,
values: &[Vec<&str>],
) -> crate::Result<Index> {
// Every value gets its own segment
let segment_and_values = values
.iter()
.map(|terms| {
terms
.iter()
.enumerate()
.map(|(i, term)| (i as f64, term.to_string()))
.collect()
})
.collect::<Vec<_>>();
get_test_index_from_values_and_terms(merge_segments, &segment_and_values)
}
pub fn get_test_index_from_values_and_terms(
merge_segments: bool,
segment_and_values: &[Vec<(f64, String)>],
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_fast()
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
let string_field_id = schema_builder.add_text_field("string_id", STRING | FAST);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
@@ -332,25 +436,29 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests()?;
for i in 0..num_docs {
// writing the segment
index_writer.add_document(doc!(
text_field => "cool",
score_field => i as u64,
score_field_f64 => i as f64,
score_field_i64 => i as i64,
fraction_field => i as f64/100.0,
))?;
for values in segment_and_values {
for (i, term) in values {
let i = *i;
// writing the segment
index_writer.add_document(doc!(
text_field => "cool",
text_field_id => term.to_string(),
string_field_id => term.to_string(),
score_field => i as u64,
score_field_f64 => i as f64,
score_field_i64 => i as i64,
fraction_field => i as f64/100.0,
))?;
}
index_writer.commit()?;
}
index_writer.commit()?;
}
if merge_segments {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -362,22 +470,20 @@ mod tests {
merge_segments: bool,
use_distributed_collector: bool,
) -> crate::Result<()> {
let index = get_test_index_with_num_docs(merge_segments, 300)?;
let mut values_and_terms = (0..80)
.map(|val| vec![(val as f64, "terma".to_string())])
.collect::<Vec<_>>();
values_and_terms.last_mut().unwrap()[0].1 = "termb".to_string();
let index = get_test_index_from_values_and_terms(merge_segments, &values_and_terms)?;
let reader = index.reader()?;
let text_field = reader.searcher().schema().get_field("text").unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
assert_eq!(DOC_BLOCK_SIZE, 256);
assert_eq!(DOC_BLOCK_SIZE, 64);
// In the tree we cache documents in blocks of DOC_BLOCK_SIZE before passing them down as one block.
//
// Build a request so that on the first level we have one full cache, which is then flushed.
// The same cache should have some residue docs at the end, which are flushed (Range 0-266)
// -> 266 docs
// The same cache should have some residue docs at the end, which are flushed (Range 0-70)
// -> 70 docs
//
// The second level should also have some residue docs in the cache that are flushed at the
// end.
@@ -385,38 +491,70 @@ mod tests {
// A second bucket on the first level should have the cache unfilled
// let elasticsearch_compatible_json_req = r#"
let elasticsearch_compatible_json_req = r#"
let elasticsearch_compatible_json = json!(
{
"bucketsL1": {
"range": {
"field": "score",
"ranges": [ { "to": 3.0 }, { "from": 3.0, "to": 266.0 }, { "from": 266.0 } ]
"ranges": [ { "to": 3.0f64 }, { "from": 3.0f64, "to": 70.0f64 }, { "from": 70.0f64 } ]
},
"aggs": {
"bucketsL2": {
"range": {
"field": "score",
"ranges": [ { "to": 100.0 }, { "from": 100.0, "to": 266.0 }, { "from": 266.0 } ]
"ranges": [ { "to": 30.0f64 }, { "from": 30.0f64, "to": 70.0f64 }, { "from": 70.0f64 } ]
}
}
}
},
"histogram_test":{
"histogram": {
"field": "score",
"interval": 70.0,
"offset": 3.0,
},
"aggs": {
"bucketsL2": {
"histogram": {
"field": "score",
"interval": 70.0
}
}
}
},
"term_agg_test":{
"terms": {
"field": "string_id"
},
"aggs": {
"bucketsL2": {
"histogram": {
"field": "score",
"interval": 70.0
}
}
}
}
}
"#;
});
let agg_req: Aggregations =
serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
.unwrap();
let agg_res: AggregationResults = if use_distributed_collector {
let collector = DistributedAggregationCollector::from_aggs(agg_req);
let collector = DistributedAggregationCollector::from_aggs(agg_req.clone());
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap().into()
AggregationResults::from_intermediate_and_req(
searcher.search(&AllQuery, &collector).unwrap(),
agg_req,
)
.unwrap()
} else {
let collector = AggregationCollector::from_aggs(agg_req);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
searcher.search(&AllQuery, &collector).unwrap()
};
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
@@ -426,15 +564,15 @@ mod tests {
res["bucketsL1"]["buckets"][0]["bucketsL2"]["buckets"][0]["doc_count"],
3
);
assert_eq!(res["bucketsL1"]["buckets"][1]["key"], "3-266");
assert_eq!(res["bucketsL1"]["buckets"][1]["doc_count"], 266 - 3);
assert_eq!(res["bucketsL1"]["buckets"][1]["key"], "3-70");
assert_eq!(res["bucketsL1"]["buckets"][1]["doc_count"], 70 - 3);
assert_eq!(
res["bucketsL1"]["buckets"][1]["bucketsL2"]["buckets"][0]["doc_count"],
97
27
);
assert_eq!(
res["bucketsL1"]["buckets"][1]["bucketsL2"]["buckets"][1]["doc_count"],
166
40
);
assert_eq!(
res["bucketsL1"]["buckets"][1]["bucketsL2"]["buckets"][2]["doc_count"],
@@ -442,9 +580,49 @@ mod tests {
);
assert_eq!(
res["bucketsL1"]["buckets"][2]["bucketsL2"]["buckets"][2]["doc_count"],
300 - 266
80 - 70
);
assert_eq!(res["bucketsL1"]["buckets"][2]["doc_count"], 80 - 70);
assert_eq!(
res["term_agg_test"],
json!(
{
"buckets": [
{
"bucketsL2": {
"buckets": [
{
"doc_count": 70,
"key": 0.0
},
{
"doc_count": 9,
"key": 70.0
}
]
},
"doc_count": 79,
"key": "terma"
},
{
"bucketsL2": {
"buckets": [
{
"doc_count": 1,
"key": 70.0
}
]
},
"doc_count": 1,
"key": "termb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
)
);
assert_eq!(res["bucketsL1"]["buckets"][2]["doc_count"], 300 - 266);
Ok(())
}
@@ -461,12 +639,12 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_fast()
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
schema_builder.add_text_field("dummy_text", STRING);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
@@ -553,7 +731,7 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -671,10 +849,21 @@ mod tests {
IndexRecordOption::Basic,
);
let sub_agg_req: Aggregations =
vec![("average_in_range".to_string(), get_avg_req("score"))]
.into_iter()
.collect();
let sub_agg_req: Aggregations = vec![
("average_in_range".to_string(), get_avg_req("score")),
(
"term_agg".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
),
]
.into_iter()
.collect();
let agg_req: Aggregations = if use_elastic_json_req {
let elasticsearch_compatible_json_req = r#"
{
@@ -690,7 +879,8 @@ mod tests {
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
"average_in_range": { "avg": { "field": "score" } },
"term_agg": { "terms": { "field": "text" } }
}
},
"rangei64": {
@@ -705,7 +895,8 @@ mod tests {
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
"average_in_range": { "avg": { "field": "score" } },
"term_agg": { "terms": { "field": "text" } }
}
},
"average": {
@@ -723,7 +914,8 @@ mod tests {
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
"average_in_range": { "avg": { "field": "score" } },
"term_agg": { "terms": { "field": "text" } }
}
}
}
@@ -782,6 +974,9 @@ mod tests {
agg_req
};
let field_names = get_term_dict_field_names(&agg_req);
assert_eq!(field_names, vec!["text".to_string()].into_iter().collect());
let agg_res: AggregationResults = if use_distributed_collector {
let collector = DistributedAggregationCollector::from_aggs(agg_req.clone());
@@ -790,7 +985,7 @@ mod tests {
// Test de/serialization roundtrip on intermediate_agg_result
let res: IntermediateAggregationResults =
serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
res.into()
AggregationResults::from_intermediate_and_req(res, agg_req.clone()).unwrap()
} else {
let collector = AggregationCollector::from_aggs(agg_req.clone());
@@ -922,10 +1117,10 @@ mod tests {
searcher.search(&AllQuery, &collector).unwrap_err()
};
let agg_res = avg_on_field("text");
let agg_res = avg_on_field("dummy_text");
assert_eq!(
format!("{:?}", agg_res),
r#"InvalidArgument("Only single value fast fields of type f64, u64, i64 are supported, but got Str ")"#
r#"InvalidArgument("Only fast fields of type f64, u64, i64 are supported, but got Str ")"#
);
let agg_res = avg_on_field("not_exist_field");
@@ -937,7 +1132,7 @@ mod tests {
let agg_res = avg_on_field("scores_i64");
assert_eq!(
format!("{:?}", agg_res),
r#"InvalidArgument("Invalid field type in aggregation I64, only Cardinality::SingleValue supported")"#
r#"InvalidArgument("Invalid field cardinality on field scores_i64 expected SingleValue, but got MultiValues")"#
);
Ok(())
@@ -946,10 +1141,12 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::prelude::SliceRandom;
use rand::{thread_rng, Rng};
use test::{self, Bencher};
use super::*;
use crate::aggregation::bucket::{HistogramAggregation, HistogramBounds, TermsAggregation};
use crate::aggregation::metric::StatsAggregation;
use crate::query::AllQuery;
@@ -957,12 +1154,14 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let text_field_many_terms =
schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms =
schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
@@ -970,6 +1169,10 @@ mod tests {
schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = vec!["INFO", "ERROR", "WARN", "DEBUG"];
let many_terms_data = (0..15_000)
.map(|num| format!("author{}", num))
.collect::<Vec<_>>();
{
let mut rng = thread_rng();
let mut index_writer = index.writer_for_tests()?;
@@ -978,6 +1181,8 @@ mod tests {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
index_writer.add_document(doc!(
text_field => "cool",
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => val as f64,
score_field_i64 => val as i64,
@@ -990,7 +1195,7 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -1129,6 +1334,64 @@ mod tests {
});
}
#[bench]
fn bench_aggregation_terms_few(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_few_terms".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req);
let searcher = reader.searcher();
let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
});
}
#[bench]
fn bench_aggregation_terms_many(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_many_terms".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req);
let searcher = reader.searcher();
let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
});
}
#[bench]
fn bench_aggregation_range_only(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();
@@ -1165,6 +1428,110 @@ mod tests {
});
}
// hard bounds has a different algorithm, because it actually limits collection range
#[bench]
fn bench_aggregation_histogram_only_hard_bounds(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 100f64,
hard_bounds: Some(HistogramBounds {
min: 1000.0,
max: 300_000.0,
}),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1);
let searcher = reader.searcher();
let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
});
}
#[bench]
fn bench_aggregation_histogram_with_avg(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let sub_agg_req: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 100f64, // 1000 buckets
..Default::default()
}),
sub_aggregation: sub_agg_req,
}),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1);
let searcher = reader.searcher();
let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
});
}
#[bench]
fn bench_aggregation_histogram_only(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 100f64, // 1000 buckets
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req_1);
let searcher = reader.searcher();
let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
});
}
#[bench]
fn bench_aggregation_sub_tree(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();



@@ -9,25 +9,37 @@ use super::agg_req::MetricAggregation;
use super::agg_req_with_accessor::{
AggregationsWithAccessor, BucketAggregationWithAccessor, MetricAggregationWithAccessor,
};
use super::bucket::SegmentRangeCollector;
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
use super::intermediate_agg_result::{IntermediateAggregationResults, IntermediateBucketResult};
use super::metric::{
AverageAggregation, SegmentAverageCollector, SegmentStatsCollector, StatsAggregation,
};
use super::{Key, VecWithNames};
use super::VecWithNames;
use crate::aggregation::agg_req::BucketAggregationType;
use crate::DocId;
pub(crate) const DOC_BLOCK_SIZE: usize = 256;
pub(crate) const DOC_BLOCK_SIZE: usize = 64;
pub(crate) type DocBlock = [DocId; DOC_BLOCK_SIZE];
#[derive(Clone, PartialEq)]
pub(crate) struct SegmentAggregationResultsCollector {
pub(crate) metrics: VecWithNames<SegmentMetricResultCollector>,
pub(crate) buckets: VecWithNames<SegmentBucketResultCollector>,
pub(crate) metrics: Option<VecWithNames<SegmentMetricResultCollector>>,
pub(crate) buckets: Option<VecWithNames<SegmentBucketResultCollector>>,
staged_docs: DocBlock,
num_staged_docs: usize,
}
impl Default for SegmentAggregationResultsCollector {
fn default() -> Self {
Self {
metrics: Default::default(),
buckets: Default::default(),
staged_docs: [0; DOC_BLOCK_SIZE],
num_staged_docs: Default::default(),
}
}
}
impl Debug for SegmentAggregationResultsCollector {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentAggregationResultsCollector")
@@ -40,6 +52,25 @@ impl Debug for SegmentAggregationResultsCollector {
}
impl SegmentAggregationResultsCollector {
pub fn into_intermediate_aggregations_result(
self,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateAggregationResults> {
let buckets = if let Some(buckets) = self.buckets {
let entries = buckets
.into_iter()
.zip(agg_with_accessor.buckets.values())
.map(|((key, bucket), acc)| Ok((key, bucket.into_intermediate_bucket_result(acc)?)))
.collect::<crate::Result<Vec<(String, _)>>>()?;
Some(VecWithNames::from_entries(entries))
} else {
None
};
let metrics = self.metrics.map(VecWithNames::from_other);
Ok(IntermediateAggregationResults { metrics, buckets })
}
pub(crate) fn from_req_and_validate(req: &AggregationsWithAccessor) -> crate::Result<Self> {
let buckets = req
.buckets
@@ -50,7 +81,7 @@ impl SegmentAggregationResultsCollector {
SegmentBucketResultCollector::from_req_and_validate(req)?,
))
})
.collect::<crate::Result<_>>()?;
.collect::<crate::Result<Vec<(String, _)>>>()?;
let metrics = req
.metrics
.entries()
@@ -60,10 +91,20 @@ impl SegmentAggregationResultsCollector {
SegmentMetricResultCollector::from_req_and_validate(req)?,
))
})
.collect::<crate::Result<_>>()?;
.collect::<crate::Result<Vec<(String, _)>>>()?;
let metrics = if metrics.is_empty() {
None
} else {
Some(VecWithNames::from_entries(metrics))
};
let buckets = if buckets.is_empty() {
None
} else {
Some(VecWithNames::from_entries(buckets))
};
Ok(SegmentAggregationResultsCollector {
metrics: VecWithNames::from_entries(metrics),
buckets: VecWithNames::from_entries(buckets),
metrics,
buckets,
staged_docs: [0; DOC_BLOCK_SIZE],
num_staged_docs: 0,
})
@@ -82,29 +123,33 @@ impl SegmentAggregationResultsCollector {
}
}
#[inline(never)]
pub(crate) fn flush_staged_docs(
&mut self,
agg_with_accessor: &AggregationsWithAccessor,
force_flush: bool,
) {
for (agg_with_accessor, collector) in agg_with_accessor
.metrics
.values()
.zip(self.metrics.values_mut())
{
collector.collect_block(&self.staged_docs[..self.num_staged_docs], agg_with_accessor);
if self.num_staged_docs == 0 {
return;
}
for (agg_with_accessor, collector) in agg_with_accessor
.buckets
.values()
.zip(self.buckets.values_mut())
{
collector.collect_block(
&self.staged_docs[..self.num_staged_docs],
agg_with_accessor,
force_flush,
);
if let Some(metrics) = &mut self.metrics {
for (collector, agg_with_accessor) in
metrics.values_mut().zip(agg_with_accessor.metrics.values())
{
collector
.collect_block(&self.staged_docs[..self.num_staged_docs], agg_with_accessor);
}
}
if let Some(buckets) = &mut self.buckets {
for (collector, agg_with_accessor) in
buckets.values_mut().zip(agg_with_accessor.buckets.values())
{
collector.collect_block(
&self.staged_docs[..self.num_staged_docs],
agg_with_accessor,
force_flush,
);
}
}
self.num_staged_docs = 0;
@@ -151,11 +196,40 @@ impl SegmentMetricResultCollector {
#[derive(Clone, Debug, PartialEq)]
pub(crate) enum SegmentBucketResultCollector {
Range(SegmentRangeCollector),
Histogram(Box<SegmentHistogramCollector>),
Terms(Box<SegmentTermCollector>),
}
impl SegmentBucketResultCollector {
pub fn into_intermediate_bucket_result(
self,
agg_with_accessor: &BucketAggregationWithAccessor,
) -> crate::Result<IntermediateBucketResult> {
match self {
SegmentBucketResultCollector::Terms(terms) => {
terms.into_intermediate_bucket_result(agg_with_accessor)
}
SegmentBucketResultCollector::Range(range) => {
range.into_intermediate_bucket_result(agg_with_accessor)
}
SegmentBucketResultCollector::Histogram(histogram) => {
histogram.into_intermediate_bucket_result(agg_with_accessor)
}
}
}
pub fn from_req_and_validate(req: &BucketAggregationWithAccessor) -> crate::Result<Self> {
match &req.bucket_agg {
BucketAggregationType::Terms(terms_req) => Ok(Self::Terms(Box::new(
SegmentTermCollector::from_req_and_validate(
terms_req,
&req.sub_aggregation,
req.field_type,
req.accessor
.as_multi()
.expect("unexpected fast field cardinality"),
)?,
))),
BucketAggregationType::Range(range_req) => {
Ok(Self::Range(SegmentRangeCollector::from_req_and_validate(
range_req,
@@ -163,6 +237,16 @@ impl SegmentBucketResultCollector {
req.field_type,
)?))
}
BucketAggregationType::Histogram(histogram) => Ok(Self::Histogram(Box::new(
SegmentHistogramCollector::from_req_and_validate(
histogram,
&req.sub_aggregation,
req.field_type,
req.accessor
.as_single()
.expect("unexpected fast field cardinality"),
)?,
))),
}
}
@@ -177,28 +261,12 @@ impl SegmentBucketResultCollector {
SegmentBucketResultCollector::Range(range) => {
range.collect_block(doc, bucket_with_accessor, force_flush);
}
SegmentBucketResultCollector::Histogram(histogram) => {
histogram.collect_block(doc, bucket_with_accessor, force_flush)
}
SegmentBucketResultCollector::Terms(terms) => {
terms.collect_block(doc, bucket_with_accessor, force_flush)
}
}
}
}
#[derive(Clone, PartialEq)]
pub(crate) struct SegmentRangeBucketEntry {
pub key: Key,
pub doc_count: u64,
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
/// The from range of the bucket. Equals f64::MIN when None.
pub from: Option<f64>,
/// The to range of the bucket. Equals f64::MAX when None.
pub to: Option<f64>,
}
impl Debug for SegmentRangeBucketEntry {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentRangeBucketEntry")
.field("key", &self.key)
.field("doc_count", &self.doc_count)
.field("from", &self.from)
.field("to", &self.to)
.finish()
}
}


@@ -152,9 +152,9 @@ mod tests {
use query::AllQuery;
use super::{add_vecs, HistogramCollector, HistogramComputer};
use crate::chrono::{TimeZone, Utc};
use crate::schema::{Schema, FAST};
use crate::{doc, query, Index};
use crate::time::{Date, Month};
use crate::{doc, query, DateTime, Index};
#[test]
fn test_add_histograms_simple() {
@@ -273,16 +273,20 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)))?;
writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)))?;
writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(
doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1986, Month::March, 9)?.with_hms(0, 0, 0)?)),
)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let all_query = AllQuery;
let week_histogram_collector = HistogramCollector::new(
date_field,
Utc.ymd(1980, 1, 1).and_hms(0, 0, 0),
DateTime::from_primitive(
Date::from_calendar_date(1980, Month::January, 1)?.with_hms(0, 0, 0)?,
),
3600 * 24 * 365, // it is just for a unit test... sorry leap years.
10,
);
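
Not part of the diff: the hunk above reflects 0.18's switch from chrono to the re-exported time crate. Below is a minimal sketch of the DateTime constructors that appear throughout this changeset (from_primitive, from_utc, from_unix_timestamp), assuming they are public on tantivy 0.18's DateTime as the diff suggests.

use tantivy::time::format_description::well_known::Rfc3339;
use tantivy::time::{Date, Month, OffsetDateTime};
use tantivy::DateTime;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // From calendar components, as in the histogram test above.
    let d1 = DateTime::from_primitive(
        Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?,
    );
    // From an RFC 3339 string.
    let d2 = DateTime::from_utc(OffsetDateTime::parse("1986-03-09T00:00:00+00:00", &Rfc3339)?);
    // From a unix timestamp in seconds.
    let d3 = DateTime::from_unix_timestamp(0);
    assert_eq!(d3.into_unix_timestamp(), 0);
    println!("{:?} {:?}", d1, d2);
    Ok(())
}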


@@ -92,7 +92,7 @@ mod histogram_collector;
pub use histogram_collector::HistogramCollector;
mod multi_collector;
pub use self::multi_collector::MultiCollector;
pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
mod top_collector;


@@ -5,6 +5,7 @@ use super::{Collector, SegmentCollector};
use crate::collector::Fruit;
use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
/// MultiFruit keeps Fruits from every nested Collector
pub struct MultiFruit {
sub_fruits: Vec<Option<Box<dyn Fruit>>>,
}
@@ -79,12 +80,17 @@ impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
}
}
/// FruitHandle stores reference to the corresponding collector inside MultiCollector
pub struct FruitHandle<TFruit: Fruit> {
pos: usize,
_phantom: PhantomData<TFruit>,
}
impl<TFruit: Fruit> FruitHandle<TFruit> {
/// Extract a typed fruit off a multifruit.
///
/// This function involves downcasting and can panic if the multifruit was
/// created using faulty code.
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
*boxed_fruit
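
Since the new rustdoc warns that FruitHandle::extract downcasts and can panic when misused, here is a minimal sketch (not part of the diff) of the intended flow: register collectors, search once, then pull each typed fruit back out of the MultiFruit with the handle that registered it.

use tantivy::collector::{Count, MultiCollector, TopDocs};
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
    writer.add_document(doc!(title => "of mice and men"))?;
    writer.commit()?;
    let searcher = index.reader()?.searcher();

    // Register collectors; each handle remembers its slot in the MultiFruit.
    let mut collectors = MultiCollector::new();
    let count_handle = collectors.add_collector(Count);
    let top_docs_handle = collectors.add_collector(TopDocs::with_limit(3));

    let mut multi_fruit = searcher.search(&AllQuery, &collectors)?;
    // `extract` downcasts the stored fruit back to its concrete type.
    let count = count_handle.extract(&mut multi_fruit);
    let top_docs = top_docs_handle.extract(&mut multi_fruit);
    assert_eq!(count, 1);
    assert_eq!(top_docs.len(), 1);
    Ok(())
}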


@@ -1,11 +1,11 @@
use std::str::FromStr;
use super::*;
use crate::collector::{Count, FilterCollector, TopDocs};
use crate::core::SegmentReader;
use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader, FastFieldReader};
use crate::query::{AllQuery, QueryParser};
use crate::schema::{Field, Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{doc, DateTime, DocAddress, DocId, Document, Index, Score, Searcher, SegmentOrdinal};
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
@@ -26,11 +26,11 @@ pub fn test_filter_collector() -> crate::Result<()> {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_utc(OffsetDateTime::parse("1898-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2020-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-20T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_utc(OffsetDateTime::parse("2018-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.commit()?;
let reader = index.reader()?;
@@ -55,7 +55,9 @@ pub fn test_filter_collector() -> crate::Result<()> {
assert_eq!(filtered_top_docs.len(), 0);
fn date_filter(value: DateTime) -> bool {
(value - DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()).num_weeks() > 0
(value.into_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
.whole_weeks()
> 0
}
let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));


@@ -714,7 +714,9 @@ mod tests {
use crate::collector::Collector;
use crate::query::{AllQuery, Query, QueryParser};
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::{DocAddress, DocId, Index, IndexWriter, Score, SegmentReader};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Score, SegmentReader};
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -890,28 +892,32 @@ mod tests {
#[test]
fn test_top_field_collector_datetime() -> crate::Result<()> {
use std::str::FromStr;
let mut schema_builder = Schema::builder();
let name = schema_builder.add_text_field("name", TEXT);
let birthday = schema_builder.add_date_field("birthday", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let pr_birthday = crate::DateTime::from_str("1898-04-09T00:00:00+00:00")?;
let pr_birthday = DateTime::from_utc(OffsetDateTime::parse(
"1898-04-09T00:00:00+00:00",
&Rfc3339,
)?);
index_writer.add_document(doc!(
name => "Paul Robeson",
birthday => pr_birthday
birthday => pr_birthday,
))?;
let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?;
let mr_birthday = DateTime::from_utc(OffsetDateTime::parse(
"1947-11-08T00:00:00+00:00",
&Rfc3339,
)?);
index_writer.add_document(doc!(
name => "Minnie Riperton",
birthday => mr_birthday
birthday => mr_birthday,
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
let top_docs: Vec<(crate::DateTime, DocAddress)> =
searcher.search(&AllQuery, &top_collector)?;
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
&[


@@ -1,6 +1,7 @@
use crossbeam::channel;
use rayon::{ThreadPool, ThreadPoolBuilder};
use crate::TantivyError;
/// Search executor, which runs search requests either single-threaded or on a thread pool.
///
/// We don't expose Rayon thread pool directly here for several reasons.
@@ -47,16 +48,19 @@ impl Executor {
match self {
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
Executor::ThreadPool(pool) => {
let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
let num_fruits = args_with_indices.len();
let args: Vec<A> = args.collect();
let num_fruits = args.len();
let fruit_receiver = {
let (fruit_sender, fruit_receiver) = channel::unbounded();
let (fruit_sender, fruit_receiver) = crossbeam_channel::unbounded();
pool.scope(|scope| {
for arg_with_idx in args_with_indices {
scope.spawn(|_| {
let (idx, arg) = arg_with_idx;
let fruit = f(arg);
if let Err(err) = fruit_sender.send((idx, fruit)) {
for (idx, arg) in args.into_iter().enumerate() {
// We create the references f_ref and fruit_sender_ref so that f and
// fruit_sender themselves are not moved into the closure.
let f_ref = &f;
let fruit_sender_ref = &fruit_sender;
scope.spawn(move |_| {
let fruit = f_ref(arg);
if let Err(err) = fruit_sender_ref.send((idx, fruit)) {
error!(
"Failed to send search task. It probably means all search \
threads have panicked. {:?}",
@@ -71,18 +75,19 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
// This is lame, but safe.
let mut results_with_position = Vec::with_capacity(num_fruits);
let mut result_placeholders: Vec<Option<R>> =
std::iter::repeat_with(|| None).take(num_fruits).collect();
for (pos, fruit_res) in fruit_receiver {
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
result_placeholders[pos] = Some(fruit);
}
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
let results: Vec<R> = result_placeholders.into_iter().flatten().collect();
if results.len() != num_fruits {
return Err(TantivyError::InternalError(
"One of the mapped execution failed.".to_string(),
));
}
Ok(results)
}
}
}
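
Not part of the diff: a standalone sketch of the pattern the new Executor code adopts above, where results arriving out of order are slotted into a pre-sized Vec<Option<_>> so the output preserves the original argument order without sorting, and an empty slot signals that one task failed.

// Slots (idx, value) pairs into position; returns None if any slot stayed empty.
fn collect_in_order<R>(num: usize, fruits: impl Iterator<Item = (usize, R)>) -> Option<Vec<R>> {
    let mut slots: Vec<Option<R>> = std::iter::repeat_with(|| None).take(num).collect();
    for (idx, fruit) in fruits {
        slots[idx] = Some(fruit);
    }
    let out: Vec<R> = slots.into_iter().flatten().collect();
    if out.len() == num { Some(out) } else { None }
}

fn main() {
    let fruits = vec![(2, "c"), (0, "a"), (1, "b")].into_iter();
    assert_eq!(collect_in_order(3, fruits), Some(vec!["a", "b", "c"]));
}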


@@ -74,6 +74,7 @@ fn load_metas(
pub struct IndexBuilder {
schema: Option<Schema>,
index_settings: IndexSettings,
tokenizer_manager: TokenizerManager,
}
impl Default for IndexBuilder {
fn default() -> Self {
@@ -86,6 +87,7 @@ impl IndexBuilder {
Self {
schema: None,
index_settings: IndexSettings::default(),
tokenizer_manager: TokenizerManager::default(),
}
}
@@ -103,6 +105,12 @@ impl IndexBuilder {
self
}
/// Set the tokenizers.
pub fn tokenizers(mut self, tokenizers: TokenizerManager) -> Self {
self.tokenizer_manager = tokenizers;
self
}
/// Creates a new index using the `RAMDirectory`.
///
/// The index will be allocated in anonymous memory.
@@ -154,7 +162,8 @@ impl IndexBuilder {
if !Index::exists(&*dir)? {
return self.create(dir);
}
let index = Index::open(dir)?;
let mut index = Index::open(dir)?;
index.set_tokenizers(self.tokenizer_manager.clone());
if index.schema() == self.get_expect_schema()? {
Ok(index)
} else {
@@ -176,7 +185,8 @@ impl IndexBuilder {
)?;
let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
metas.index_settings = self.index_settings;
let index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
index.set_tokenizers(self.tokenizer_manager);
Ok(index)
}
}
@@ -304,6 +314,11 @@ impl Index {
}
}
/// Setter for the tokenizer manager.
pub fn set_tokenizers(&mut self, tokenizers: TokenizerManager) {
self.tokenizers = tokenizers;
}
/// Accessor for the tokenizer manager.
pub fn tokenizers(&self) -> &TokenizerManager {
&self.tokenizers
@@ -314,20 +329,31 @@ impl Index {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let tokenizer_manager: &TokenizerManager = self.tokenizers();
let tokenizer_name_opt: Option<TextAnalyzer> = match field_type {
FieldType::Str(text_options) => text_options
.get_indexing_options()
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
_ => None,
let indexing_options_opt = match field_type {
FieldType::JsonObject(options) => options.get_text_indexing_options(),
FieldType::Str(options) => options.get_indexing_options(),
_ => {
return Err(TantivyError::SchemaError(format!(
"{:?} is not a text field.",
field_entry.name()
)))
}
};
match tokenizer_name_opt {
Some(tokenizer) => Ok(tokenizer),
None => Err(TantivyError::SchemaError(format!(
"{:?} is not a text field.",
field_entry.name()
))),
}
let indexing_options = indexing_options_opt.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"No indexing options set for field {:?}",
field_entry
))
})?;
tokenizer_manager
.get(indexing_options.tokenizer())
.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"No Tokenizer found for field {:?}",
field_entry
))
})
}
/// Create a default `IndexReader` for the given index.
@@ -557,7 +583,8 @@ impl fmt::Debug for Index {
mod tests {
use crate::directory::{RamDirectory, WatchCallback};
use crate::schema::{Field, Schema, INDEXED, TEXT};
use crate::{Directory, Index, IndexReader, IndexSettings, ReloadPolicy};
use crate::tokenizer::TokenizerManager;
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
#[test]
fn test_indexer_for_field() {
@@ -573,6 +600,21 @@ mod tests {
);
}
#[test]
fn test_set_tokenizer_manager() {
let mut schema_builder = Schema::builder();
schema_builder.add_u64_field("num_likes", INDEXED);
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = IndexBuilder::new()
// set empty tokenizer manager
.tokenizers(TokenizerManager::new())
.schema(schema)
.create_in_ram()
.unwrap();
assert!(index.tokenizers().get("raw").is_none());
}
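
Not part of the diff: a sketch of how the new IndexBuilder::tokenizers hook might be used to install a custom TokenizerManager before the index is created. It assumes the usual tantivy::tokenizer building blocks (TextAnalyzer, SimpleTokenizer, LowerCaser); the analyzer name "lowercase_only" is made up for illustration.

use tantivy::schema::{Schema, TEXT};
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenizerManager};
use tantivy::IndexBuilder;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // Start from an empty manager and register only the analyzers we need.
    let tokenizers = TokenizerManager::new();
    tokenizers.register(
        "lowercase_only",
        TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser),
    );

    let index = IndexBuilder::new()
        .tokenizers(tokenizers)
        .schema(schema)
        .create_in_ram()?;
    assert!(index.tokenizers().get("lowercase_only").is_some());
    Ok(())
}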
#[test]
fn test_index_exists() {
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
@@ -702,7 +744,7 @@ mod tests {
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64))?;
let (sender, receiver) = crossbeam::channel::unbounded();
let (sender, receiver) = crossbeam_channel::unbounded();
let _handle = index.directory_mut().watch(WatchCallback::new(move || {
let _ = sender.send(());
}));
@@ -737,7 +779,7 @@ mod tests {
reader: &IndexReader,
) -> crate::Result<()> {
let mut reader_index = reader.index();
let (sender, receiver) = crossbeam::channel::unbounded();
let (sender, receiver) = crossbeam_channel::unbounded();
let _watch_handle = reader_index
.directory_mut()
.watch(WatchCallback::new(move || {
@@ -781,24 +823,24 @@ mod tests {
for i in 0u64..8_000u64 {
writer.add_document(doc!(field => i))?;
}
let (sender, receiver) = crossbeam::channel::unbounded();
let _handle = directory.watch(WatchCallback::new(move || {
let _ = sender.send(());
}));
writer.commit()?;
let mem_right_after_commit = directory.total_mem_usage();
assert!(receiver.recv().is_ok());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 8_000);
assert_eq!(reader.searcher().segment_readers().len(), 8);
writer.wait_merging_threads()?;
let mem_right_after_merge_finished = directory.total_mem_usage();
reader.reload().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 8_000);
assert!(
mem_right_after_merge_finished < mem_right_after_commit,


@@ -239,7 +239,7 @@ impl InnerSegmentMeta {
///
/// Contains settings which are applied on the whole
/// index, like presorting documents.
#[derive(Clone, Default, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct IndexSettings {
/// Sorts the documents by information
/// provided in `IndexSortByField`
@@ -248,13 +248,32 @@ pub struct IndexSettings {
/// The `Compressor` used to compress the doc store.
#[serde(default)]
pub docstore_compression: Compressor,
#[serde(default = "default_docstore_blocksize")]
/// The size of each block that will be compressed and written to disk
pub docstore_blocksize: usize,
}
/// Must be a function to be compatible with serde defaults
fn default_docstore_blocksize() -> usize {
16_384
}
impl Default for IndexSettings {
fn default() -> Self {
Self {
sort_by_field: None,
docstore_compression: Compressor::default(),
docstore_blocksize: default_docstore_blocksize(),
}
}
}
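
Not part of the diff: a sketch of opting into the new docstore_blocksize knob (alongside a compressor) when building an index. It assumes Compressor is reachable at tantivy::store::Compressor and that IndexBuilder exposes a settings() setter, as it does for sort_by_field; the 32_768 value is only an illustration, and zstd additionally requires the zstd-compression feature.

use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::store::Compressor;
use tantivy::{Index, IndexSettings};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();

    // Larger doc store blocks can improve compression ratios on large documents,
    // at the cost of more decompression work per stored-document access.
    let settings = IndexSettings {
        docstore_compression: Compressor::Lz4,
        docstore_blocksize: 32_768, // default is 16_384
        ..Default::default()
    };
    let _index = Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;
    Ok(())
}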
/// Settings to presort the documents in an index
///
/// Presorting documents can greatly improve performance
/// in some scenarios, by applying top-n
/// optimizations.
#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct IndexSortByField {
/// The field to sort the documents by
pub field: String,
@@ -262,7 +281,7 @@ pub struct IndexSortByField {
pub order: Order,
}
/// The order to sort by
#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum Order {
/// Ascending Order
Asc,
@@ -298,12 +317,12 @@ pub struct IndexMeta {
pub schema: Schema,
/// Opstamp associated to the last `commit` operation.
pub opstamp: Opstamp,
#[serde(skip_serializing_if = "Option::is_none")]
/// Payload associated to the last commit.
///
/// Upon commit, clients can optionally add a small `String` payload to their commit
/// to help identify this commit.
/// This payload is entirely unused by tantivy.
#[serde(skip_serializing_if = "Option::is_none")]
pub payload: Option<String>,
}
@@ -374,6 +393,7 @@ impl fmt::Debug for IndexMeta {
mod tests {
use super::IndexMeta;
use crate::core::index_meta::UntrackedIndexMeta;
use crate::schema::{Schema, TEXT};
use crate::{IndexSettings, IndexSortByField, Order};
@@ -400,7 +420,12 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
);
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
assert_eq!(index_metas.index_settings, deser_meta.index_settings);
assert_eq!(index_metas.schema, deser_meta.schema);
assert_eq!(index_metas.opstamp, deser_meta.opstamp);
}
}


@@ -35,7 +35,7 @@ const ZERO_ARRAY: [u8; 8] = [0u8; 8];
#[cfg(test)]
fn create_uuid() -> Uuid {
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY).unwrap()
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY)
}
#[cfg(not(test))]
@@ -57,7 +57,7 @@ impl SegmentId {
/// Picking the first 8 chars is ok to identify
/// segments in a display message (e.g. a5c4dfcb).
pub fn short_uuid_string(&self) -> String {
(&self.0.to_simple_ref().to_string()[..8]).to_string()
(&self.0.as_simple().to_string()[..8]).to_string()
}
/// Returns a segment uuid string.
@@ -65,7 +65,7 @@ impl SegmentId {
/// It consists in 32 lowercase hexadecimal chars
/// (e.g. a5c4dfcbdfe645089129e308e26d5523)
pub fn uuid_string(&self) -> String {
self.0.to_simple_ref().to_string()
self.0.as_simple().to_string()
}
/// Build a `SegmentId` string from the full uuid string.


@@ -169,7 +169,7 @@ impl SegmentReader {
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_field_readers =
let fast_fields_readers =
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
@@ -196,7 +196,7 @@ impl SegmentReader {
max_doc,
termdict_composite,
postings_composite,
fast_fields_readers: fast_field_readers,
fast_fields_readers,
fieldnorm_readers,
segment_id: segment.id(),
delete_opstamp: segment.meta().delete_opstamp(),


@@ -53,7 +53,9 @@ impl FileWatcher {
if metafile_has_changed {
info!("Meta file {:?} was modified", path);
current_checksum_opt = Some(checksum);
futures::executor::block_on(callbacks.broadcast());
// We actually ignore callbacks failing here.
// We just wait for the end of their execution.
let _ = callbacks.broadcast().wait();
}
}
@@ -108,7 +110,7 @@ mod tests {
let tmp_file = tmp_dir.path().join("watched.txt");
let counter: Arc<AtomicUsize> = Default::default();
let (tx, rx) = crossbeam::channel::unbounded();
let (tx, rx) = crossbeam_channel::unbounded();
let timeout = Duration::from_millis(100);
let watcher = FileWatcher::new(&tmp_file);
@@ -151,7 +153,7 @@ mod tests {
let tmp_file = tmp_dir.path().join("watched.txt");
let counter: Arc<AtomicUsize> = Default::default();
let (tx, rx) = crossbeam::channel::unbounded();
let (tx, rx) = crossbeam_channel::unbounded();
let timeout = Duration::from_millis(100);
let watcher = FileWatcher::new(&tmp_file);


@@ -6,9 +6,6 @@ use std::sync::atomic::{AtomicBool, AtomicUsize};
use std::sync::Arc;
use std::time::Duration;
use futures::channel::oneshot;
use futures::executor::block_on;
use super::*;
#[cfg(feature = "mmap")]
@@ -184,7 +181,7 @@ fn test_directory_delete(directory: &dyn Directory) -> crate::Result<()> {
fn test_watch(directory: &dyn Directory) {
let counter: Arc<AtomicUsize> = Default::default();
let (tx, rx) = crossbeam::channel::unbounded();
let (tx, rx) = crossbeam_channel::unbounded();
let timeout = Duration::from_millis(500);
let handle = directory
@@ -249,8 +246,8 @@ fn test_lock_blocking(directory: &dyn Directory) {
std::thread::spawn(move || {
//< lock_a_res is sent to the thread.
in_thread_clone.store(true, SeqCst);
let _just_sync = block_on(receiver);
// explicitely droping lock_a_res. It would have been sufficient to just force it
let _just_sync = receiver.recv();
// explicitly dropping lock_a_res. It would have been sufficient to just force it
// to be part of the move, but the intent seems clearer that way.
drop(lock_a_res);
});
@@ -273,7 +270,7 @@ fn test_lock_blocking(directory: &dyn Directory) {
assert!(in_thread.load(SeqCst));
assert!(lock_a_res.is_ok());
});
assert!(block_on(receiver2).is_ok());
assert!(receiver2.recv().is_ok());
assert!(sender.send(()).is_ok());
assert!(join_handle.join().is_ok());
}


@@ -1,7 +1,6 @@
use std::sync::{Arc, RwLock, Weak};
use futures::channel::oneshot;
use futures::{Future, TryFutureExt};
use crate::FutureResult;
/// Cloneable wrapper for callbacks registered when watching files of a `Directory`.
#[derive(Clone)]
@@ -74,12 +73,11 @@ impl WatchCallbackList {
}
/// Triggers all callbacks
pub fn broadcast(&self) -> impl Future<Output = ()> {
pub fn broadcast(&self) -> FutureResult<()> {
let callbacks = self.list_callback();
let (sender, receiver) = oneshot::channel();
let result = receiver.unwrap_or_else(|_| ());
let (result, sender) = FutureResult::create("One of the callback panicked.");
if callbacks.is_empty() {
let _ = sender.send(());
let _ = sender.send(Ok(()));
return result;
}
let spawn_res = std::thread::Builder::new()
@@ -88,7 +86,7 @@ impl WatchCallbackList {
for callback in callbacks {
callback.call();
}
let _ = sender.send(());
let _ = sender.send(Ok(()));
});
if let Err(err) = spawn_res {
error!(
@@ -106,8 +104,6 @@ mod tests {
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use futures::executor::block_on;
use crate::directory::{WatchCallback, WatchCallbackList};
#[test]
@@ -118,22 +114,18 @@ mod tests {
let inc_callback = WatchCallback::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
});
block_on(watch_event_router.broadcast());
watch_event_router.broadcast().wait().unwrap();
assert_eq!(0, counter.load(Ordering::SeqCst));
let handle_a = watch_event_router.subscribe(inc_callback);
assert_eq!(0, counter.load(Ordering::SeqCst));
block_on(watch_event_router.broadcast());
watch_event_router.broadcast().wait().unwrap();
assert_eq!(1, counter.load(Ordering::SeqCst));
block_on(async {
(
watch_event_router.broadcast().await,
watch_event_router.broadcast().await,
watch_event_router.broadcast().await,
)
});
watch_event_router.broadcast().wait().unwrap();
watch_event_router.broadcast().wait().unwrap();
watch_event_router.broadcast().wait().unwrap();
assert_eq!(4, counter.load(Ordering::SeqCst));
mem::drop(handle_a);
block_on(watch_event_router.broadcast());
watch_event_router.broadcast().wait().unwrap();
assert_eq!(4, counter.load(Ordering::SeqCst));
}
@@ -150,19 +142,15 @@ mod tests {
let handle_a = watch_event_router.subscribe(inc_callback(1));
let handle_a2 = watch_event_router.subscribe(inc_callback(10));
assert_eq!(0, counter.load(Ordering::SeqCst));
block_on(async {
futures::join!(
watch_event_router.broadcast(),
watch_event_router.broadcast()
)
});
watch_event_router.broadcast().wait().unwrap();
watch_event_router.broadcast().wait().unwrap();
assert_eq!(22, counter.load(Ordering::SeqCst));
mem::drop(handle_a);
block_on(watch_event_router.broadcast());
watch_event_router.broadcast().wait().unwrap();
assert_eq!(32, counter.load(Ordering::SeqCst));
mem::drop(handle_a2);
block_on(watch_event_router.broadcast());
block_on(watch_event_router.broadcast());
watch_event_router.broadcast().wait().unwrap();
watch_event_router.broadcast().wait().unwrap();
assert_eq!(32, counter.load(Ordering::SeqCst));
}
@@ -176,15 +164,12 @@ mod tests {
});
let handle_a = watch_event_router.subscribe(inc_callback);
assert_eq!(0, counter.load(Ordering::SeqCst));
block_on(async {
let future1 = watch_event_router.broadcast();
let future2 = watch_event_router.broadcast();
futures::join!(future1, future2)
});
watch_event_router.broadcast().wait().unwrap();
watch_event_router.broadcast().wait().unwrap();
assert_eq!(2, counter.load(Ordering::SeqCst));
mem::drop(handle_a);
let _ = watch_event_router.broadcast();
block_on(watch_event_router.broadcast());
watch_event_router.broadcast().wait().unwrap();
assert_eq!(2, counter.load(Ordering::SeqCst));
}
}


@@ -97,6 +97,10 @@ pub enum TantivyError {
/// Index incompatible with current version of Tantivy.
#[error("{0:?}")]
IncompatibleIndex(Incompatibility),
/// An internal error occurred. These are internal states that should not be reached.
/// e.g. a datastructure is incorrectly initialized.
#[error("Internal error: '{0}'")]
InternalError(String),
}
#[cfg(feature = "quickwit")]
@@ -149,9 +153,21 @@ impl<Guard> From<PoisonError<Guard>> for TantivyError {
}
}
impl From<chrono::ParseError> for TantivyError {
fn from(err: chrono::ParseError) -> TantivyError {
TantivyError::InvalidArgument(err.to_string())
impl From<time::error::Format> for TantivyError {
fn from(err: time::error::Format) -> TantivyError {
TantivyError::InvalidArgument(format!("Date formatting error: {err}"))
}
}
impl From<time::error::Parse> for TantivyError {
fn from(err: time::error::Parse) -> TantivyError {
TantivyError::InvalidArgument(format!("Date parsing error: {err}"))
}
}
impl From<time::error::ComponentRange> for TantivyError {
fn from(err: time::error::ComponentRange) -> TantivyError {
TantivyError::InvalidArgument(format!("Date range error: {err}"))
}
}


@@ -188,14 +188,14 @@ mod bench {
}
#[bench]
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
fn bench_alive_bitset_iter_deser_on_fly(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_deletebitset_access(bench: &mut Bencher) {
fn bench_alive_bitset_access(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| {
@@ -206,14 +206,14 @@ mod bench {
}
#[bench]
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
fn bench_alive_bitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
fn bench_alive_bitset_access_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| {


@@ -30,9 +30,8 @@ pub use self::readers::FastFieldReaders;
pub(crate) use self::readers::{type_and_cardinality, FastType};
pub use self::serializer::{CompositeFastFieldSerializer, FastFieldDataAccess, FastFieldStats};
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::chrono::{NaiveDateTime, Utc};
use crate::schema::{Cardinality, FieldType, Type, Value};
use crate::DocId;
use crate::{DateTime, DocId};
mod alive_bitset;
mod bytes;
@@ -161,14 +160,14 @@ impl FastValue for f64 {
}
}
impl FastValue for crate::DateTime {
impl FastValue for DateTime {
fn from_u64(timestamp_u64: u64) -> Self {
let timestamp_i64 = i64::from_u64(timestamp_u64);
crate::DateTime::from_utc(NaiveDateTime::from_timestamp(timestamp_i64, 0), Utc)
let unix_timestamp = i64::from_u64(timestamp_u64);
Self::from_unix_timestamp(unix_timestamp)
}
fn to_u64(&self) -> u64 {
self.timestamp().to_u64()
self.into_unix_timestamp().to_u64()
}
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
@@ -179,7 +178,7 @@ impl FastValue for crate::DateTime {
}
fn as_u64(&self) -> u64 {
self.timestamp().as_u64()
self.into_unix_timestamp().as_u64()
}
fn to_type() -> Type {
@@ -188,12 +187,32 @@ impl FastValue for crate::DateTime {
}
fn value_to_u64(value: &Value) -> u64 {
match *value {
Value::U64(ref val) => *val,
Value::I64(ref val) => common::i64_to_u64(*val),
Value::F64(ref val) => common::f64_to_u64(*val),
Value::Date(ref datetime) => common::i64_to_u64(datetime.timestamp()),
_ => panic!("Expected a u64/i64/f64 field, got {:?} ", value),
match value {
Value::U64(val) => val.to_u64(),
Value::I64(val) => val.to_u64(),
Value::F64(val) => val.to_u64(),
Value::Date(val) => val.to_u64(),
_ => panic!("Expected a u64/i64/f64/date field, got {:?} ", value),
}
}
/// The fast field type
pub enum FastFieldType {
/// Numeric type, e.g. f64.
Numeric,
/// Fast field stores string ids.
String,
/// Fast field stores string ids for facets.
Facet,
}
impl FastFieldType {
fn is_storing_term_ids(&self) -> bool {
matches!(self, FastFieldType::String | FastFieldType::Facet)
}
fn is_facet(&self) -> bool {
matches!(self, FastFieldType::Facet)
}
}
@@ -201,6 +220,7 @@ fn value_to_u64(value: &Value) -> u64 {
mod tests {
use std::collections::HashMap;
use std::ops::Range;
use std::path::Path;
use common::HasLen;
@@ -212,7 +232,8 @@ mod tests {
use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{Document, Field, NumericOptions, Schema, FAST};
use crate::schema::{Document, Field, NumericOptions, Schema, FAST, STRING, TEXT};
use crate::time::OffsetDateTime;
use crate::{Index, SegmentId, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
@@ -233,7 +254,7 @@ mod tests {
#[test]
pub fn test_fastfield_i64_u64() {
let datetime = crate::DateTime::from_utc(NaiveDateTime::from_timestamp(0i64, 0), Utc);
let datetime = DateTime::from_utc(OffsetDateTime::UNIX_EPOCH);
assert_eq!(i64::from_u64(datetime.to_u64()), 0i64);
}
@@ -489,7 +510,8 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()))?;
index_writer
.add_document(doc!(date_field =>DateTime::from_utc(OffsetDateTime::now_utc())))?;
index_writer.commit()?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
@@ -501,8 +523,7 @@ mod tests {
.map(SegmentReader::segment_id)
.collect();
assert_eq!(segment_ids.len(), 2);
let merge_future = index_writer.merge(&segment_ids[..]);
futures::executor::block_on(merge_future)?;
index_writer.merge(&segment_ids[..]).wait().unwrap();
reader.reload()?;
assert_eq!(reader.searcher().segment_readers().len(), 1);
Ok(())
@@ -510,7 +531,206 @@ mod tests {
#[test]
fn test_default_datetime() {
assert_eq!(crate::DateTime::make_zero().timestamp(), 0i64);
assert_eq!(0, DateTime::make_zero().into_unix_timestamp());
}
fn get_vals_for_docs(ff: &MultiValuedFastFieldReader<u64>, docs: Range<u32>) -> Vec<u64> {
let mut all = vec![];
for doc in docs {
let mut out = vec![];
ff.get_vals(doc, &mut out);
all.extend(out);
}
all
}
#[test]
fn test_text_fastfield() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// first segment
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
text_field => "BBBBB AAAAA", // term_ord 1,2
))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "AAAAA BBBBB", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
))?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(
get_vals_for_docs(&text_fast_field, 0..5),
vec![1, 0, 0, 0, 1, 2]
);
let mut out = vec![];
text_fast_field.get_vals(3, &mut out);
assert_eq!(out, vec![0, 1]);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 3);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
// default tokenizer applies lower case
assert_eq!(bytes, "aaaaa".as_bytes());
}
{
// second segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "CCCCC AAAAA", // term_ord 1, after merge 2
))?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
let segment_reader = searcher.segment_reader(1);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(get_vals_for_docs(&text_fast_field, 0..3), vec![0, 1, 0]);
}
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(
get_vals_for_docs(&text_fast_field, 0..8),
vec![1, 0, 0, 0, 1, 3 /* next segment */, 0, 2, 0]
);
Ok(())
}
#[test]
fn test_string_fastfield() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", STRING | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// first segment
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
text_field => "BBBBB", // term_ord 1
))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
))?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(get_vals_for_docs(&text_fast_field, 0..6), vec![1, 0, 0, 2]);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 3);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(bytes, "AAAAA".as_bytes());
}
{
// second segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "CCCCC", // term_ord 1, after merge 2
))?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
let segment_reader = searcher.segment_reader(1);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(get_vals_for_docs(&text_fast_field, 0..2), vec![0, 1]);
}
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(
get_vals_for_docs(&text_fast_field, 0..9),
vec![1, 0, 0, 3 /* next segment */, 0, 2]
);
Ok(())
}
#[test]
@@ -527,16 +747,16 @@ mod tests {
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(1i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(2i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(3i64.to_u64())
date_field => DateTime::from_u64(1i64.to_u64()),
multi_date_field => DateTime::from_u64(2i64.to_u64()),
multi_date_field => DateTime::from_u64(3i64.to_u64())
))?;
index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(4i64.to_u64())
date_field => DateTime::from_u64(4i64.to_u64())
))?;
index_writer.add_document(doc!(
multi_date_field => crate::DateTime::from_u64(5i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(6i64.to_u64())
multi_date_field => DateTime::from_u64(5i64.to_u64()),
multi_date_field => DateTime::from_u64(6i64.to_u64())
))?;
index_writer.commit()?;
let reader = index.reader()?;
@@ -548,23 +768,23 @@ mod tests {
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
let mut dates = vec![];
{
assert_eq!(date_fast_field.get(0u32).timestamp(), 1i64);
assert_eq!(date_fast_field.get(0u32).into_unix_timestamp(), 1i64);
dates_fast_field.get_vals(0u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].timestamp(), 2i64);
assert_eq!(dates[1].timestamp(), 3i64);
assert_eq!(dates[0].into_unix_timestamp(), 2i64);
assert_eq!(dates[1].into_unix_timestamp(), 3i64);
}
{
assert_eq!(date_fast_field.get(1u32).timestamp(), 4i64);
assert_eq!(date_fast_field.get(1u32).into_unix_timestamp(), 4i64);
dates_fast_field.get_vals(1u32, &mut dates);
assert!(dates.is_empty());
}
{
assert_eq!(date_fast_field.get(2u32).timestamp(), 0i64);
assert_eq!(date_fast_field.get(2u32).into_unix_timestamp(), 0i64);
dates_fast_field.get_vals(2u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].timestamp(), 5i64);
assert_eq!(dates[1].timestamp(), 6i64);
assert_eq!(dates[0].into_unix_timestamp(), 5i64);
assert_eq!(dates[1].into_unix_timestamp(), 6i64);
}
Ok(())
}


@@ -6,9 +6,6 @@ pub use self::writer::MultiValuedFastFieldWriter;
#[cfg(test)]
mod tests {
use chrono::Duration;
use futures::executor::block_on;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use test_log::test;
@@ -17,7 +14,9 @@ mod tests {
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
use crate::{Document, Index, Term};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{Duration, OffsetDateTime};
use crate::{DateTime, Document, Index, Term};
#[test]
fn test_multivalued_u64() -> crate::Result<()> {
@@ -70,22 +69,27 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let first_time_stamp = chrono::Utc::now();
index_writer.add_document(
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
)?;
index_writer.add_document(doc!(time_i=>0i64))?;
let first_time_stamp = OffsetDateTime::now_utc();
index_writer.add_document(doc!(
date_field => DateTime::from_utc(first_time_stamp),
date_field => DateTime::from_utc(first_time_stamp),
time_i=>1i64))?;
index_writer.add_document(doc!(time_i => 0i64))?;
// add one second
index_writer.add_document(
doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64),
)?;
index_writer.add_document(doc!(
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(1)),
time_i => 2i64))?;
// add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64))?;
index_writer.add_document(doc!(
date_field => DateTime::from_utc(two_secs_ahead),
date_field => DateTime::from_utc(two_secs_ahead),
date_field => DateTime::from_utc(two_secs_ahead),
time_i => 3i64))?;
// add three seconds
index_writer.add_document(
doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64),
)?;
index_writer.add_document(doc!(
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(3)),
time_i => 4i64))?;
index_writer.commit()?;
let reader = index.reader()?;
@@ -97,7 +101,7 @@ mod tests {
let parser = QueryParser::for_index(&index, vec![]);
let query = parser.parse_query(&format!(
"multi_date_field:\"{}\"",
first_time_stamp.to_rfc3339()
first_time_stamp.format(&Rfc3339)?,
))?;
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
assert_eq!(results.len(), 1);
@@ -108,9 +112,8 @@ mod tests {
.get_first(date_field)
.expect("cannot find value")
.as_date()
.unwrap()
.timestamp(),
first_time_stamp.timestamp()
.unwrap(),
DateTime::from_utc(first_time_stamp),
);
assert_eq!(
retrieved_doc
@@ -124,7 +127,7 @@ mod tests {
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))?;
let query = parser.parse_query(&format!("\"{}\"", two_secs_ahead.format(&Rfc3339)?))?;
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
assert_eq!(results.len(), 1);
@@ -136,9 +139,8 @@ mod tests {
.get_first(date_field)
.expect("cannot find value")
.as_date()
.unwrap()
.timestamp(),
two_secs_ahead.timestamp()
.unwrap(),
DateTime::from_utc(two_secs_ahead)
);
assert_eq!(
retrieved_doc
@@ -154,8 +156,8 @@ mod tests {
let parser = QueryParser::for_index(&index, vec![date_field]);
let range_q = format!(
"multi_date_field:[{} TO {}}}",
(first_time_stamp + Duration::seconds(1)).to_rfc3339(),
(first_time_stamp + Duration::seconds(3)).to_rfc3339()
(first_time_stamp + Duration::seconds(1)).format(&Rfc3339)?,
(first_time_stamp + Duration::seconds(3)).format(&Rfc3339)?
);
let query = parser.parse_query(&range_q)?;
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
@@ -178,9 +180,8 @@ mod tests {
.get_first(date_field)
.expect("cannot find value")
.as_date()
.expect("value not of Date type")
.timestamp(),
(first_time_stamp + Duration::seconds(offset_sec)).timestamp()
.expect("value not of Date type"),
DateTime::from_utc(first_time_stamp + Duration::seconds(offset_sec)),
);
assert_eq!(
retrieved_doc
@@ -268,7 +269,7 @@ mod tests {
IndexingOp::Merge => {
let segment_ids = index.searchable_segment_ids()?;
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
index_writer.segment_updater().wait_merging_thread()?;
}
}
@@ -283,7 +284,7 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
if !segment_ids.is_empty() {
block_on(index_writer.merge(&segment_ids)).unwrap();
index_writer.merge(&segment_ids).wait()?;
assert!(index_writer.wait_merging_threads().is_ok());
}
}

View File

@@ -27,22 +27,28 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
}
}
/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
/// Returns `[start, end)`, such that the values associated
/// to the given document are `start..end`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
start..stop
let end = self.idx_reader.get(doc + 1);
start..end
}
/// Returns the array of values associated to the given `doc`.
#[inline]
fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<Item>) {
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range(range.start, &mut vals[..]);
}
/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.range(doc);
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range(range.start, &mut vals[..]);
self.get_vals_for_range(range, vals);
}
/// Returns the minimum value for this fast field.
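As context for the reader above: a multi-valued fast field is stored as two flat fast fields, an index column with one offset per document (plus a trailing sentinel) and a values column. The sketch below is a standalone illustration of that `[start, end)` lookup, not tantivy API; the slice names `idx` and `vals` are hypothetical.
// Minimal sketch of the two-level layout used by multi-valued fast fields.
// `idx[doc]..idx[doc + 1]` is the range of positions in `vals` owned by `doc`.
fn get_vals(idx: &[u64], vals: &[u64], doc: usize) -> Vec<u64> {
    let start = idx[doc] as usize;
    let end = idx[doc + 1] as usize; // exclusive, i.e. `[start, end)`
    vals[start..end].to_vec()
}
fn main() {
    // doc 0 has values [2, 3], doc 1 has none, doc 2 has [5, 6].
    let idx = [0u64, 2, 2, 4];
    let vals = [2u64, 3, 5, 6];
    assert_eq!(get_vals(&idx, &vals, 0), vec![2, 3]);
    assert!(get_vals(&idx, &vals, 1).is_empty());
    assert_eq!(get_vals(&idx, &vals, 2), vec![5, 6]);
}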

View File

@@ -4,7 +4,7 @@ use fnv::FnvHashMap;
use tantivy_bitpacker::minmax;
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer};
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field};
@@ -38,17 +38,17 @@ pub struct MultiValuedFastFieldWriter {
field: Field,
vals: Vec<UnorderedTermId>,
doc_index: Vec<u64>,
is_facet: bool,
fast_field_type: FastFieldType,
}
impl MultiValuedFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub(crate) fn new(field: Field, is_facet: bool) -> Self {
/// Creates a new `MultiValuedFastFieldWriter`
pub(crate) fn new(field: Field, fast_field_type: FastFieldType) -> Self {
MultiValuedFastFieldWriter {
field,
vals: Vec::new(),
doc_index: Vec::new(),
is_facet,
fast_field_type,
}
}
@@ -77,12 +77,13 @@ impl MultiValuedFastFieldWriter {
/// all of the matching field values present in the document.
pub fn add_document(&mut self, doc: &Document) {
self.next_doc();
// facets are indexed in the `SegmentWriter` as we encode their unordered id.
if !self.is_facet {
for field_value in doc.field_values() {
if field_value.field == self.field {
self.add_val(value_to_u64(field_value.value()));
}
// facets/texts are indexed in the `SegmentWriter` as we encode their unordered id.
if self.fast_field_type.is_storing_term_ids() {
return;
}
for field_value in doc.field_values() {
if field_value.field == self.field {
self.add_val(value_to_u64(field_value.value()));
}
}
}
@@ -158,15 +159,15 @@ impl MultiValuedFastFieldWriter {
{
// writing the values themselves.
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
match mapping_opt {
Some(mapping) => {
value_serializer = serializer.new_u64_fast_field_with_idx(
self.field,
0u64,
mapping.len() as u64,
1,
)?;
if let Some(mapping) = mapping_opt {
value_serializer = serializer.new_u64_fast_field_with_idx(
self.field,
0u64,
mapping.len() as u64,
1,
)?;
if self.fast_field_type.is_facet() {
let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
for vals in self.get_ordered_values(doc_id_map) {
doc_vals.clear();
@@ -179,19 +180,27 @@ impl MultiValuedFastFieldWriter {
value_serializer.add_val(val)?;
}
}
}
None => {
let val_min_max = minmax(self.vals.iter().cloned());
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
} else {
for vals in self.get_ordered_values(doc_id_map) {
// sort values in case of remapped doc_ids?
for &val in vals {
let remapped_vals = vals
.iter()
.map(|val| *mapping.get(val).expect("Missing term ordinal"));
for val in remapped_vals {
value_serializer.add_val(val)?;
}
}
}
} else {
let val_min_max = minmax(self.vals.iter().cloned());
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
for vals in self.get_ordered_values(doc_id_map) {
// sort values in case of remapped doc_ids?
for &val in vals {
value_serializer.add_val(val)?;
}
}
}
value_serializer.close_field()?;
}

View File

@@ -5,7 +5,7 @@ use crate::fastfield::{
};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage;
use crate::TantivyError;
use crate::{DateTime, TantivyError};
/// Provides access to all of the BitpackedFastFieldReader.
///
@@ -39,6 +39,9 @@ pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType,
.get_fastfield_cardinality()
.map(|cardinality| (FastType::Date, cardinality)),
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
FieldType::Str(options) if options.is_fast() => {
Some((FastType::U64, Cardinality::MultiValues))
}
_ => None,
}
}
@@ -147,10 +150,10 @@ impl FastFieldReaders {
self.typed_fast_field_reader(field)
}
/// Returns the `i64` fast field reader associated to `field`.
/// Returns the `date` fast field reader associated to `field`.
///
/// If `field` is not a i64 fast field, this method returns an Error.
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<crate::DateTime>> {
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<DateTime>> {
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -195,13 +198,12 @@ impl FastFieldReaders {
self.typed_fast_field_multi_reader(field)
}
/// Returns a `crate::DateTime` multi-valued fast field reader associated to `field`.
/// Returns a `time::OffsetDateTime` multi-valued fast field reader associated to
/// `field`.
///
/// If `field` is not a `crate::DateTime` multi-valued fast field, this method returns an Error.
pub fn dates(
&self,
field: Field,
) -> crate::Result<MultiValuedFastFieldReader<crate::DateTime>> {
/// If `field` is not a `time::OffsetDateTime` multi-valued fast field, this method returns an
/// Error.
pub fn dates(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<DateTime>> {
self.check_type(field, FastType::Date, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(field)
}

View File

@@ -7,7 +7,7 @@ use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats;
use super::FastFieldDataAccess;
use super::{FastFieldDataAccess, FastFieldType};
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
@@ -16,6 +16,7 @@ use crate::termdict::TermOrdinal;
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
term_id_writers: Vec<MultiValuedFastFieldWriter>,
single_value_writers: Vec<IntFastFieldWriter>,
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
bytes_value_writers: Vec<BytesFastFieldWriter>,
@@ -33,6 +34,7 @@ impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let mut single_value_writers = Vec::new();
let mut term_id_writers = Vec::new();
let mut multi_values_writers = Vec::new();
let mut bytes_value_writers = Vec::new();
@@ -50,15 +52,22 @@ impl FastFieldsWriter {
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(field, false);
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::Numeric);
multi_values_writers.push(fast_field_writer);
}
None => {}
}
}
FieldType::Facet(_) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
multi_values_writers.push(fast_field_writer);
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet);
term_id_writers.push(fast_field_writer);
}
FieldType::Str(_) if field_entry.is_fast() => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::String);
term_id_writers.push(fast_field_writer);
}
FieldType::Bytes(bytes_option) => {
if bytes_option.is_fast() {
@@ -70,6 +79,7 @@ impl FastFieldsWriter {
}
}
FastFieldsWriter {
term_id_writers,
single_value_writers,
multi_values_writers,
bytes_value_writers,
@@ -78,10 +88,15 @@ impl FastFieldsWriter {
/// The memory used (including children)
pub fn mem_usage(&self) -> usize {
self.single_value_writers
self.term_id_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.single_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.multi_values_writers
.iter()
@@ -94,6 +109,14 @@ impl FastFieldsWriter {
.sum::<usize>()
}
/// Get the `FastFieldWriter` associated to a field.
pub fn get_term_id_writer(&self, field: Field) -> Option<&MultiValuedFastFieldWriter> {
// TODO optimize
self.term_id_writers
.iter()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated to a field.
pub fn get_field_writer(&self, field: Field) -> Option<&IntFastFieldWriter> {
// TODO optimize
@@ -110,6 +133,17 @@ impl FastFieldsWriter {
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated to a field.
pub fn get_term_id_writer_mut(
&mut self,
field: Field,
) -> Option<&mut MultiValuedFastFieldWriter> {
// TODO optimize
self.term_id_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Returns the fast field multi-value writer for the given field.
///
/// Returns None if the field does not exist, or is not
@@ -137,6 +171,9 @@ impl FastFieldsWriter {
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) {
for field_writer in &mut self.term_id_writers {
field_writer.add_document(doc);
}
for field_writer in &mut self.single_value_writers {
field_writer.add_document(doc);
}
@@ -156,6 +193,10 @@ impl FastFieldsWriter {
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
for field_writer in &self.term_id_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
@@ -244,6 +285,10 @@ impl IntFastFieldWriter {
self.val_count += 1;
}
/// Extract the fast field value from the document
/// (or use the default value) and records it.
///
///
/// Extract the value associated to the fast field for
/// this document.
///
@@ -254,18 +299,17 @@ impl IntFastFieldWriter {
/// instead.
/// If the document has more than one value for the given field,
/// only the first one is taken in account.
fn extract_val(&self, doc: &Document) -> u64 {
match doc.get_first(self.field) {
Some(v) => super::value_to_u64(v),
None => self.val_if_missing,
}
}
/// Extract the fast field value from the document
/// (or use the default value) and records it.
///
/// Values on text fast fields are skipped.
pub fn add_document(&mut self, doc: &Document) {
let val = self.extract_val(doc);
self.add_val(val);
match doc.get_first(self.field) {
Some(v) => {
self.add_val(super::value_to_u64(v));
}
None => {
self.add_val(self.val_if_missing);
}
};
}
/// get iterator over the data
@@ -284,6 +328,7 @@ impl IntFastFieldWriter {
} else {
(self.val_min, self.val_max)
};
let fastfield_accessor = WriterFastFieldAccessProvider {
doc_id_map,
vals: &self.vals,

src/future_result.rs (new file, 130 lines)
View File

@@ -0,0 +1,130 @@
use std::future::Future;
use std::pin::Pin;
use std::task::Poll;
use crate::TantivyError;
/// `FutureResult` is a handle that makes it possible to wait for the completion
/// of an ongoing task.
///
/// Unlike an ordinary `Future`, the underlying task makes progress without
/// being polled. Dropping the `FutureResult` does not cancel the task being
/// executed either.
///
/// - In a sync context, you can call `FutureResult::wait()`. The function
/// does not rely on `block_on`.
/// - In an async context, you can simply use `FutureResult` as a future.
pub struct FutureResult<T> {
inner: Inner<T>,
}
enum Inner<T> {
FailedBeforeStart(Option<TantivyError>),
InProgress {
receiver: oneshot::Receiver<crate::Result<T>>,
error_msg_if_failure: &'static str,
},
}
impl<T> From<TantivyError> for FutureResult<T> {
fn from(err: TantivyError) -> Self {
FutureResult {
inner: Inner::FailedBeforeStart(Some(err)),
}
}
}
impl<T> FutureResult<T> {
pub(crate) fn create(
error_msg_if_failure: &'static str,
) -> (Self, oneshot::Sender<crate::Result<T>>) {
let (sender, receiver) = oneshot::channel();
let inner: Inner<T> = Inner::InProgress {
receiver,
error_msg_if_failure,
};
(FutureResult { inner }, sender)
}
/// Blocks until the scheduled result is available.
///
/// In an async context, you should simply use `FutureResult` as a future.
pub fn wait(self) -> crate::Result<T> {
match self.inner {
Inner::FailedBeforeStart(err) => Err(err.unwrap()),
Inner::InProgress {
receiver,
error_msg_if_failure,
} => receiver.recv().unwrap_or_else(|_| {
Err(crate::TantivyError::SystemError(
error_msg_if_failure.to_string(),
))
}),
}
}
}
impl<T> Future for FutureResult<T> {
type Output = crate::Result<T>;
fn poll(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<Self::Output> {
unsafe {
match &mut Pin::get_unchecked_mut(self).inner {
Inner::FailedBeforeStart(err) => Poll::Ready(Err(err.take().unwrap())),
Inner::InProgress {
receiver,
error_msg_if_failure,
} => match Future::poll(Pin::new_unchecked(receiver), cx) {
Poll::Ready(oneshot_res) => {
let res = oneshot_res.unwrap_or_else(|_| {
Err(crate::TantivyError::SystemError(
error_msg_if_failure.to_string(),
))
});
Poll::Ready(res)
}
Poll::Pending => Poll::Pending,
},
}
}
}
}
#[cfg(test)]
mod tests {
use futures::executor::block_on;
use super::FutureResult;
use crate::TantivyError;
#[test]
fn test_scheduled_result_failed_to_schedule() {
let scheduled_result: FutureResult<()> = FutureResult::from(TantivyError::Poisoned);
let res = block_on(scheduled_result);
assert!(matches!(res, Err(TantivyError::Poisoned)));
}
#[test]
fn test_scheduled_result_error() {
let (scheduled_result, tx): (FutureResult<()>, _) = FutureResult::create("failed");
drop(tx);
let res = block_on(scheduled_result);
assert!(matches!(res, Err(TantivyError::SystemError(_))));
}
#[test]
fn test_scheduled_result_sent_success() {
let (scheduled_result, tx): (FutureResult<u64>, _) = FutureResult::create("failed");
tx.send(Ok(2u64)).unwrap();
assert_eq!(block_on(scheduled_result).unwrap(), 2u64);
}
#[test]
fn test_scheduled_result_sent_error() {
let (scheduled_result, tx): (FutureResult<u64>, _) = FutureResult::create("failed");
tx.send(Err(TantivyError::Poisoned)).unwrap();
let res = block_on(scheduled_result);
assert!(matches!(res, Err(TantivyError::Poisoned)));
}
}
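For callers, the practical upshot of `FutureResult` is that merge-like operations can be consumed either by blocking or by awaiting. A minimal sketch, assuming a configured `IndexWriter` and a non-empty list of segment ids are already available; error handling is deliberately thin.
use tantivy::{IndexWriter, SegmentId, SegmentMeta};
// Sync context: block on the handle directly, no executor required.
fn merge_blocking(writer: &mut IndexWriter, segment_ids: &[SegmentId]) -> tantivy::Result<SegmentMeta> {
    writer.merge(segment_ids).wait()
}
// Async context: `FutureResult` implements `Future`, so it can simply be awaited.
async fn merge_awaiting(writer: &mut IndexWriter, segment_ids: &[SegmentId]) -> tantivy::Result<SegmentMeta> {
    writer.merge(segment_ids).await
}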

View File

@@ -116,14 +116,14 @@ pub fn demux(
) -> crate::Result<Vec<Index>> {
let mut indices = vec![];
for (target_segment_ord, output_directory) in output_directories.into_iter().enumerate() {
let delete_bitsets = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
let alive_bitset = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
.into_iter()
.map(Some)
.collect_vec();
let index = merge_filtered_segments(
segments,
target_settings.clone(),
delete_bitsets,
alive_bitset,
output_directory,
)?;
indices.push(index);
@@ -141,7 +141,7 @@ mod tests {
use crate::{DocAddress, Term};
#[test]
fn test_demux_map_to_deletebitset() {
fn test_demux_map_to_alive_bitset() {
let max_value = 2;
let mut demux_mapping = DemuxMapping::default();
// segment ordinal 0 mapping

View File

@@ -4,9 +4,6 @@ use std::thread;
use std::thread::JoinHandle;
use common::BitSet;
use crossbeam::channel;
use futures::executor::block_on;
use futures::future::Future;
use smallvec::smallvec;
use super::operation::{AddOperation, UserOperation};
@@ -24,7 +21,7 @@ use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper;
use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter};
use crate::schema::{Document, IndexRecordOption, Term};
use crate::Opstamp;
use crate::{FutureResult, Opstamp};
// Size of the margin for the `memory_arena`. A segment is closed when the remaining memory
// in the `memory_arena` goes below MARGIN_IN_BYTES.
@@ -214,7 +211,7 @@ fn index_documents(
meta.untrack_temp_docstore();
// update segment_updater inventory to remove tempstore
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
block_on(segment_updater.schedule_add_segment(segment_entry))?;
segment_updater.schedule_add_segment(segment_entry).wait()?;
Ok(())
}
@@ -291,7 +288,7 @@ impl IndexWriter {
return Err(TantivyError::InvalidArgument(err_msg));
}
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new();
@@ -328,7 +325,7 @@ impl IndexWriter {
}
fn drop_sender(&mut self) {
let (sender, _receiver) = channel::bounded(1);
let (sender, _receiver) = crossbeam_channel::bounded(1);
self.operation_sender = sender;
}
@@ -368,7 +365,9 @@ impl IndexWriter {
pub fn add_segment(&self, segment_meta: SegmentMeta) -> crate::Result<()> {
let delete_cursor = self.delete_queue.cursor();
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
block_on(self.segment_updater.schedule_add_segment(segment_entry))
self.segment_updater
.schedule_add_segment(segment_entry)
.wait()
}
/// Creates a new segment.
@@ -465,8 +464,8 @@ impl IndexWriter {
}
/// Detects and removes the files that are not used by the index anymore.
pub async fn garbage_collect_files(&self) -> crate::Result<GarbageCollectionResult> {
self.segment_updater.schedule_garbage_collect().await
pub fn garbage_collect_files(&self) -> FutureResult<GarbageCollectionResult> {
self.segment_updater.schedule_garbage_collect()
}
/// Deletes all documents from the index
@@ -516,13 +515,10 @@ impl IndexWriter {
/// Merges a given list of segments
///
/// `segment_ids` is required to be non-empty.
pub fn merge(
&mut self,
segment_ids: &[SegmentId],
) -> impl Future<Output = crate::Result<SegmentMeta>> {
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> FutureResult<SegmentMeta> {
let merge_operation = self.segment_updater.make_merge_operation(segment_ids);
let segment_updater = self.segment_updater.clone();
async move { segment_updater.start_merge(merge_operation)?.await }
segment_updater.start_merge(merge_operation)
}
/// Closes the current document channel send.
@@ -535,7 +531,7 @@ impl IndexWriter {
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) {
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
self.operation_sender = document_sender;
self.index_writer_status = IndexWriterStatus::from(document_receiver);
}
@@ -781,7 +777,6 @@ impl Drop for IndexWriter {
mod tests {
use std::collections::{HashMap, HashSet};
use futures::executor::block_on;
use proptest::prelude::*;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
@@ -1456,7 +1451,7 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids)).unwrap();
index_writer.merge(&segment_ids).wait().unwrap();
assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
}
}
@@ -1472,7 +1467,7 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids)).unwrap();
index_writer.merge(&segment_ids).wait().unwrap();
assert!(index_writer.wait_merging_threads().is_ok());
}
}

View File

@@ -92,7 +92,7 @@ impl Drop for IndexWriterBomb {
mod tests {
use std::mem;
use crossbeam::channel;
use crossbeam_channel as channel;
use super::IndexWriterStatus;

View File

@@ -1,13 +1,14 @@
use chrono::Utc;
use fnv::FnvHashMap;
use murmurhash32::murmurhash2;
use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use crate::schema::Type;
use crate::schema::{Field, Type};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
use crate::tokenizer::TextAnalyzer;
use crate::{DocId, Term};
use crate::{DateTime, DocId, Term};
/// This object is a map storing the last position for a given path for the current document
/// being indexed.
@@ -148,10 +149,11 @@ fn index_json_value<'a>(
json_term_writer.term_buffer,
ctx,
indexing_position,
None,
);
}
TextOrDateTime::DateTime(dt) => {
json_term_writer.set_fast_value(dt);
json_term_writer.set_fast_value(DateTime::from_utc(dt));
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
},
@@ -184,25 +186,90 @@ fn index_json_value<'a>(
enum TextOrDateTime<'a> {
Text(&'a str),
DateTime(crate::DateTime),
DateTime(OffsetDateTime),
}
fn infer_type_from_str(text: &str) -> TextOrDateTime {
match chrono::DateTime::parse_from_rfc3339(text) {
match OffsetDateTime::parse(text, &Rfc3339) {
Ok(dt) => {
let dt_utc = dt.with_timezone(&Utc);
let dt_utc = dt.to_offset(UtcOffset::UTC);
TextOrDateTime::DateTime(dt_utc)
}
Err(_) => TextOrDateTime::Text(text),
}
}
// Tries to infer a JSON type from a string
pub(crate) fn convert_to_fast_value_and_get_term(
json_term_writer: &mut JsonTermWriter,
phrase: &str,
) -> Option<Term> {
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
let dt_utc = dt.to_offset(UtcOffset::UTC);
return Some(set_fastvalue_and_get_term(
json_term_writer,
DateTime::from_utc(dt_utc),
));
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, i64_val));
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, f64_val));
}
None
}
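The helper above tries the most specific interpretation first: RFC 3339 date, then `u64`, `i64`, and finally `f64`, falling back to plain text when nothing parses. A standalone sketch of that same ladder, using only the `time` crate that tantivy re-exports; the `infer` function is illustrative and not part of the codebase.
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
// Mirrors the parse order used when interpreting a JSON string from a query.
fn infer(phrase: &str) -> &'static str {
    if OffsetDateTime::parse(phrase, &Rfc3339).is_ok() {
        "date"
    } else if phrase.parse::<u64>().is_ok() {
        "u64"
    } else if phrase.parse::<i64>().is_ok() {
        "i64"
    } else if phrase.parse::<f64>().is_ok() {
        "f64"
    } else {
        "text"
    }
}
fn main() {
    assert_eq!(infer("1985-04-12T23:20:50.52Z"), "date");
    assert_eq!(infer("42"), "u64");
    assert_eq!(infer("-7"), "i64");
    assert_eq!(infer("2.5"), "f64");
    assert_eq!(infer("hello"), "text");
}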
// helper function to generate a Term from a json fastvalue
pub(crate) fn set_fastvalue_and_get_term<T: FastValue>(
json_term_writer: &mut JsonTermWriter,
value: T,
) -> Term {
json_term_writer.set_fast_value(value);
json_term_writer.term().clone()
}
// helper function to generate a list of terms with their positions from a textual json value
pub(crate) fn set_string_and_get_terms(
json_term_writer: &mut JsonTermWriter,
value: &str,
text_analyzer: &TextAnalyzer,
) -> Vec<(usize, Term)> {
let mut positions_and_terms = Vec::<(usize, Term)>::new();
json_term_writer.close_path_and_set_type(Type::Str);
let term_num_bytes = json_term_writer.term_buffer.as_slice().len();
let mut token_stream = text_analyzer.token_stream(value);
token_stream.process(&mut |token| {
json_term_writer.term_buffer.truncate(term_num_bytes);
json_term_writer
.term_buffer
.append_bytes(token.text.as_bytes());
positions_and_terms.push((token.position, json_term_writer.term().clone()));
});
positions_and_terms
}
pub struct JsonTermWriter<'a> {
term_buffer: &'a mut Term,
path_stack: Vec<usize>,
}
impl<'a> JsonTermWriter<'a> {
pub fn from_field_and_json_path(
field: Field,
json_path: &str,
term_buffer: &'a mut Term,
) -> Self {
term_buffer.set_field(Type::Json, field);
let mut json_term_writer = Self::wrap(term_buffer);
for segment in json_path.split('.') {
json_term_writer.push_path_segment(segment);
}
json_term_writer
}
pub fn wrap(term_buffer: &'a mut Term) -> Self {
term_buffer.clear_with_type(Type::Json);
let mut path_stack = Vec::with_capacity(10);

View File

@@ -170,8 +170,8 @@ impl IndexMerger {
index_settings: IndexSettings,
segments: &[Segment],
) -> crate::Result<IndexMerger> {
let delete_bitsets = segments.iter().map(|_| None).collect_vec();
Self::open_with_custom_alive_set(schema, index_settings, segments, delete_bitsets)
let alive_bitset = segments.iter().map(|_| None).collect_vec();
Self::open_with_custom_alive_set(schema, index_settings, segments, alive_bitset)
}
// Create merge with a custom delete set.
@@ -180,7 +180,7 @@ impl IndexMerger {
// corresponds to the segment index.
//
// If `None` is provided for custom alive set, the regular alive set will be used.
// If a delete_bitsets is provided, the union between the provided and regular
// If an alive_bitset is provided, the union of the provided and the regular
// alive set will be used.
//
// This can be used to merge but also apply an additional filter.
@@ -283,12 +283,12 @@ impl IndexMerger {
for (field, field_entry) in self.schema.fields() {
let field_type = field_entry.field_type();
match field_type {
FieldType::Facet(_) => {
FieldType::Facet(_) | FieldType::Str(_) if field_type.is_fast() => {
let term_ordinal_mapping = term_ord_mappings.remove(&field).expect(
"Logic Error in Tantivy (Please report). Facet field should have required \
a`term_ordinal_mapping`.",
);
self.write_hierarchical_facet_field(
self.write_term_id_fast_field(
field,
&term_ordinal_mapping,
fast_field_serializer,
@@ -312,8 +312,8 @@ impl IndexMerger {
self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
}
}
FieldType::Str(_) | FieldType::JsonObject(_) => {
// We don't handle json / string fast field for the moment
_ => {
// We don't handle json fast field for the moment
// They can be implemented using what is done
// for facets in the future
}
@@ -590,14 +590,14 @@ impl IndexMerger {
)
}
fn write_hierarchical_facet_field(
fn write_term_id_fast_field(
&self,
field: Field,
term_ordinal_mappings: &TermOrdinalMapping,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
debug_time!("write-hierarchical-facet-field");
debug_time!("write-term-id-fast-field");
// Multifastfield consists of 2 fastfields.
// The first serves as an index into the second one and is strictly increasing.
@@ -848,6 +848,9 @@ impl IndexMerger {
let mut term_ord_mapping_opt = match field_type {
FieldType::Facet(_) => Some(TermOrdinalMapping::new(max_term_ords)),
FieldType::Str(options) if options.is_fast() => {
Some(TermOrdinalMapping::new(max_term_ords))
}
_ => None,
};
@@ -1133,7 +1136,6 @@ impl IndexMerger {
#[cfg(test)]
mod tests {
use byteorder::{BigEndian, ReadBytesExt};
use futures::executor::block_on;
use schema::FAST;
use crate::collector::tests::{
@@ -1147,9 +1149,10 @@ mod tests {
Cardinality, Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term,
TextFieldIndexing, INDEXED, TEXT,
};
use crate::time::OffsetDateTime;
use crate::{
assert_nearly_equals, schema, DocAddress, DocSet, IndexSettings, IndexSortByField,
IndexWriter, Order, Searcher, SegmentId,
assert_nearly_equals, schema, DateTime, DocAddress, DocSet, IndexSettings,
IndexSortByField, IndexWriter, Order, Searcher, SegmentId,
};
#[test]
@@ -1157,9 +1160,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
@@ -1169,14 +1170,14 @@ mod tests {
let bytes_score_field = schema_builder.add_bytes_field("score_bytes", FAST);
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader()?;
let curr_time = chrono::Utc::now();
let curr_time = OffsetDateTime::now_utc();
{
let mut index_writer = index.writer_for_tests()?;
// writing the segment
index_writer.add_document(doc!(
text_field => "af b",
score_field => 3u64,
date_field => curr_time,
date_field => DateTime::from_utc(curr_time),
bytes_score_field => 3u32.to_be_bytes().as_ref()
))?;
index_writer.add_document(doc!(
@@ -1193,7 +1194,7 @@ mod tests {
// writing the segment
index_writer.add_document(doc!(
text_field => "af b",
date_field => curr_time,
date_field => DateTime::from_utc(curr_time),
score_field => 11u64,
bytes_score_field => 11u32.to_be_bytes().as_ref()
))?;
@@ -1209,7 +1210,7 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
{
@@ -1249,7 +1250,10 @@ mod tests {
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)])?,
get_doc_ids(vec![Term::from_field_date(
date_field,
DateTime::from_utc(curr_time)
)])?,
vec![DocAddress::new(0, 0), DocAddress::new(0, 3)]
);
}
@@ -1458,7 +1462,7 @@ mod tests {
{
// merging the segments
let segment_ids = index.searchable_segment_ids()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
reader.reload()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
@@ -1551,7 +1555,7 @@ mod tests {
{
// Test merging a single segment in order to remove deletes.
let segment_ids = index.searchable_segment_ids()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
reader.reload()?;
let searcher = reader.searcher();
@@ -1771,7 +1775,10 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
test_searcher(
@@ -1826,7 +1833,7 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
reader.reload()?;
// commit has not been called yet. The document should still be
// there.
@@ -1853,7 +1860,7 @@ mod tests {
index_writer.commit()?;
index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index.searchable_segment_ids()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
// assert delete has not been committed
reader.reload()?;
@@ -1954,7 +1961,7 @@ mod tests {
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
reader.reload()?;
@@ -2082,7 +2089,7 @@ mod tests {
.iter()
.map(|reader| reader.segment_id())
.collect();
block_on(writer.merge(&segment_ids[..]))?;
writer.merge(&segment_ids[..]).wait()?;
reader.reload()?;
let searcher = reader.searcher();

View File

@@ -1,7 +1,5 @@
#[cfg(test)]
mod tests {
use futures::executor::block_on;
use crate::collector::TopDocs;
use crate::core::Index;
use crate::fastfield::{AliveBitSet, FastFieldReader, MultiValuedFastFieldReader};
@@ -50,7 +48,7 @@ mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.merge(&segment_ids).wait().is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
index
@@ -140,7 +138,7 @@ mod tests {
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
Ok(index)

View File

@@ -21,11 +21,13 @@ pub mod segment_updater;
mod segment_writer;
mod stamper;
use crossbeam::channel;
use crossbeam_channel as channel;
use smallvec::SmallVec;
pub use self::index_writer::IndexWriter;
pub(crate) use self::json_term_writer::JsonTermWriter;
pub(crate) use self::json_term_writer::{
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
};
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};

View File

@@ -1,7 +1,5 @@
use futures::executor::block_on;
use super::IndexWriter;
use crate::Opstamp;
use crate::{FutureResult, Opstamp};
/// A prepared commit
pub struct PreparedCommit<'a> {
@@ -35,9 +33,9 @@ impl<'a> PreparedCommit<'a> {
}
/// Proceeds to commit.
/// See `.commit_async()`.
/// See `.commit_future()`.
pub fn commit(self) -> crate::Result<Opstamp> {
block_on(self.commit_async())
self.commit_future().wait()
}
/// Proceeds to commit.
@@ -45,12 +43,10 @@ impl<'a> PreparedCommit<'a> {
/// Unfortunately, contrary to what `PrepareCommit` may suggest,
/// this operation is not lightweight at all.
/// At this point deletes have not been flushed yet.
pub async fn commit_async(self) -> crate::Result<Opstamp> {
pub fn commit_future(self) -> FutureResult<Opstamp> {
info!("committing {}", self.opstamp);
self.index_writer
.segment_updater()
.schedule_commit(self.opstamp, self.payload)
.await?;
Ok(self.opstamp)
}
}
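As a usage note, nothing changes for callers of the blocking path; `commit()` now simply wraps `commit_future().wait()`. A small sketch, assuming the writer exposes `prepare_commit()` as in earlier releases (not shown in this hunk) and ignoring payloads:
use tantivy::{IndexWriter, Opstamp};
// Two-phase commit: prepare first, then finalize.
fn two_phase_commit(writer: &mut IndexWriter) -> tantivy::Result<Opstamp> {
    let prepared = writer.prepare_commit()?;
    // Equivalent to `prepared.commit_future().wait()`.
    prepared.commit()
}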

View File

@@ -39,9 +39,10 @@ impl SegmentSerializer {
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
let compressor = segment.index().settings().docstore_compression;
let blocksize = segment.index().settings().docstore_blocksize;
Ok(SegmentSerializer {
segment,
store_writer: StoreWriter::new(store_write, compressor),
store_writer: StoreWriter::new(store_write, compressor, blocksize),
fast_field_serializer,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
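The serializer above now forwards the configured doc-store block size to `StoreWriter` alongside the compressor. A hedged sketch of opting into a larger block size, assuming `IndexSettings` keeps public `docstore_blocksize`/`docstore_compression` fields with a `Default` impl and that the `Index::builder()` path accepts custom settings (that builder surface is not part of this hunk):
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{Index, IndexSettings};
fn create_index_with_large_blocks() -> tantivy::Result<Index> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();
    // Larger blocks trade some random-access cost for better compression
    // ratios on large documents; 16 KiB is an illustrative value only.
    let settings = IndexSettings {
        docstore_blocksize: 16_384,
        ..IndexSettings::default()
    };
    Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()
}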

View File

@@ -1,6 +1,5 @@
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::io;
use std::io::Write;
use std::ops::Deref;
use std::path::PathBuf;
@@ -8,9 +7,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use fail::fail_point;
use futures::channel::oneshot;
use futures::executor::{ThreadPool, ThreadPoolBuilder};
use futures::future::{Future, TryFutureExt};
use rayon::{ThreadPool, ThreadPoolBuilder};
use super::segment_manager::SegmentManager;
use crate::core::{
@@ -29,7 +26,7 @@ use crate::indexer::{
SegmentSerializer,
};
use crate::schema::Schema;
use crate::{Opstamp, TantivyError};
use crate::{FutureResult, Opstamp};
const NUM_MERGE_THREADS: usize = 4;
@@ -75,10 +72,12 @@ fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()>
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
fail_point!("save_metas", |msg| Err(TantivyError::from(io::Error::new(
io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
))));
fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
std::io::Error::new(
std::io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
)
)));
directory.sync_directory()?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
@@ -105,7 +104,7 @@ impl Deref for SegmentUpdater {
}
}
async fn garbage_collect_files(
fn garbage_collect_files(
segment_updater: SegmentUpdater,
) -> crate::Result<GarbageCollectionResult> {
info!("Running garbage collection");
@@ -309,18 +308,18 @@ impl SegmentUpdater {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
let pool = ThreadPoolBuilder::new()
.name_prefix("segment_updater")
.pool_size(1)
.create()
.thread_name(|_| "segment_updater".to_string())
.num_threads(1)
.build()
.map_err(|_| {
crate::TantivyError::SystemError(
"Failed to spawn segment updater thread".to_string(),
)
})?;
let merge_thread_pool = ThreadPoolBuilder::new()
.name_prefix("merge_thread")
.pool_size(NUM_MERGE_THREADS)
.create()
.thread_name(|i| format!("merge_thread_{i}"))
.num_threads(NUM_MERGE_THREADS)
.build()
.map_err(|_| {
crate::TantivyError::SystemError(
"Failed to spawn segment merging thread".to_string(),
@@ -349,39 +348,30 @@ impl SegmentUpdater {
*self.merge_policy.write().unwrap() = arc_merge_policy;
}
async fn schedule_task<
T: 'static + Send,
F: Future<Output = crate::Result<T>> + 'static + Send,
>(
fn schedule_task<T: 'static + Send, F: FnOnce() -> crate::Result<T> + 'static + Send>(
&self,
task: F,
) -> crate::Result<T> {
) -> FutureResult<T> {
if !self.is_alive() {
return Err(crate::TantivyError::SystemError(
"Segment updater killed".to_string(),
));
return crate::TantivyError::SystemError("Segment updater killed".to_string()).into();
}
let (sender, receiver) = oneshot::channel();
self.pool.spawn_ok(async move {
let task_result = task.await;
let (scheduled_result, sender) = FutureResult::create(
"A segment_updater future did not succeed. This should never happen.",
);
self.pool.spawn(|| {
let task_result = task();
let _ = sender.send(task_result);
});
let task_result = receiver.await;
task_result.unwrap_or_else(|_| {
let err_msg =
"A segment_updater future did not success. This should never happen.".to_string();
Err(crate::TantivyError::SystemError(err_msg))
})
scheduled_result
}
pub async fn schedule_add_segment(&self, segment_entry: SegmentEntry) -> crate::Result<()> {
pub fn schedule_add_segment(&self, segment_entry: SegmentEntry) -> FutureResult<()> {
let segment_updater = self.clone();
self.schedule_task(async move {
self.schedule_task(move || {
segment_updater.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options().await;
segment_updater.consider_merge_options();
Ok(())
})
.await
}
/// Orders `SegmentManager` to remove all segments
@@ -448,9 +438,9 @@ impl SegmentUpdater {
Ok(())
}
pub async fn schedule_garbage_collect(&self) -> crate::Result<GarbageCollectionResult> {
let garbage_collect_future = garbage_collect_files(self.clone());
self.schedule_task(garbage_collect_future).await
pub fn schedule_garbage_collect(&self) -> FutureResult<GarbageCollectionResult> {
let self_clone = self.clone();
self.schedule_task(move || garbage_collect_files(self_clone))
}
/// List the files that are useful to the index.
@@ -468,21 +458,20 @@ impl SegmentUpdater {
files
}
pub(crate) async fn schedule_commit(
pub(crate) fn schedule_commit(
&self,
opstamp: Opstamp,
payload: Option<String>,
) -> crate::Result<()> {
) -> FutureResult<Opstamp> {
let segment_updater: SegmentUpdater = self.clone();
self.schedule_task(async move {
self.schedule_task(move || {
let segment_entries = segment_updater.purge_deletes(opstamp)?;
segment_updater.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp, payload)?;
let _ = garbage_collect_files(segment_updater.clone()).await;
segment_updater.consider_merge_options().await;
Ok(())
let _ = garbage_collect_files(segment_updater.clone());
segment_updater.consider_merge_options();
Ok(opstamp)
})
.await
}
fn store_meta(&self, index_meta: &IndexMeta) {
@@ -515,26 +504,33 @@ impl SegmentUpdater {
// suggested and the moment when it ended up being executed.)
//
// `segment_ids` is required to be non-empty.
pub fn start_merge(
&self,
merge_operation: MergeOperation,
) -> crate::Result<impl Future<Output = crate::Result<SegmentMeta>>> {
pub fn start_merge(&self, merge_operation: MergeOperation) -> FutureResult<SegmentMeta> {
assert!(
!merge_operation.segment_ids().is_empty(),
"Segment_ids cannot be empty."
);
let segment_updater = self.clone();
let segment_entries: Vec<SegmentEntry> = self
let segment_entries: Vec<SegmentEntry> = match self
.segment_manager
.start_merge(merge_operation.segment_ids())?;
.start_merge(merge_operation.segment_ids())
{
Ok(segment_entries) => segment_entries,
Err(err) => {
warn!(
"Starting the merge failed for the following reason. This is not fatal. {}",
err
);
return err.into();
}
};
info!("Starting merge - {:?}", merge_operation.segment_ids());
let (merging_future_send, merging_future_recv) =
oneshot::channel::<crate::Result<SegmentMeta>>();
let (scheduled_result, merging_future_send) =
FutureResult::create("Merge operation failed.");
self.merge_thread_pool.spawn_ok(async move {
self.merge_thread_pool.spawn(move || {
// The fact that `merge_operation` is moved here is important.
// Its lifetime is used to track how many merging thread are currently running,
// as well as which segment is currently in merge and therefore should not be
@@ -545,28 +541,23 @@ impl SegmentUpdater {
merge_operation.target_opstamp(),
) {
Ok(after_merge_segment_entry) => {
let segment_meta = segment_updater
.end_merge(merge_operation, after_merge_segment_entry)
.await;
let _send_result = merging_future_send.send(segment_meta);
let segment_meta_res =
segment_updater.end_merge(merge_operation, after_merge_segment_entry);
let _send_result = merging_future_send.send(segment_meta_res);
}
Err(e) => {
Err(merge_error) => {
warn!(
"Merge of {:?} was cancelled: {:?}",
merge_operation.segment_ids().to_vec(),
e
merge_error
);
// ... cancel merge
let _send_result = merging_future_send.send(Err(merge_error));
assert!(!cfg!(test), "Merge failed.");
}
}
});
Ok(merging_future_recv.unwrap_or_else(|e| {
Err(crate::TantivyError::SystemError(
"Merge failed:".to_string() + &e.to_string(),
))
}))
scheduled_result
}
pub(crate) fn get_mergeable_segments(&self) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
@@ -575,7 +566,7 @@ impl SegmentUpdater {
.get_mergeable_segments(&merge_segment_ids)
}
async fn consider_merge_options(&self) {
fn consider_merge_options(&self) {
let (committed_segments, uncommitted_segments) = self.get_mergeable_segments();
// Committed segments cannot be merged with uncommitted_segments.
@@ -601,23 +592,21 @@ impl SegmentUpdater {
merge_candidates.extend(committed_merge_candidates);
for merge_operation in merge_candidates {
if let Err(err) = self.start_merge(merge_operation) {
warn!(
"Starting the merge failed for the following reason. This is not fatal. {}",
err
);
}
// If a merge cannot be started this is not a fatal error.
// We do log a warning in `start_merge`.
let _ = self.start_merge(merge_operation);
}
}
async fn end_merge(
/// Queues an `end_merge` task in the segment updater and blocks until it is successfully processed.
fn end_merge(
&self,
merge_operation: MergeOperation,
mut after_merge_segment_entry: SegmentEntry,
) -> crate::Result<SegmentMeta> {
let segment_updater = self.clone();
let after_merge_segment_meta = after_merge_segment_entry.meta().clone();
self.schedule_task(async move {
self.schedule_task(move || {
info!("End merge {:?}", after_merge_segment_entry.meta());
{
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
@@ -655,13 +644,13 @@ impl SegmentUpdater {
.save_metas(previous_metas.opstamp, previous_metas.payload.clone())?;
}
segment_updater.consider_merge_options().await;
segment_updater.consider_merge_options();
} // we drop all possible handle to a now useless `SegmentMeta`.
let _ = garbage_collect_files(segment_updater).await;
let _ = garbage_collect_files(segment_updater);
Ok(())
})
.await?;
.wait()?;
Ok(after_merge_segment_meta)
}

View File

@@ -1,7 +1,7 @@
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
use super::operation::AddOperation;
use crate::core::Segment;
use crate::fastfield::FastFieldsWriter;
use crate::fastfield::{FastFieldsWriter, FastValue as _};
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::indexer::json_term_writer::index_json_values;
use crate::indexer::segment_serializer::SegmentSerializer;
@@ -188,7 +188,7 @@ impl SegmentWriter {
});
if let Some(unordered_term_id) = unordered_term_id_opt {
self.fast_field_writers
.get_multivalue_writer_mut(field)
.get_term_id_writer_mut(field)
.expect("writer for facet missing")
.add_val(unordered_term_id);
}
@@ -221,6 +221,7 @@ impl SegmentWriter {
}
let mut indexing_position = IndexingPosition::default();
for mut token_stream in token_streams {
assert_eq!(term_buffer.as_slice().len(), 5);
postings_writer.index_text(
@@ -229,10 +230,13 @@ impl SegmentWriter {
term_buffer,
ctx,
&mut indexing_position,
self.fast_field_writers.get_term_id_writer_mut(field),
);
}
self.fieldnorms_writer
.record(doc_id, field, indexing_position.num_tokens);
if field_entry.has_fieldnorms() {
self.fieldnorms_writer
.record(doc_id, field, indexing_position.num_tokens);
}
}
FieldType::U64(_) => {
for value in values {
@@ -244,7 +248,7 @@ impl SegmentWriter {
FieldType::Date(_) => {
for value in values {
let date_val = value.as_date().ok_or_else(make_schema_error)?;
term_buffer.set_i64(date_val.timestamp());
term_buffer.set_u64(date_val.to_u64());
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
}
}
@@ -368,9 +372,10 @@ fn remap_and_write(
.segment_mut()
.open_write(SegmentComponent::Store)?;
let compressor = serializer.segment().index().settings().docstore_compression;
let block_size = serializer.segment().index().settings().docstore_blocksize;
let old_store_writer = std::mem::replace(
&mut serializer.store_writer,
StoreWriter::new(store_write, compressor),
StoreWriter::new(store_write, compressor, block_size),
);
old_store_writer.close()?;
let store_read = StoreReader::open(
@@ -414,16 +419,16 @@ pub fn prepare_doc_for_store(doc: Document, schema: &Schema) -> Document {
#[cfg(test)]
mod tests {
use chrono::Utc;
use super::compute_initial_table_size;
use crate::collector::Count;
use crate::indexer::json_term_writer::JsonTermWriter;
use crate::postings::TermInfo;
use crate::query::PhraseQuery;
use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED};
use crate::{DateTime, DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED};
#[test]
fn test_hashmap_size() {
@@ -523,11 +528,9 @@ mod tests {
json_term_writer.pop_path_segment();
json_term_writer.pop_path_segment();
json_term_writer.push_path_segment("date");
json_term_writer.set_fast_value(
chrono::DateTime::parse_from_rfc3339("1985-04-12T23:20:50.52Z")
.unwrap()
.with_timezone(&Utc),
);
json_term_writer.set_fast_value(DateTime::from_utc(
OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
));
assert!(term_stream.advance());
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());

View File

@@ -123,10 +123,95 @@ mod functional_test;
#[macro_use]
mod macros;
mod future_result;
pub use chrono;
/// Re-export of the `time` crate
///
/// Tantivy uses [`time`](https://crates.io/crates/time) for dates.
pub use time;
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
/// A date/time value with second precision.
///
/// This timestamp does not carry any explicit time zone information.
/// Users are responsible for applying the provided conversion
/// functions consistently. Internally the time zone is assumed
/// to be UTC, which is also used implicitly for JSON serialization.
///
/// All constructors and conversions are provided as explicit
/// functions and not by implementing any `From`/`Into` traits
/// to prevent unintended usage.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct DateTime {
unix_timestamp: i64,
}
impl DateTime {
/// Create new from UNIX timestamp
pub const fn from_unix_timestamp(unix_timestamp: i64) -> Self {
Self { unix_timestamp }
}
/// Create new from `OffsetDateTime`
///
/// The given date/time is converted to UTC and the actual
/// time zone is discarded.
pub const fn from_utc(dt: OffsetDateTime) -> Self {
Self::from_unix_timestamp(dt.unix_timestamp())
}
/// Create new from `PrimitiveDateTime`
///
/// Implicitly assumes that the given date/time is in UTC!
/// Otherwise the original value can only be recovered with
/// [`Self::into_primitive()`].
pub const fn from_primitive(dt: PrimitiveDateTime) -> Self {
Self::from_utc(dt.assume_utc())
}
/// Convert to UNIX timestamp
pub const fn into_unix_timestamp(self) -> i64 {
let Self { unix_timestamp } = self;
unix_timestamp
}
/// Convert to UTC `OffsetDateTime`
pub fn into_utc(self) -> OffsetDateTime {
let Self { unix_timestamp } = self;
let utc_datetime =
OffsetDateTime::from_unix_timestamp(unix_timestamp).expect("valid UNIX timestamp");
debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
utc_datetime
}
/// Convert to `OffsetDateTime` with the given time zone
pub fn into_offset(self, offset: UtcOffset) -> OffsetDateTime {
self.into_utc().to_offset(offset)
}
/// Convert to `PrimitiveDateTime` without any time zone
///
/// The value should have been constructed with [`Self::from_primitive()`].
/// Otherwise the time zone is implicitly assumed to be UTC.
pub fn into_primitive(self) -> PrimitiveDateTime {
let utc_datetime = self.into_utc();
// Discard the UTC time zone offset
debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
PrimitiveDateTime::new(utc_datetime.date(), utc_datetime.time())
}
}
impl fmt::Debug for DateTime {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let utc_rfc3339 = self.into_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
f.write_str(&utc_rfc3339)
}
}
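Because `DateTime` is now tantivy's own second-precision wrapper rather than an alias for `chrono::DateTime<Utc>`, values enter and leave through the explicit constructors above. A short sketch using only the items defined here plus the re-exported `time` crate; the `main` wrapper is just for illustration.
use tantivy::time::format_description::well_known::Rfc3339;
use tantivy::time::OffsetDateTime;
use tantivy::DateTime;
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // From a parsed RFC 3339 string: the offset is normalized to UTC and only
    // whole seconds since the UNIX epoch are kept.
    let parsed = OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339)?;
    let dt = DateTime::from_utc(parsed);
    assert_eq!(dt.into_unix_timestamp(), parsed.unix_timestamp());
    // Round trip through the raw timestamp and back to a formatted string.
    let same = DateTime::from_unix_timestamp(dt.into_unix_timestamp());
    println!("{}", same.into_utc().format(&Rfc3339)?);
    Ok(())
}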
pub use crate::error::TantivyError;
pub use crate::future_result::FutureResult;
/// Tantivy result.
///
@@ -138,9 +223,6 @@ pub type Result<T> = std::result::Result<T, TantivyError>;
#[cfg(feature = "quickwit")]
pub type AsyncIoResult<T> = std::result::Result<T, crate::error::AsyncIoError>;
/// Tantivy DateTime
pub type DateTime = chrono::DateTime<chrono::Utc>;
mod core;
mod indexer;
@@ -308,6 +390,7 @@ pub mod tests {
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::FastFieldReader;
use crate::merge_policy::NoMergePolicy;
use crate::query::BooleanQuery;
use crate::schema::*;
use crate::{DocAddress, Index, Postings, ReloadPolicy};
@@ -935,8 +1018,6 @@ pub mod tests {
// motivated by #729
#[test]
fn test_update_via_delete_insert() -> crate::Result<()> {
use futures::executor::block_on;
use crate::collector::Count;
use crate::indexer::NoMergePolicy;
use crate::query::AllQuery;
@@ -990,8 +1071,7 @@ pub mod tests {
.iter()
.map(|reader| reader.segment_id())
.collect();
block_on(index_writer.merge(&segment_ids)).unwrap();
index_writer.merge(&segment_ids).wait()?;
index_reader.reload()?;
let searcher = index_reader.searcher();
assert_eq!(searcher.search(&AllQuery, &Count)?, DOC_COUNT as usize);
@@ -1006,6 +1086,7 @@ pub mod tests {
let schema = builder.build();
let index = Index::create_in_dir(&index_path, schema)?;
let mut writer = index.writer(50_000_000)?;
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..5000 {
writer.add_document(doc!(body => "foo"))?;
writer.add_document(doc!(body => "boo"))?;
@@ -1017,8 +1098,7 @@ pub mod tests {
writer.delete_term(Term::from_field_text(body, "foo"));
writer.commit()?;
let segment_ids = index.searchable_segment_ids()?;
let _ = futures::executor::block_on(writer.merge(&segment_ids));
writer.merge(&segment_ids).wait()?;
assert!(index.validate_checksum()?.is_empty());
Ok(())
}

View File

@@ -1,5 +1,6 @@
use std::io;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder};
@@ -42,6 +43,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
_fast_field_writer: Option<&mut MultiValuedFastFieldWriter>,
) {
self.str_posting_writer.index_text(
doc_id,
@@ -49,6 +51,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer,
ctx,
indexing_position,
None,
);
}

View File

@@ -6,6 +6,7 @@ use std::ops::Range;
use fnv::FnvHashMap;
use super::stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::recorder::{BufferLender, Recorder};
@@ -145,6 +146,7 @@ pub(crate) trait PostingsWriter {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
) {
let end_of_path_idx = term_buffer.as_slice().len();
let mut num_tokens = 0;
@@ -164,9 +166,14 @@ pub(crate) trait PostingsWriter {
term_buffer.append_bytes(token.text.as_bytes());
let start_position = indexing_position.end_position + token.position as u32;
end_position = start_position + token.position_length as u32;
self.subscribe(doc_id, start_position, term_buffer, ctx);
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
term_id_fast_field_writer.add_val(unordered_term_id);
}
num_tokens += 1;
});
indexing_position.end_position = end_position + POSITION_GAP;
indexing_position.num_tokens += num_tokens;
term_buffer.truncate(end_of_path_idx);

View File

@@ -244,12 +244,12 @@ impl MoreLikeThis {
FieldType::Date(_) => {
for value in values {
// TODO: Ask if this is the semantic (timestamp) we want
let val = value
let unix_timestamp = value
.as_date()
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
.timestamp();
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_i64(field, val);
.into_unix_timestamp();
if !self.is_noise_word(unix_timestamp.to_string()) {
let term = Term::from_field_i64(field, unix_timestamp);
*term_frequencies.entry(term).or_insert(0) += 1;
}
}

View File

@@ -126,9 +126,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
let no_positions = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
);
let text_field = schema_builder.add_text_field("text", no_positions);

View File

@@ -184,6 +184,66 @@ fn intersection_with_slop(left: &mut [u32], right: &[u32], slop: u32) -> usize {
count
}
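// Walks `left` and `right` with two cursors and counts positions of `left` that
// fall within `[right_val - slop, right_val]` for some value of `right`;
// consecutive left candidates covered by the same right value collapse into a
// single match (e.g. with slop = 1, left = [3, 7], right = [4], the count is 1).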
fn intersection_count_with_slop(left: &[u32], right: &[u32], slop: u32) -> usize {
let mut left_index = 0;
let mut right_index = 0;
let mut count = 0;
let left_len = left.len();
let right_len = right.len();
while left_index < left_len && right_index < right_len {
let left_val = left[left_index];
let right_val = right[right_index];
let right_slop = if right_val >= slop {
right_val - slop
} else {
0
};
if left_val < right_slop {
left_index += 1;
} else if right_slop <= left_val && left_val <= right_val {
while left_index + 1 < left_len {
let next_left_val = left[left_index + 1];
if next_left_val > right_val {
break;
}
left_index += 1;
}
count += 1;
left_index += 1;
right_index += 1;
} else if left_val > right_val {
right_index += 1;
}
}
count
}
fn intersection_exists_with_slop(left: &[u32], right: &[u32], slop: u32) -> bool {
let mut left_index = 0;
let mut right_index = 0;
let left_len = left.len();
let right_len = right.len();
while left_index < left_len && right_index < right_len {
let left_val = left[left_index];
let right_val = right[right_index];
let right_slop = if right_val >= slop {
right_val - slop
} else {
0
};
if left_val < right_slop {
left_index += 1;
} else if right_slop <= left_val && left_val <= right_val {
return true;
} else if left_val > right_val {
right_index += 1;
}
}
false
}
impl<TPostings: Postings> PhraseScorer<TPostings> {
pub fn new(
term_postings: Vec<(usize, TPostings)>,
@@ -237,11 +297,25 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
fn phrase_exists(&mut self) -> bool {
let intersection_len = self.compute_phrase_match();
if self.has_slop() {
return intersection_exists_with_slop(
&self.left[..intersection_len],
&self.right[..],
self.slop,
);
}
intersection_exists(&self.left[..intersection_len], &self.right[..])
}
fn compute_phrase_count(&mut self) -> u32 {
let intersection_len = self.compute_phrase_match();
if self.has_slop() {
return intersection_count_with_slop(
&self.left[..intersection_len],
&self.right[..],
self.slop,
) as u32;
}
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
}
@@ -252,12 +326,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
.positions(&mut self.left);
}
let mut intersection_len = self.left.len();
let end_term = if self.has_slop() {
self.num_terms
} else {
self.num_terms - 1
};
for i in 1..end_term {
for i in 1..self.num_terms - 1 {
{
self.intersection_docset
.docset_mut_specialized(i)
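The slop helpers above are easiest to read with a few concrete position lists. Below is a self-contained sketch, not tantivy's public API, that copies the counting variant and checks it on hand-verified inputs; positions are assumed to be offset-aligned so that an exact phrase match yields equal values:

```rust
// Self-contained copy of the two-pointer count shown above.
fn intersection_count_with_slop(left: &[u32], right: &[u32], slop: u32) -> usize {
    let (mut left_index, mut right_index, mut count) = (0, 0, 0);
    while left_index < left.len() && right_index < right.len() {
        let left_val = left[left_index];
        let right_val = right[right_index];
        let right_slop = right_val.saturating_sub(slop);
        if left_val < right_slop {
            left_index += 1;
        } else if right_slop <= left_val && left_val <= right_val {
            // Consume every left position that still fits under right_val.
            while left_index + 1 < left.len() && left[left_index + 1] <= right_val {
                left_index += 1;
            }
            count += 1;
            left_index += 1;
            right_index += 1;
        } else {
            right_index += 1;
        }
    }
    count
}

fn main() {
    // slop = 1: the first term may sit up to one position before the second.
    assert_eq!(intersection_count_with_slop(&[2, 8], &[3, 9], 1), 2);
    assert_eq!(intersection_count_with_slop(&[2, 8], &[5], 1), 0);
    assert_eq!(intersection_count_with_slop(&[4], &[5, 6], 2), 1);
}
```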

View File

@@ -1,4 +1,4 @@
use std::collections::{BTreeSet, HashMap};
use std::collections::HashMap;
use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound;
use std::str::FromStr;
@@ -7,7 +7,9 @@ use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInp
use super::logical_ast::*;
use crate::core::Index;
use crate::indexer::JsonTermWriter;
use crate::indexer::{
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
};
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery,
TermQuery,
@@ -15,20 +17,22 @@ use crate::query::{
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term, Type,
};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::Score;
use crate::{DateTime, Score};
/// Possible error that may happen when parsing a query.
#[derive(Debug, PartialEq, Eq, Error)]
pub enum QueryParserError {
/// Error in the query syntax
#[error("Syntax Error")]
SyntaxError,
#[error("Syntax Error: {0}")]
SyntaxError(String),
/// This query is unsupported.
#[error("Unsupported query: {0}")]
UnsupportedQuery(String),
/// The query references a field that is not in the schema
#[error("Field does not exists: '{0:?}'")]
#[error("Field does not exists: '{0}'")]
FieldDoesNotExist(String),
/// The query contains a term for a `u64` or `i64`-field, but the value
/// is neither.
@@ -51,23 +55,28 @@ pub enum QueryParserError {
NoDefaultFieldDeclared,
/// The field searched for is not declared
/// as indexed in the schema.
#[error("The field '{0:?}' is not declared as indexed")]
#[error("The field '{0}' is not declared as indexed")]
FieldNotIndexed(String),
/// A phrase query was requested for a field that does not
/// have any positions indexed.
#[error("The field '{0:?}' does not have positions indexed")]
#[error("The field '{0}' does not have positions indexed")]
FieldDoesNotHavePositionsIndexed(String),
/// The tokenizer for the given field is unknown
/// The two argument strings are the name of the field, the name of the tokenizer
#[error("The tokenizer '{0:?}' for the field '{1:?}' is unknown")]
UnknownTokenizer(String, String),
#[error("The tokenizer '{tokenizer:?}' for the field '{field:?}' is unknown")]
UnknownTokenizer {
/// The name of the tokenizer
tokenizer: String,
/// The field name
field: String,
},
/// The query contains a range query with a phrase as one of the bounds.
/// Only terms can be used as bounds.
#[error("A range query cannot have a phrase as one of the bounds")]
RangeMustNotHavePhrase,
/// The format for the date field is not RFC 3339 compliant.
#[error("The date field has an invalid format")]
DateFormatError(#[from] chrono::ParseError),
DateFormatError(#[from] time::error::Parse),
/// The format for the facet field is invalid.
#[error("The facet field is malformed: {0}")]
FacetFormatError(#[from] FacetParseError),
@@ -162,7 +171,7 @@ pub struct QueryParser {
conjunction_by_default: bool,
tokenizer_manager: TokenizerManager,
boost: HashMap<Field, Score>,
field_names: BTreeSet<String>,
field_names: HashMap<String, Field>,
}
fn all_negative(ast: &LogicalAst) -> bool {
@@ -175,6 +184,31 @@ fn all_negative(ast: &LogicalAst) -> bool {
}
}
// Returns the positions (in byte offsets) of the unescaped '.' characters in `field_path`.
//
// This function operates directly on bytes (as opposed to codepoints), relying
// on an encoding property of UTF-8 for its correctness.
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
let mut splitting_dots_pos = Vec::new();
let mut escape_state = false;
for (pos, b) in field_path.bytes().enumerate() {
if escape_state {
escape_state = false;
continue;
}
match b {
b'\\' => {
escape_state = true;
}
b'.' => {
splitting_dots_pos.push(pos);
}
_ => {}
}
}
splitting_dots_pos
}
impl QueryParser {
/// Creates a `QueryParser`, given
/// * schema - index Schema
@@ -186,7 +220,7 @@ impl QueryParser {
) -> QueryParser {
let field_names = schema
.fields()
.map(|(_, field_entry)| field_entry.name().to_string())
.map(|(field, field_entry)| (field_entry.name().to_string(), field))
.collect();
QueryParser {
schema,
@@ -200,25 +234,18 @@ impl QueryParser {
// Splits a full_path as written in a query, into a field name and a
// json path.
pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> (&'a str, &'a str) {
if full_path.is_empty() {
return ("", "");
pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
if let Some(field) = self.field_names.get(full_path) {
return Some((*field, ""));
}
if self.field_names.contains(full_path) {
return (full_path, "");
}
let mut result = ("", full_path);
let mut cursor = 0;
while let Some(pos) = full_path[cursor..].find('.') {
cursor += pos;
let prefix = &full_path[..cursor];
let suffix = &full_path[cursor + 1..];
if self.field_names.contains(prefix) {
result = (prefix, suffix);
let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
while let Some(pos) = splitting_period_pos.pop() {
let (prefix, suffix) = full_path.split_at(pos);
if let Some(field) = self.field_names.get(prefix) {
return Some((*field, &suffix[1..]));
}
cursor += 1;
}
result
None
}
/// Creates a `QueryParser`, given
@@ -266,17 +293,11 @@ impl QueryParser {
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAst, QueryParserError> {
let user_input_ast =
tantivy_query_grammar::parse_query(query).map_err(|_| QueryParserError::SyntaxError)?;
let user_input_ast = tantivy_query_grammar::parse_query(query)
.map_err(|_| QueryParserError::SyntaxError(query.to_string()))?;
self.compute_logical_ast(user_input_ast)
}
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
self.schema
.get_field(field_name)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
}
fn compute_logical_ast(
&self,
user_input_ast: UserInputAst,
@@ -326,11 +347,8 @@ impl QueryParser {
Ok(Term::from_field_f64(field, val))
}
FieldType::Date(_) => {
let dt = chrono::DateTime::parse_from_rfc3339(phrase)?;
Ok(Term::from_field_date(
field,
&dt.with_timezone(&chrono::Utc),
))
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
}
FieldType::Str(ref str_options) => {
let option = str_options.get_indexing_options().ok_or_else(|| {
@@ -340,11 +358,9 @@ impl QueryParser {
let text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| {
QueryParserError::UnknownTokenizer(
field_entry.name().to_string(),
option.tokenizer().to_string(),
)
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_entry.name().to_string(),
tokenizer: option.tokenizer().to_string(),
})?;
let mut terms: Vec<Term> = Vec::new();
let mut token_stream = text_analyzer.token_stream(phrase);
@@ -388,6 +404,12 @@ impl QueryParser {
if !field_type.is_indexed() {
return Err(QueryParserError::FieldNotIndexed(field_name.to_string()));
}
if field_type.value_type() != Type::Json && !json_path.is_empty() {
let field_name = self.schema.get_field_name(field);
return Err(QueryParserError::FieldDoesNotExist(format!(
"{field_name}.{json_path}"
)));
}
match *field_type {
FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
@@ -405,8 +427,8 @@ impl QueryParser {
Ok(vec![LogicalLiteral::Term(f64_term)])
}
FieldType::Date(_) => {
let dt = chrono::DateTime::parse_from_rfc3339(phrase)?;
let dt_term = Term::from_field_date(field, &dt.with_timezone(&chrono::Utc));
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
Ok(vec![LogicalLiteral::Term(dt_term)])
}
FieldType::Str(ref str_options) => {
@@ -417,11 +439,9 @@ impl QueryParser {
let text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| {
QueryParserError::UnknownTokenizer(
field_name.to_string(),
option.tokenizer().to_string(),
)
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
tokenizer: option.tokenizer().to_string(),
})?;
let index_record_option = option.index_option();
Ok(generate_literals_for_str(
@@ -442,11 +462,9 @@ impl QueryParser {
let text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| {
QueryParserError::UnknownTokenizer(
field_name.to_string(),
option.tokenizer().to_string(),
)
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_name.to_string(),
tokenizer: option.tokenizer().to_string(),
})?;
let index_record_option = option.index_option();
generate_literals_for_json_object(
@@ -533,37 +551,56 @@ impl QueryParser {
})
}
fn compute_path_triplet_for_literal<'a>(
/// Given a literal, returns the list of terms that should be searched.
///
/// The terms are identified by a triplet:
/// - tantivy field
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON
/// object by naturally extending the json field name with a "." separated field_path
/// - field_phrase: the phrase that is being searched.
///
/// The literal identifies the targeted field by a so-called *full field path*,
/// specified before the ":". (e.g. identity.username:fulmicoton).
///
/// The way we split the full field path into (field_name, field_path) can be ambiguous,
/// because field_names can contain "." themselves.
/// For instance, if a field is named `one.two` and another one is named `one`,
/// should `one.two:three` target `one.two` with field path `` or `one` with
/// the field path `two`?
///
/// In this case, tantivy just picks the solution with the longest field name.
///
/// Quirk: as a hack for quickwit, we do not split over a dot that appears escaped (`\.`).
fn compute_path_triplets_for_literal<'a>(
&self,
literal: &'a UserInputLiteral,
) -> Result<Vec<(Field, &'a str, &'a str)>, QueryParserError> {
match &literal.field_name {
Some(ref full_path) => {
// We need to add terms associated to json default fields.
let (field_name, path) = self.split_full_path(full_path);
if let Ok(field) = self.resolve_field_name(field_name) {
return Ok(vec![(field, path, literal.phrase.as_str())]);
}
let triplets: Vec<(Field, &str, &str)> = self
.default_indexed_json_fields()
.map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str()))
.collect();
if triplets.is_empty() {
return Err(QueryParserError::FieldDoesNotExist(field_name.to_string()));
}
Ok(triplets)
}
None => {
if self.default_fields.is_empty() {
return Err(QueryParserError::NoDefaultFieldDeclared);
}
Ok(self
.default_fields
.iter()
.map(|default_field| (*default_field, "", literal.phrase.as_str()))
.collect::<Vec<(Field, &str, &str)>>())
let full_path = if let Some(full_path) = &literal.field_name {
full_path
} else {
// The user did not specify any path...
// We simply target default fields.
if self.default_fields.is_empty() {
return Err(QueryParserError::NoDefaultFieldDeclared);
}
return Ok(self
.default_fields
.iter()
.map(|default_field| (*default_field, "", literal.phrase.as_str()))
.collect::<Vec<(Field, &str, &str)>>());
};
if let Some((field, path)) = self.split_full_path(full_path) {
return Ok(vec![(field, path, literal.phrase.as_str())]);
}
// We need to add terms associated to json default fields.
let triplets: Vec<(Field, &str, &str)> = self
.default_indexed_json_fields()
.map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str()))
.collect();
if triplets.is_empty() {
return Err(QueryParserError::FieldDoesNotExist(full_path.to_string()));
}
Ok(triplets)
}
fn compute_logical_ast_from_leaf(
@@ -573,7 +610,7 @@ impl QueryParser {
match leaf {
UserInputLeaf::Literal(literal) => {
let term_phrases: Vec<(Field, &str, &str)> =
self.compute_path_triplet_for_literal(&literal)?;
self.compute_path_triplets_for_literal(&literal)?;
let mut asts: Vec<LogicalAst> = Vec::new();
for (field, json_path, phrase) in term_phrases {
for ast in self.compute_logical_ast_for_leaf(field, json_path, phrase)? {
@@ -600,8 +637,9 @@ impl QueryParser {
"Range query need to target a specific field.".to_string(),
)
})?;
let (field_name, json_path) = self.split_full_path(&full_path);
let field = self.resolve_field_name(field_name)?;
let (field, json_path) = self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))?;
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
@@ -662,30 +700,6 @@ fn generate_literals_for_str(
Ok(Some(LogicalLiteral::Phrase(terms)))
}
enum NumValue {
U64(u64),
I64(i64),
F64(f64),
DateTime(crate::DateTime),
}
fn infer_type_num(phrase: &str) -> Option<NumValue> {
if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(phrase) {
let dt_utc = dt.with_timezone(&chrono::Utc);
return Some(NumValue::DateTime(dt_utc));
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
return Some(NumValue::U64(u64_val));
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
return Some(NumValue::I64(i64_val));
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
return Some(NumValue::F64(f64_val));
}
None
}
fn generate_literals_for_json_object(
field_name: &str,
field: Field,
@@ -696,38 +710,13 @@ fn generate_literals_for_json_object(
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
let mut logical_literals = Vec::new();
let mut term = Term::new();
term.set_field(Type::Json, field);
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
for segment in json_path.split('.') {
json_term_writer.push_path_segment(segment);
let mut json_term_writer =
JsonTermWriter::from_field_and_json_path(field, json_path, &mut term);
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
logical_literals.push(LogicalLiteral::Term(term));
}
if let Some(num_value) = infer_type_num(phrase) {
match num_value {
NumValue::U64(u64_val) => {
json_term_writer.set_fast_value(u64_val);
}
NumValue::I64(i64_val) => {
json_term_writer.set_fast_value(i64_val);
}
NumValue::F64(f64_val) => {
json_term_writer.set_fast_value(f64_val);
}
NumValue::DateTime(dt_val) => {
json_term_writer.set_fast_value(dt_val);
}
}
logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone()));
}
json_term_writer.close_path_and_set_type(Type::Str);
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, text_analyzer);
drop(json_term_writer);
let term_num_bytes = term.as_slice().len();
let mut token_stream = text_analyzer.token_stream(phrase);
let mut terms: Vec<(usize, Term)> = Vec::new();
token_stream.process(&mut |token| {
term.truncate(term_num_bytes);
term.append_bytes(token.text.as_bytes());
terms.push((token.position, term.clone()));
});
if terms.len() <= 1 {
for (_, term) in terms {
logical_literals.push(LogicalLiteral::Term(term));
@@ -1040,6 +1029,7 @@ mod test {
#[test]
fn test_json_field_possibly_a_date() {
// Subseconds are discarded
test_parse_query_to_logical_ast_helper(
r#"json.date:"2019-10-12T07:20:50.52Z""#,
r#"(Term(type=Json, field=14, path=date, vtype=Date, 2019-10-12T07:20:50Z) "[(0, Term(type=Json, field=14, path=date, vtype=Str, "2019")), (1, Term(type=Json, field=14, path=date, vtype=Str, "10")), (2, Term(type=Json, field=14, path=date, vtype=Str, "12t07")), (3, Term(type=Json, field=14, path=date, vtype=Str, "20")), (4, Term(type=Json, field=14, path=date, vtype=Str, "50")), (5, Term(type=Json, field=14, path=date, vtype=Str, "52z"))]")"#,
@@ -1221,9 +1211,11 @@ mod test {
#[test]
pub fn test_query_parser_field_does_not_exist() {
let query_parser = make_query_parser();
assert_matches!(
query_parser.parse_query("boujou:\"18446744073709551615\""),
Err(QueryParserError::FieldDoesNotExist(_))
assert_eq!(
query_parser
.parse_query("boujou:\"18446744073709551615\"")
.unwrap_err(),
QueryParserError::FieldDoesNotExist("boujou".to_string())
);
}
@@ -1248,9 +1240,10 @@ mod test {
let default_fields = vec![title];
let tokenizer_manager = TokenizerManager::default();
let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
assert_matches!(
query_parser.parse_query("title:\"happy tax payer\""),
Err(QueryParserError::UnknownTokenizer(_, _))
Err(QueryParserError::UnknownTokenizer { .. })
);
}
@@ -1397,29 +1390,56 @@ mod test {
}
}
#[test]
fn test_escaped_field() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field(r#"a\.b"#, STRING);
let schema = schema_builder.build();
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
assert_eq!(
format!("{:?}", query),
"TermQuery(Term(type=Str, field=0, \"hello\"))"
);
}
#[test]
fn test_split_full_path() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("second", STRING);
schema_builder.add_text_field("first", STRING);
schema_builder.add_text_field("first.toto", STRING);
schema_builder.add_text_field("first.toto.titi", STRING);
schema_builder.add_text_field("third.a.b.c", STRING);
let schema = schema_builder.build();
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
let query_parser =
QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
assert_eq!(
query_parser.split_full_path("first.toto"),
("first.toto", "")
Some((schema.get_field("first.toto").unwrap(), ""))
);
assert_eq!(
query_parser.split_full_path("first.toto.bubu"),
Some((schema.get_field("first.toto").unwrap(), "bubu"))
);
assert_eq!(
query_parser.split_full_path("first.toto.titi"),
Some((schema.get_field("first.toto.titi").unwrap(), ""))
);
assert_eq!(
query_parser.split_full_path("first.titi"),
("first", "titi")
Some((schema.get_field("first").unwrap(), "titi"))
);
assert_eq!(query_parser.split_full_path("third"), ("", "third"));
assert_eq!(
query_parser.split_full_path("hello.toto"),
("", "hello.toto")
);
assert_eq!(query_parser.split_full_path(""), ("", ""));
assert_eq!(query_parser.split_full_path("firsty"), ("", "firsty"));
assert_eq!(query_parser.split_full_path("third"), None);
assert_eq!(query_parser.split_full_path("hello.toto"), None);
assert_eq!(query_parser.split_full_path(""), None);
assert_eq!(query_parser.split_full_path("firsty"), None);
}
#[test]
fn test_locate_splitting_dots() {
assert_eq!(&super::locate_splitting_dots("a.b.c"), &[1, 3]);
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
}
}
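The new split logic can be exercised in isolation. The sketch below stands outside tantivy and uses a plain `HashMap` of hypothetical field names in place of the schema; it mirrors `locate_splitting_dots` and the longest-field-name resolution shown above:

```rust
use std::collections::HashMap;

// Unescaped dots are candidate split points; `\.` is part of the field name.
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
    let mut splitting_dots_pos = Vec::new();
    let mut escape_state = false;
    for (pos, b) in field_path.bytes().enumerate() {
        if escape_state {
            escape_state = false;
            continue;
        }
        match b {
            b'\\' => escape_state = true,
            b'.' => splitting_dots_pos.push(pos),
            _ => {}
        }
    }
    splitting_dots_pos
}

// Try the full path first, then the longest field-name prefix.
fn split_full_path<'a>(fields: &HashMap<String, u32>, full_path: &'a str) -> Option<(u32, &'a str)> {
    if let Some(field) = fields.get(full_path) {
        return Some((*field, ""));
    }
    let mut dots = locate_splitting_dots(full_path);
    while let Some(pos) = dots.pop() {
        let (prefix, suffix) = full_path.split_at(pos);
        if let Some(field) = fields.get(prefix) {
            return Some((*field, &suffix[1..]));
        }
    }
    None
}

fn main() {
    let fields: HashMap<String, u32> =
        [("attributes".to_string(), 0), ("attributes.color".to_string(), 1)]
            .into_iter()
            .collect();
    assert_eq!(split_full_path(&fields, "attributes.color"), Some((1, "")));
    assert_eq!(split_full_path(&fields, "attributes.color.rgb"), Some((1, "rgb")));
    assert_eq!(split_full_path(&fields, "attributes.size"), Some((0, "size")));
    assert_eq!(split_full_path(&fields, "unknown.path"), None);
}
```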

View File

@@ -125,7 +125,6 @@ impl Scorer for TermScorer {
#[cfg(test)]
mod tests {
use futures::executor::block_on;
use proptest::prelude::*;
use crate::merge_policy::NoMergePolicy;
@@ -321,9 +320,7 @@ mod tests {
.collect();
test_block_wand_aux(&term_query, &searcher)?;
}
{
let _ = block_on(writer.merge(&segment_ids[..]));
}
writer.merge(&segment_ids[..]).wait().unwrap();
{
reader.reload()?;
let searcher = reader.searcher();

View File

@@ -2,7 +2,7 @@ use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use crossbeam::channel::{unbounded, Receiver, RecvError, Sender};
use crossbeam_channel::{unbounded, Receiver, RecvError, Sender};
pub struct GenerationItem<T> {
generation: usize,
@@ -197,7 +197,7 @@ mod tests {
use std::{iter, mem};
use crossbeam::channel;
use crossbeam_channel as channel;
use super::{Pool, Queue};

View File

@@ -147,7 +147,7 @@ impl WarmingStateInner {
/// Every [GC_INTERVAL] attempt to GC, with panics caught and logged using
/// [std::panic::catch_unwind].
fn gc_loop(inner: Weak<Mutex<WarmingStateInner>>) {
for _ in crossbeam::channel::tick(GC_INTERVAL) {
for _ in crossbeam_channel::tick(GC_INTERVAL) {
if let Some(inner) = inner.upgrade() {
// rely on deterministic gc in tests
#[cfg(not(test))]

View File

@@ -110,7 +110,7 @@ impl Document {
self.add_field_value(field, value);
}
/// Add a date field
/// Add a date field with unspecified time zone offset
pub fn add_date(&mut self, field: Field, value: DateTime) {
self.add_field_value(field, value);
}
@@ -213,6 +213,8 @@ impl BinarySerializable for Document {
#[cfg(test)]
mod tests {
use common::BinarySerializable;
use crate::schema::*;
#[test]
@@ -223,4 +225,22 @@ mod tests {
doc.add_text(text_field, "My title");
assert_eq!(doc.field_values().len(), 1);
}
#[test]
fn test_doc_serialization_issue() {
let mut doc = Document::default();
doc.add_json_object(
Field::from_field_id(0),
serde_json::json!({"key": 2u64})
.as_object()
.unwrap()
.clone(),
);
doc.add_text(Field::from_field_id(1), "hello");
assert_eq!(doc.field_values().len(), 2);
let mut payload: Vec<u8> = Vec::new();
doc.serialize(&mut payload).unwrap();
assert_eq!(payload.len(), 26);
Document::deserialize(&mut &payload[..]).unwrap();
}
}
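The `test_doc_serialization_issue` test above exercises the underlying serde_json behaviour. Here is a small sketch outside tantivy, assuming `serde` and `serde_json` as dependencies, of why an explicit `Deserializer` is needed when more payload follows the JSON value:

```rust
fn main() -> serde_json::Result<()> {
    let payload: &[u8] = br#"{"key":2} trailing bytes"#;

    // The convenience function insists on EOF after the value, so it rejects
    // the trailing data.
    assert!(
        serde_json::from_reader::<_, serde_json::Map<String, serde_json::Value>>(payload).is_err()
    );

    // Driving a Deserializer manually reads exactly one value and stops.
    let mut de = serde_json::Deserializer::from_reader(payload);
    let map =
        <serde_json::Map<String, serde_json::Value> as serde::Deserialize>::deserialize(&mut de)?;
    assert_eq!(map.get("key"), Some(&serde_json::json!(2u64)));
    Ok(())
}
```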

View File

@@ -93,13 +93,7 @@ impl FieldEntry {
/// Returns true if the field is a int (signed or unsigned) fast field
pub fn is_fast(&self) -> bool {
match self.field_type {
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::Date(ref options)
| FieldType::F64(ref options) => options.is_fast(),
_ => false,
}
self.field_type.is_fast()
}
/// Returns true if the field is stored
@@ -144,7 +138,8 @@ mod tests {
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false
"stored": false,
"fast": false
}
}"#;
let field_value_json = serde_json::to_string_pretty(&field_value).unwrap();

View File

@@ -1,4 +1,3 @@
use chrono::{FixedOffset, Utc};
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
use thiserror::Error;
@@ -9,7 +8,10 @@ use crate::schema::{
Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing, TextOptions,
Value,
};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
/// Possible error that may occur while parsing a field value
/// At this point the JSON is known to be valid.
@@ -183,7 +185,21 @@ impl FieldType {
}
}
/// returns true if the field is normed.
/// returns true if the field is fast.
pub fn is_fast(&self) -> bool {
match *self {
FieldType::Bytes(ref bytes_options) => bytes_options.is_fast(),
FieldType::Str(ref text_options) => text_options.is_fast(),
FieldType::U64(ref int_options)
| FieldType::I64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Date(ref int_options) => int_options.get_fastfield_cardinality().is_some(),
FieldType::Facet(_) => true,
FieldType::JsonObject(_) => false,
}
}
/// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
pub fn has_fieldnorms(&self) -> bool {
match *self {
FieldType::Str(ref text_options) => text_options
@@ -244,33 +260,33 @@ impl FieldType {
/// target field is a `Str`, this method will return an Error.
pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> {
match json {
JsonValue::String(field_text) => match *self {
FieldType::Date(_) => {
let dt_with_fixed_tz: chrono::DateTime<FixedOffset> =
chrono::DateTime::parse_from_rfc3339(&field_text).map_err(|_err| {
ValueParsingError::TypeError {
JsonValue::String(field_text) => {
match *self {
FieldType::Date(_) => {
let dt_with_fixed_tz = OffsetDateTime::parse(&field_text, &Rfc3339)
.map_err(|_err| ValueParsingError::TypeError {
expected: "rfc3339 format",
json: JsonValue::String(field_text),
}
})?;
Ok(Value::Date(dt_with_fixed_tz.with_timezone(&Utc)))
}
FieldType::Str(_) => Ok(Value::Str(field_text)),
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
Err(ValueParsingError::TypeError {
expected: "an integer",
})?;
Ok(DateTime::from_utc(dt_with_fixed_tz).into())
}
FieldType::Str(_) => Ok(Value::Str(field_text)),
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
Err(ValueParsingError::TypeError {
expected: "an integer",
json: JsonValue::String(field_text),
})
}
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
FieldType::Bytes(_) => base64::decode(&field_text)
.map(Value::Bytes)
.map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
expected: "a json object",
json: JsonValue::String(field_text),
})
}),
}
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
FieldType::Bytes(_) => base64::decode(&field_text)
.map(Value::Bytes)
.map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
expected: "a json object",
json: JsonValue::String(field_text),
}),
},
}
JsonValue::Number(field_val_num) => match self {
FieldType::I64(_) | FieldType::Date(_) => {
if let Some(field_val_i64) = field_val_num.as_i64() {
@@ -342,12 +358,12 @@ impl FieldType {
#[cfg(test)]
mod tests {
use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc};
use serde_json::json;
use super::FieldType;
use crate::schema::field_type::ValueParsingError;
use crate::schema::{Schema, TextOptions, Type, Value, INDEXED};
use crate::time::{Date, Month, PrimitiveDateTime, Time};
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{DateTime, Document};
@@ -359,7 +375,8 @@ mod tests {
let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
let doc = schema.parse_document(doc_json).unwrap();
let date = doc.get_first(date_field).unwrap();
assert_eq!(format!("{:?}", date), "Date(2019-10-12T05:20:50.520Z)");
// Time zone is converted to UTC and subseconds are discarded
assert_eq!("Date(2019-10-12T05:20:50Z)", format!("{:?}", date));
}
#[test]
@@ -368,12 +385,12 @@ mod tests {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", INDEXED);
let schema = schema_builder.build();
let naive_date = NaiveDate::from_ymd(1982, 9, 17);
let naive_time = NaiveTime::from_hms(13, 20, 00);
let date_time = DateTime::from_utc(NaiveDateTime::new(naive_date, naive_time), Utc);
doc.add_date(date_field, date_time);
let naive_date = Date::from_calendar_date(1982, Month::September, 17).unwrap();
let naive_time = Time::from_hms(13, 20, 0).unwrap();
let date_time = PrimitiveDateTime::new(naive_date, naive_time);
doc.add_date(date_field, DateTime::from_primitive(date_time));
let doc_json = schema.to_json(&doc);
assert_eq!(doc_json, r#"{"date":["1982-09-17T13:20:00+00:00"]}"#);
assert_eq!(doc_json, r#"{"date":["1982-09-17T13:20:00Z"]}"#);
}
#[test]

View File

@@ -29,6 +29,12 @@ pub enum IndexRecordOption {
WithFreqsAndPositions,
}
impl Default for IndexRecordOption {
fn default() -> Self {
IndexRecordOption::Basic
}
}
impl IndexRecordOption {
/// Returns true if this option includes encoding
/// term frequencies.

View File

@@ -34,6 +34,20 @@ impl JsonObjectOptions {
pub fn get_text_indexing_options(&self) -> Option<&TextFieldIndexing> {
self.indexing.as_ref()
}
/// Sets the field as stored
#[must_use]
pub fn set_stored(mut self) -> Self {
self.stored = true;
self
}
/// Sets the field as indexed, with the specific indexing options.
#[must_use]
pub fn set_indexing_options(mut self, indexing: TextFieldIndexing) -> Self {
self.indexing = Some(indexing);
self
}
}
impl From<StoredFlag> for JsonObjectOptions {
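With the two setters above, a JSON field can be declared stored and indexed without going through the flag conversions. A short sketch, assuming `JsonObjectOptions` implements `Default` (its flag-based construction suggests it does) and an illustrative field name:

```rust
use tantivy::schema::{JsonObjectOptions, Schema, TextFieldIndexing};

fn main() {
    // Stored + indexed JSON field using the new setters.
    let json_options = JsonObjectOptions::default()
        .set_stored()
        .set_indexing_options(TextFieldIndexing::default());

    let mut schema_builder = Schema::builder();
    schema_builder.add_json_field("attributes", json_options);
    let _schema = schema_builder.build();
}
```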

View File

@@ -417,6 +417,7 @@ mod tests {
use std::collections::BTreeMap;
use matches::{assert_matches, matches};
use pretty_assertions::assert_eq;
use serde_json;
use crate::schema::field_type::ValueParsingError;
@@ -469,7 +470,8 @@ mod tests {
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false
"stored": false,
"fast": false
}
},
{
@@ -481,7 +483,8 @@ mod tests {
"fieldnorms": false,
"tokenizer": "raw"
},
"stored": false
"stored": false,
"fast": false
}
},
{
@@ -784,7 +787,8 @@ mod tests {
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false
"stored": false,
"fast": false
}
},
{
@@ -816,7 +820,8 @@ mod tests {
"fieldnorms": true,
"tokenizer": "raw"
},
"stored": true
"stored": true,
"fast": false
}
},
{
@@ -838,7 +843,8 @@ mod tests {
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false
"stored": false,
"fast": false
}
},
{

View File

@@ -70,8 +70,8 @@ impl Term {
}
/// Builds a term given a field, and a DateTime value
pub fn from_field_date(field: Field, val: &DateTime) -> Term {
Term::from_fast_value(field, val)
pub fn from_field_date(field: Field, val: DateTime) -> Term {
Term::from_fast_value(field, &val)
}
/// Creates a `Term` given a facet.
@@ -126,7 +126,7 @@ impl Term {
}
/// Sets a `i64` value in the term.
pub fn set_date(&mut self, date: crate::DateTime) {
pub fn set_date(&mut self, date: DateTime) {
self.set_fast_value(date);
}
@@ -266,8 +266,8 @@ where B: AsRef<[u8]>
///
/// Returns None if the term is not of the Date type, or if the term byte representation
/// is invalid.
pub fn as_date(&self) -> Option<crate::DateTime> {
self.get_fast_type::<crate::DateTime>()
pub fn as_date(&self) -> Option<DateTime> {
self.get_fast_type::<DateTime>()
}
/// Returns the text associated with the term.
@@ -374,7 +374,7 @@ fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Re
}
// TODO pretty print these types too.
Type::Date => {
write_opt(f, get_fast_type::<crate::DateTime>(bytes))?;
write_opt(f, get_fast_type::<DateTime>(bytes))?;
}
Type::Facet => {
let facet_str = str::from_utf8(bytes)

View File

@@ -3,14 +3,20 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::flags::FastFlag;
use crate::schema::flags::{SchemaFlagList, StoredFlag};
use crate::schema::IndexRecordOption;
/// Define how a text field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)]
pub struct TextOptions {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
indexing: Option<TextFieldIndexing>,
#[serde(default)]
stored: bool,
#[serde(default)]
fast: bool,
}
impl TextOptions {
@@ -24,6 +30,30 @@ impl TextOptions {
self.stored
}
/// Returns true iff the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast
}
/// Set the field as a fast field.
///
/// Fast fields are designed for random access.
/// Access times are similar to a random lookup in an array.
/// Text fast fields will have the term ids stored in the fast field.
/// The fast field will be a multivalued fast field.
///
/// The effective cardinality depends on the tokenizer. When creating fast fields on text
/// fields it is recommended to use the "raw" tokenizer, since it will store the original text
/// unchanged. The "default" tokenizer will store the terms lowercased, and this will be
/// reflected in the dictionary.
///
/// The original text can be retrieved via `ord_to_term` from the dictionary.
#[must_use]
pub fn set_fast(mut self) -> TextOptions {
self.fast = true;
self
}
/// Sets the field as stored
#[must_use]
pub fn set_stored(mut self) -> TextOptions {
@@ -39,26 +69,60 @@ impl TextOptions {
}
}
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
struct TokenizerName(Cow<'static, str>);
const DEFAULT_TOKENIZER_NAME: &str = "default";
const NO_TOKENIZER_NAME: &str = "raw";
impl Default for TokenizerName {
fn default() -> Self {
TokenizerName::from_static(DEFAULT_TOKENIZER_NAME)
}
}
impl TokenizerName {
const fn from_static(name: &'static str) -> Self {
TokenizerName(Cow::Borrowed(name))
}
fn from_name(name: &str) -> Self {
TokenizerName(Cow::Owned(name.to_string()))
}
fn name(&self) -> &str {
&self.0
}
}
/// Configuration defining indexing for a text field.
///
/// It defines
/// - the amount of information that should be stored about the presence of a term in a document.
/// - The amount of information that should be stored about the presence of a term in a document.
/// Essentially, should we store the term frequency and/or the positions (See
/// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
/// - the name of the `Tokenizer` that should be used to process the field.
/// - The name of the `Tokenizer` that should be used to process the field.
/// - Flag indicating, if fieldnorms should be stored (See [fieldnorm](crate::fieldnorm)). Defaults
/// to `true`.
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
pub struct TextFieldIndexing {
#[serde(default)]
record: IndexRecordOption,
#[serde(default = "default_fieldnorms")]
fieldnorms: bool,
tokenizer: Cow<'static, str>,
#[serde(default)]
tokenizer: TokenizerName,
}
pub(crate) fn default_fieldnorms() -> bool {
true
}
impl Default for TextFieldIndexing {
fn default() -> TextFieldIndexing {
TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),
record: IndexRecordOption::Basic,
fieldnorms: true,
tokenizer: TokenizerName::default(),
record: IndexRecordOption::default(),
fieldnorms: default_fieldnorms(),
}
}
}
@@ -67,13 +131,13 @@ impl TextFieldIndexing {
/// Sets the tokenizer to be used for a given field.
#[must_use]
pub fn set_tokenizer(mut self, tokenizer_name: &str) -> TextFieldIndexing {
self.tokenizer = Cow::Owned(tokenizer_name.to_string());
self.tokenizer = TokenizerName::from_name(tokenizer_name);
self
}
/// Returns the tokenizer that will be used for this field.
pub fn tokenizer(&self) -> &str {
&self.tokenizer
self.tokenizer.name()
}
/// Sets fieldnorms
@@ -83,7 +147,7 @@ impl TextFieldIndexing {
self
}
/// Returns true if and only if fieldnorms are stored.
/// Returns true if and only if [fieldnorms](crate::fieldnorm) are stored.
pub fn fieldnorms(&self) -> bool {
self.fieldnorms
}
@@ -108,21 +172,23 @@ impl TextFieldIndexing {
/// The field will be untokenized and indexed.
pub const STRING: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("raw"),
tokenizer: TokenizerName::from_static(NO_TOKENIZER_NAME),
fieldnorms: true,
record: IndexRecordOption::Basic,
}),
stored: false,
fast: false,
};
/// The field will be tokenized and indexed.
pub const TEXT: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: Cow::Borrowed("default"),
tokenizer: TokenizerName::from_static(DEFAULT_TOKENIZER_NAME),
fieldnorms: true,
record: IndexRecordOption::WithFreqsAndPositions,
}),
stored: false,
fast: false,
};
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -133,6 +199,7 @@ impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
TextOptions {
indexing: self.indexing.or(other.indexing),
stored: self.stored | other.stored,
fast: self.fast | other.fast,
}
}
}
@@ -148,6 +215,17 @@ impl From<StoredFlag> for TextOptions {
TextOptions {
indexing: None,
stored: true,
fast: false,
}
}
}
impl From<FastFlag> for TextOptions {
fn from(_: FastFlag) -> TextOptions {
TextOptions {
indexing: None,
stored: false,
fast: true,
}
}
}
@@ -187,4 +265,24 @@ mod tests {
assert!(IndexRecordOption::WithFreqsAndPositions > IndexRecordOption::WithFreqs);
assert!(IndexRecordOption::WithFreqs > IndexRecordOption::Basic);
}
#[test]
fn serde_default_test() {
let json = r#"
{
"indexing": {
"record": "basic",
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false
}
"#;
let options: TextOptions = serde_json::from_str(json).unwrap();
let options2: TextOptions = serde_json::from_str("{\"indexing\": {}}").unwrap();
assert_eq!(options, options2);
assert_eq!(options.indexing.unwrap().record, IndexRecordOption::Basic);
let options3: TextOptions = serde_json::from_str("{}").unwrap();
assert_eq!(options3.indexing, None);
}
}
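Putting the new `set_fast` option together with the recommendation about the "raw" tokenizer, a schema declaration might look like the sketch below (the field name is illustrative):

```rust
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

fn main() {
    // A text fast field: the "raw" tokenizer keeps the original strings, so
    // the dictionary can hand them back unchanged via `ord_to_term`.
    let options = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("raw")
                .set_index_option(IndexRecordOption::Basic),
        )
        .set_fast()
        .set_stored();

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("category", options);
    let _schema = schema_builder.build();
}
```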

View File

@@ -22,7 +22,7 @@ pub enum Value {
I64(i64),
/// 64-bits Float `f64`
F64(f64),
/// Signed 64-bits Date time stamp `date`
/// Date/time with second precision
Date(DateTime),
/// Facet
Facet(Facet),
@@ -43,7 +43,7 @@ impl Serialize for Value {
Value::U64(u) => serializer.serialize_u64(u),
Value::I64(u) => serializer.serialize_i64(u),
Value::F64(u) => serializer.serialize_f64(u),
Value::Date(ref date) => serializer.serialize_str(&date.to_rfc3339()),
Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer),
Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
Value::JsonObject(ref obj) => obj.serialize(serializer),
@@ -154,9 +154,9 @@ impl Value {
/// Returns the Date-value, provided the value is of the `Date` type.
///
/// Returns None if the value is not of type `Date`.
pub fn as_date(&self) -> Option<&DateTime> {
pub fn as_date(&self) -> Option<DateTime> {
if let Value::Date(date) = self {
Some(date)
Some(*date)
} else {
None
}
@@ -209,9 +209,9 @@ impl From<f64> for Value {
}
}
impl From<crate::DateTime> for Value {
fn from(date_time: crate::DateTime) -> Value {
Value::Date(date_time)
impl From<DateTime> for Value {
fn from(dt: DateTime) -> Value {
Value::Date(dt)
}
}
@@ -265,12 +265,12 @@ impl From<serde_json::Value> for Value {
mod binary_serialize {
use std::io::{self, Read, Write};
use chrono::{TimeZone, Utc};
use common::{f64_to_u64, u64_to_f64, BinarySerializable};
use super::Value;
use crate::schema::Facet;
use crate::tokenizer::PreTokenizedString;
use crate::DateTime;
const TEXT_CODE: u8 = 0;
const U64_CODE: u8 = 1;
@@ -319,7 +319,8 @@ mod binary_serialize {
}
Value::Date(ref val) => {
DATE_CODE.serialize(writer)?;
val.timestamp().serialize(writer)
let DateTime { unix_timestamp } = val;
unix_timestamp.serialize(writer)
}
Value::Facet(ref facet) => {
HIERARCHICAL_FACET_CODE.serialize(writer)?;
@@ -357,8 +358,8 @@ mod binary_serialize {
Ok(Value::F64(value))
}
DATE_CODE => {
let timestamp = i64::deserialize(reader)?;
Ok(Value::Date(Utc.timestamp(timestamp, 0)))
let unix_timestamp = i64::deserialize(reader)?;
Ok(Value::Date(DateTime::from_unix_timestamp(unix_timestamp)))
}
HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),
@@ -387,8 +388,16 @@ mod binary_serialize {
}
}
JSON_OBJ_CODE => {
let map = serde_json::from_reader(reader)?;
Ok(Value::JsonObject(map))
// As explained in
// https://docs.serde.rs/serde_json/fn.from_reader.html
//
// `T::from_reader(..)` expects EOF after reading the object,
// which is not what we want here.
//
// For this reason we need to create our own `Deserializer`.
let mut de = serde_json::Deserializer::from_reader(reader);
let json_map = <serde_json::Map::<String, serde_json::Value> as serde::Deserialize>::deserialize(&mut de)?;
Ok(Value::JsonObject(json_map))
}
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
@@ -401,15 +410,24 @@ mod binary_serialize {
#[cfg(test)]
mod tests {
use std::str::FromStr;
use super::Value;
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::DateTime;
#[test]
fn test_serialize_date() {
let value = Value::Date(DateTime::from_str("1996-12-20T00:39:57+00:00").unwrap());
let value = Value::from(DateTime::from_utc(
OffsetDateTime::parse("1996-12-20T00:39:57+00:00", &Rfc3339).unwrap(),
));
let serialized_value_json = serde_json::to_string_pretty(&value).unwrap();
assert_eq!(serialized_value_json, r#""1996-12-20T00:39:57+00:00""#);
assert_eq!(serialized_value_json, r#""1996-12-20T00:39:57Z""#);
let value = Value::from(DateTime::from_utc(
OffsetDateTime::parse("1996-12-20T00:39:57-01:00", &Rfc3339).unwrap(),
));
let serialized_value_json = serde_json::to_string_pretty(&value).unwrap();
// The time zone offset is lost in the conversion to `Value::Date`:
// the value is implicitly normalized to UTC.
assert_eq!(serialized_value_json, r#""1996-12-20T01:39:57Z""#);
}
}
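The `DateTime` surface after the switch from `chrono` to `time` can be summarized in a short sketch. It assumes tantivy re-exports the `time` crate as `tantivy::time`, which is what the `crate::time` paths in these diffs suggest:

```rust
use tantivy::schema::{Schema, INDEXED};
use tantivy::time::format_description::well_known::Rfc3339;
use tantivy::time::OffsetDateTime;
use tantivy::{DateTime, Document};

fn main() {
    let mut schema_builder = Schema::builder();
    let date_field = schema_builder.add_date_field("date", INDEXED);
    let _schema = schema_builder.build();

    // Offsets are folded into UTC, and values are kept with second precision.
    let odt = OffsetDateTime::parse("1996-12-20T00:39:57-01:00", &Rfc3339).unwrap();
    let ts = DateTime::from_utc(odt).into_unix_timestamp();
    let dt = DateTime::from_unix_timestamp(ts);

    let mut doc = Document::default();
    doc.add_date(date_field, dt);
    assert_eq!(doc.field_values().len(), 1);
}
```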

View File

@@ -0,0 +1,50 @@
use std::io;
use zstd::bulk::{compress_to_buffer, decompress_to_buffer};
use zstd::DEFAULT_COMPRESSION_LEVEL;
#[inline]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
let count_size = std::mem::size_of::<u32>();
let max_size = zstd::zstd_safe::compress_bound(uncompressed.len()) + count_size;
compressed.clear();
compressed.resize(max_size, 0);
let compressed_size = compress_to_buffer(
uncompressed,
&mut compressed[count_size..],
DEFAULT_COMPRESSION_LEVEL,
)?;
compressed[0..count_size].copy_from_slice(&(uncompressed.len() as u32).to_le_bytes());
compressed.resize(compressed_size + count_size, 0);
Ok(())
}
#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
let count_size = std::mem::size_of::<u32>();
let uncompressed_size = u32::from_le_bytes(
compressed
.get(..count_size)
.ok_or(io::ErrorKind::InvalidData)?
.try_into()
.unwrap(),
) as usize;
decompressed.clear();
decompressed.resize(uncompressed_size, 0);
let decompressed_size = decompress_to_buffer(&compressed[count_size..], decompressed)?;
if decompressed_size != uncompressed_size {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"doc store block not completely decompressed, data corruption".to_string(),
));
}
Ok(())
}
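The framing above is easy to reproduce outside the doc store. A sketch using the `zstd` crate's bulk API (the same dependency pulled in by the `zstd-compression` feature); the helper names here are hypothetical:

```rust
use std::convert::TryInto;
use std::io;

use zstd::DEFAULT_COMPRESSION_LEVEL;

// The uncompressed length is prepended as a little-endian u32 so the reader
// can size its output buffer before decompressing.
fn compress_block(uncompressed: &[u8]) -> io::Result<Vec<u8>> {
    let mut block = (uncompressed.len() as u32).to_le_bytes().to_vec();
    block.extend(zstd::bulk::compress(uncompressed, DEFAULT_COMPRESSION_LEVEL)?);
    Ok(block)
}

fn decompress_block(block: &[u8]) -> io::Result<Vec<u8>> {
    let (len_bytes, body) = block.split_at(std::mem::size_of::<u32>());
    let uncompressed_len = u32::from_le_bytes(len_bytes.try_into().unwrap()) as usize;
    zstd::bulk::decompress(body, uncompressed_len)
}

fn main() -> io::Result<()> {
    let doc = b"lorem ipsum dolor sit amet ".repeat(1_000);
    let block = compress_block(&doc)?;
    assert!(block.len() < doc.len());
    assert_eq!(decompress_block(&block)?, doc);
    Ok(())
}
```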

View File

@@ -26,6 +26,9 @@ pub enum Compressor {
#[serde(rename = "snappy")]
/// Use the snap compressor
Snappy,
#[serde(rename = "zstd")]
/// Use the zstd compressor
Zstd,
}
impl Default for Compressor {
@@ -36,6 +39,8 @@ impl Default for Compressor {
Compressor::Brotli
} else if cfg!(feature = "snappy-compression") {
Compressor::Snappy
} else if cfg!(feature = "zstd-compression") {
Compressor::Zstd
} else {
Compressor::None
}
@@ -49,6 +54,7 @@ impl Compressor {
1 => Compressor::Lz4,
2 => Compressor::Brotli,
3 => Compressor::Snappy,
4 => Compressor::Zstd,
_ => panic!("unknown compressor id {:?}", id),
}
}
@@ -58,6 +64,7 @@ impl Compressor {
Self::Lz4 => 1,
Self::Brotli => 2,
Self::Snappy => 3,
Self::Zstd => 4,
}
}
#[inline]
@@ -98,6 +105,16 @@ impl Compressor {
panic!("snappy-compression feature flag not activated");
}
}
Self::Zstd => {
#[cfg(feature = "zstd-compression")]
{
super::compression_zstd_block::compress(uncompressed, compressed)
}
#[cfg(not(feature = "zstd-compression"))]
{
panic!("zstd-compression feature flag not activated");
}
}
}
}
@@ -143,6 +160,16 @@ impl Compressor {
panic!("snappy-compression feature flag not activated");
}
}
Self::Zstd => {
#[cfg(feature = "zstd-compression")]
{
super::compression_zstd_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "zstd-compression"))]
{
panic!("zstd-compression feature flag not activated");
}
}
}
}
}
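To actually select the new compressor and block size, the settings go through `IndexSettings`. A hedged sketch, assuming the corresponding 0.18 fields are named `docstore_compression` and `docstore_blocksize` and that the `zstd-compression` feature is enabled; adjust the names if your version differs:

```rust
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::store::Compressor;
use tantivy::{Index, IndexSettings};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();

    // Illustrative settings: zstd doc store compression with 32 KiB blocks.
    let settings = IndexSettings {
        docstore_compression: Compressor::Zstd,
        docstore_blocksize: 32_768,
        ..IndexSettings::default()
    };
    let index = Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;
    let _ = index;
    Ok(())
}
```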

View File

@@ -41,7 +41,6 @@ mod tests {
use std::io;
use futures::executor::block_on;
use proptest::strategy::{BoxedStrategy, Strategy};
use super::{SkipIndex, SkipIndexBuilder};
@@ -145,7 +144,7 @@ mod tests {
index_writer.delete_term(Term::from_field_text(text, "testb"));
index_writer.commit()?;
let segment_ids = index.searchable_segment_ids()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 30);

View File

@@ -50,13 +50,14 @@ mod compression_brotli;
#[cfg(feature = "snappy-compression")]
mod compression_snap;
#[cfg(feature = "zstd-compression")]
mod compression_zstd_block;
#[cfg(test)]
pub mod tests {
use std::path::Path;
use futures::executor::block_on;
use super::*;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::AliveBitSet;
@@ -71,10 +72,13 @@ pub mod tests {
sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt \
mollit anim id est laborum.";
const BLOCK_SIZE: usize = 16_384;
pub fn write_lorem_ipsum_store(
writer: WritePtr,
num_docs: usize,
compressor: Compressor,
blocksize: usize,
) -> Schema {
let mut schema_builder = Schema::builder();
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
@@ -82,7 +86,7 @@ pub mod tests {
schema_builder.add_text_field("title", TextOptions::default().set_stored());
let schema = schema_builder.build();
{
let mut store_writer = StoreWriter::new(writer, compressor);
let mut store_writer = StoreWriter::new(writer, compressor, blocksize);
for i in 0..num_docs {
let mut doc = Document::default();
doc.add_field_value(field_body, LOREM.to_string());
@@ -105,7 +109,7 @@ pub mod tests {
let path = Path::new("store");
let directory = RamDirectory::create();
let store_wrt = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4);
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4, BLOCK_SIZE);
let field_title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;
@@ -141,11 +145,11 @@ pub mod tests {
Ok(())
}
fn test_store(compressor: Compressor) -> crate::Result<()> {
fn test_store(compressor: Compressor, blocksize: usize) -> crate::Result<()> {
let path = Path::new("store");
let directory = RamDirectory::create();
let store_wrt = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor);
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor, blocksize);
let field_title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;
@@ -171,22 +175,28 @@ pub mod tests {
#[test]
fn test_store_noop() -> crate::Result<()> {
test_store(Compressor::None)
test_store(Compressor::None, BLOCK_SIZE)
}
#[cfg(feature = "lz4-compression")]
#[test]
fn test_store_lz4_block() -> crate::Result<()> {
test_store(Compressor::Lz4)
test_store(Compressor::Lz4, BLOCK_SIZE)
}
#[cfg(feature = "snappy-compression")]
#[test]
fn test_store_snap() -> crate::Result<()> {
test_store(Compressor::Snappy)
test_store(Compressor::Snappy, BLOCK_SIZE)
}
#[cfg(feature = "brotli-compression")]
#[test]
fn test_store_brotli() -> crate::Result<()> {
test_store(Compressor::Brotli)
test_store(Compressor::Brotli, BLOCK_SIZE)
}
#[cfg(feature = "zstd-compression")]
#[test]
fn test_store_zstd() -> crate::Result<()> {
test_store(Compressor::Zstd, BLOCK_SIZE)
}
#[test]
@@ -269,7 +279,7 @@ pub mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.merge(&segment_ids).wait().is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
@@ -316,7 +326,7 @@ pub mod tests {
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -350,6 +360,7 @@ mod bench {
directory.open_write(path).unwrap(),
1_000,
Compressor::default(),
16_384,
);
directory.delete(path).unwrap();
});
@@ -363,6 +374,7 @@ mod bench {
directory.open_write(path).unwrap(),
1_000,
Compressor::default(),
16_384,
);
let store_file = directory.open_read(path).unwrap();
let store = StoreReader::open(store_file).unwrap();

View File

@@ -304,6 +304,8 @@ mod tests {
use crate::store::tests::write_lorem_ipsum_store;
use crate::Directory;
const BLOCK_SIZE: usize = 16_384;
fn get_text_field<'a>(doc: &'a Document, field: &'a Field) -> Option<&'a str> {
doc.get_first(*field).and_then(|f| f.as_text())
}
@@ -313,7 +315,7 @@ mod tests {
let directory = RamDirectory::create();
let path = Path::new("store");
let writer = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(writer, 500, Compressor::default());
let schema = write_lorem_ipsum_store(writer, 500, Compressor::default(), BLOCK_SIZE);
let title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;

View File

@@ -11,8 +11,6 @@ use crate::schema::Document;
use crate::store::index::Checkpoint;
use crate::DocId;
const BLOCK_SIZE: usize = 16_384;
/// Write tantivy's [`Store`](./index.html)
///
/// Contrary to the other components of `tantivy`,
@@ -22,6 +20,7 @@ const BLOCK_SIZE: usize = 16_384;
/// The skip list index on the other hand, is built in memory.
pub struct StoreWriter {
compressor: Compressor,
block_size: usize,
doc: DocId,
first_doc_in_block: DocId,
offset_index_writer: SkipIndexBuilder,
@@ -35,9 +34,10 @@ impl StoreWriter {
///
/// The store writer writes blocks on disk as
/// documents are added.
pub fn new(writer: WritePtr, compressor: Compressor) -> StoreWriter {
pub fn new(writer: WritePtr, compressor: Compressor, block_size: usize) -> StoreWriter {
StoreWriter {
compressor,
block_size,
doc: 0,
first_doc_in_block: 0,
offset_index_writer: SkipIndexBuilder::new(),
@@ -65,7 +65,7 @@ impl StoreWriter {
VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?;
self.current_block.write_all(serialized_document)?;
self.doc += 1;
if self.current_block.len() > BLOCK_SIZE {
if self.current_block.len() > self.block_size {
self.write_and_compress_block()?;
}
Ok(())
@@ -86,7 +86,7 @@ impl StoreWriter {
self.current_block
.write_all(&self.intermediary_buffer[..])?;
self.doc += 1;
if self.current_block.len() > BLOCK_SIZE {
if self.current_block.len() > self.block_size {
self.write_and_compress_block()?;
}
Ok(())

View File

@@ -28,7 +28,6 @@ use fst_termdict as termdict;
mod sstable_termdict;
#[cfg(feature = "quickwit")]
use sstable_termdict as termdict;
use tantivy_fst::automaton::AlwaysMatch;
#[cfg(test)]
mod tests;
@@ -36,24 +35,4 @@ mod tests;
/// Position of the term in the sorted list of terms.
pub type TermOrdinal = u64;
/// The term dictionary contains all of the terms in
/// `tantivy index` in a sorted manner.
pub type TermDictionary = self::termdict::TermDictionary;
/// Builder for the new term dictionary.
///
/// Inserting must be done in the order of the `keys`.
pub type TermDictionaryBuilder<W> = self::termdict::TermDictionaryBuilder<W>;
/// Given a list of sorted term streams,
/// returns an iterator over sorted unique terms.
///
/// The item yield is actually a pair with
/// - the term
/// - a slice with the ordinal of the segments containing
/// the terms.
pub type TermMerger<'a> = self::termdict::TermMerger<'a>;
/// `TermStreamer` acts as a cursor over a range of terms of a segment.
/// Terms are guaranteed to be sorted.
pub type TermStreamer<'a, A = AlwaysMatch> = self::termdict::TermStreamer<'a, A>;
pub use self::termdict::{TermDictionary, TermDictionaryBuilder, TermMerger, TermStreamer};

View File

@@ -145,6 +145,12 @@ where
}
pub fn write_key(&mut self, key: &[u8]) {
// If this is the first key in the block, we use it to
// shorten the last term in the last block.
if self.first_ordinal_of_the_block == self.num_terms {
self.index_builder
.shorten_last_block_key_given_next_key(key);
}
let keep_len = common_prefix_len(&self.previous_key, key);
let add_len = key.len() - keep_len;
let increasing_keys = add_len > 0 && (self.previous_key.len() == keep_len)
@@ -273,11 +279,12 @@ mod test {
33u8, 18u8, 19u8, // keep 1 push 1 | 20
17u8, 20u8, 0u8, 0u8, 0u8, 0u8, // no more blocks
// index
161, 102, 98, 108, 111, 99, 107, 115, 129, 162, 104, 108, 97, 115, 116, 95, 107,
101, 121, 130, 17, 20, 106, 98, 108, 111, 99, 107, 95, 97, 100, 100, 114, 162, 106,
98, 121, 116, 101, 95, 114, 97, 110, 103, 101, 162, 101, 115, 116, 97, 114, 116, 0,
99, 101, 110, 100, 11, 109, 102, 105, 114, 115, 116, 95, 111, 114, 100, 105, 110,
97, 108, 0, 15, 0, 0, 0, 0, 0, 0, 0, // offset for the index
161, 102, 98, 108, 111, 99, 107, 115, 129, 162, 115, 108, 97, 115, 116, 95, 107,
101, 121, 95, 111, 114, 95, 103, 114, 101, 97, 116, 101, 114, 130, 17, 20, 106, 98,
108, 111, 99, 107, 95, 97, 100, 100, 114, 162, 106, 98, 121, 116, 101, 95, 114, 97,
110, 103, 101, 162, 101, 115, 116, 97, 114, 116, 0, 99, 101, 110, 100, 11, 109,
102, 105, 114, 115, 116, 95, 111, 114, 100, 105, 110, 97, 108, 0, 15, 0, 0, 0, 0,
0, 0, 0, // offset for the index
3u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8 // num terms
]
);

View File

@@ -4,6 +4,7 @@ use std::ops::Range;
use serde::{Deserialize, Serialize};
use crate::error::DataCorruption;
use crate::termdict::sstable_termdict::sstable::common_prefix_len;
#[derive(Default, Debug, Serialize, Deserialize)]
pub struct SSTableIndex {
@@ -19,7 +20,7 @@ impl SSTableIndex {
pub fn search(&self, key: &[u8]) -> Option<BlockAddr> {
self.blocks
.iter()
.find(|block| &block.last_key[..] >= key)
.find(|block| &block.last_key_or_greater[..] >= key)
.map(|block| block.block_addr.clone())
}
}
@@ -32,7 +33,10 @@ pub struct BlockAddr {
#[derive(Debug, Serialize, Deserialize)]
struct BlockMeta {
pub last_key: Vec<u8>,
/// Any byte string that is lexicographically greater than or equal to
/// the last key in the block,
/// and yet strictly smaller than the first key in the next block.
pub last_key_or_greater: Vec<u8>,
pub block_addr: BlockAddr,
}
@@ -41,10 +45,39 @@ pub struct SSTableIndexBuilder {
index: SSTableIndex,
}
/// Given that left < right,
/// mutates `left` into a shorter byte string `left'` that
/// matches `left <= left' < right`.
fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
assert!(&left[..] < right);
let common_len = common_prefix_len(&left, right);
if left.len() == common_len {
return;
}
// It is sometimes possible to go one character shorter,
// but it is not worth the extra complexity.
for pos in (common_len + 1)..left.len() {
if left[pos] != u8::MAX {
left[pos] += 1;
left.truncate(pos + 1);
return;
}
}
}
impl SSTableIndexBuilder {
/// In order to make the index as light as possible, we
/// try to find a shorter alternative to the last key of the last block
/// that is still smaller than the next key.
pub(crate) fn shorten_last_block_key_given_next_key(&mut self, next_key: &[u8]) {
if let Some(last_block) = self.index.blocks.last_mut() {
find_shorter_str_in_between(&mut last_block.last_key_or_greater, next_key);
}
}
pub fn add_block(&mut self, last_key: &[u8], byte_range: Range<usize>, first_ordinal: u64) {
self.index.blocks.push(BlockMeta {
last_key: last_key.to_vec(),
last_key_or_greater: last_key.to_vec(),
block_addr: BlockAddr {
byte_range,
first_ordinal,
@@ -97,4 +130,35 @@ mod tests {
"Data corruption: SSTable index is corrupted."
);
}
#[track_caller]
fn test_find_shorter_str_in_between_aux(left: &[u8], right: &[u8]) {
let mut left_buf = left.to_vec();
super::find_shorter_str_in_between(&mut left_buf, right);
assert!(left_buf.len() <= left.len());
assert!(left <= &left_buf);
assert!(&left_buf[..] < &right);
}
#[test]
fn test_find_shorter_str_in_between() {
test_find_shorter_str_in_between_aux(b"", b"hello");
test_find_shorter_str_in_between_aux(b"abc", b"abcd");
test_find_shorter_str_in_between_aux(b"abcd", b"abd");
test_find_shorter_str_in_between_aux(&[0, 0, 0], &[1]);
test_find_shorter_str_in_between_aux(&[0, 0, 0], &[0, 0, 1]);
test_find_shorter_str_in_between_aux(&[0, 0, 255, 255, 255, 0u8], &[0, 1]);
}
use proptest::prelude::*;
proptest! {
#![proptest_config(ProptestConfig::with_cases(100))]
#[test]
fn test_proptest_find_shorter_str(left in any::<Vec<u8>>(), right in any::<Vec<u8>>()) {
if left < right {
test_find_shorter_str_in_between_aux(&left, &right);
}
}
}
}
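The separator trick above can be illustrated on its own: the index only needs some byte string that is greater than or equal to a block's last key and strictly smaller than the next block's first key. Below is a standalone copy of `find_shorter_str_in_between` with a worked example; the keys are made up:

```rust
fn common_prefix_len(left: &[u8], right: &[u8]) -> usize {
    left.iter().zip(right).take_while(|(l, r)| l == r).count()
}

fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
    assert!(&left[..] < right);
    let common_len = common_prefix_len(left.as_slice(), right);
    if left.len() == common_len {
        return;
    }
    // Bump the first bump-able byte after the common prefix and cut there.
    for pos in (common_len + 1)..left.len() {
        if left[pos] != u8::MAX {
            left[pos] += 1;
            left.truncate(pos + 1);
            return;
        }
    }
}

fn main() {
    let mut last_key = b"autumn".to_vec();
    // The next block starts at "avocado": "auu" is enough to separate the two.
    find_shorter_str_in_between(&mut last_key, b"avocado");
    assert_eq!(last_key, b"auu".to_vec());
    assert!(&last_key[..] >= &b"autumn"[..]);
    assert!(&last_key[..] < &b"avocado"[..]);
}
```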

View File

@@ -29,7 +29,7 @@ impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {
}
if !self.token_mut().text.is_ascii() {
// ignore its already ascii
to_ascii(&mut self.tail.token_mut().text, &mut self.buffer);
to_ascii(&self.tail.token().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true
@@ -1625,9 +1625,9 @@ mod tests {
#[test]
fn test_to_ascii() {
let mut input = "Rámon".to_string();
let input = "Rámon".to_string();
let mut buffer = String::new();
to_ascii(&mut input, &mut buffer);
to_ascii(&input, &mut buffer);
assert_eq!("Ramon", buffer);
}

View File

@@ -40,7 +40,7 @@ impl<'a> TokenStream for LowerCaserTokenStream<'a> {
// fast track for ascii.
self.token_mut().text.make_ascii_lowercase();
} else {
to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
}
true

View File

@@ -25,6 +25,13 @@ pub struct TokenizerManager {
}
impl TokenizerManager {
/// Creates an empty tokenizer manager.
pub fn new() -> Self {
Self {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
}
}
/// Registers a new tokenizer associated with a given name.
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
where TextAnalyzer: From<T> {
@@ -52,9 +59,7 @@ impl Default for TokenizerManager {
/// - en_stem
/// - ja
fn default() -> TokenizerManager {
let manager = TokenizerManager {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
};
let manager = TokenizerManager::new();
manager.register("raw", RawTokenizer);
manager.register(
"default",