Compare commits: 0.17...termmap_pe (65 commits)

Commits (SHA1 only; the Author and Date columns of the compare table were empty):

54fc557a6d, 7f5a409d6f, 50ee43ab79, 2d1beeb6be, 7e3c0c5392, fdb2524f9e, 8c1e1cf1ad, b5b16948b0, c305d3a2a2, 038d234ff1,
c45eb9a9fa, 824d6f96fe, 7cf821bac0, ae83fc8298, a7bc361145, 2805291400, 6614a2cba0, 6f4d203d1b, 1be6c6111c, c7c3eab256,
ec69875d15, d832cfcfd8, ab6b532cc4, 4b6047f7d7, 5ca04beb94, 902d05ebec, f1b298642a, dd13dedaeb, 46724b4a05, 24432bf523,
31d3bcfff2, 706fbd6886, 8a8a048015, c72549cb9a, d6f803212c, dac73537d2, bb5254de12, be5218c2f6, ec9478830a, 8807bfd13d,
447811c111, f29acf5d8c, 125707dbe0, 46d5de920d, d2a7bcf217, 141b9aa245, c5a6282fa8, c0f524e1a3, 958b2bee08, f619658e2c,
aa391bf843, 47dcbdbeae, 691245bf20, 90798d4b39, 0b6d9f90cf, 8a5a12d961, e73542e2e8, 0262e44bbd, 613aad7a8a, 1aa88b0c51,
564fa38085, 59ec21479f, 42283f9e91, b105bf72e1, 226f577803
7  .github/workflows/coverage.yml  (vendored)

@@ -13,12 +13,11 @@ jobs:
       - uses: actions/checkout@v3
       - name: Install Rust
         run: rustup toolchain install nightly --component llvm-tools-preview
-      - name: Install cargo-llvm-cov
-        run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
+      - uses: taiki-e/install-action@cargo-llvm-cov
       - name: Generate code coverage
-        run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
+        run: cargo +nightly llvm-cov --all-features --workspace --lcov --output-path lcov.info
       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v2
+        uses: codecov/codecov-action@v3
         with:
           token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
           files: lcov.info
16  CHANGELOG.md

@@ -1,3 +1,17 @@
+Unreleased
+================================
+- For date values `chrono` has been replaced with `time` (@uklotzde) #1304:
+  - The `time` crate is re-exported as `tantivy::time` instead of `tantivy::chrono`.
+  - The type alias `tantivy::DateTime` has been removed.
+  - `Value::Date` wraps `time::PrimitiveDateTime` without time zone information.
+  - Internally date/time values are stored as seconds since UNIX epoch in UTC.
+  - Converting a `time::OffsetDateTime` to `Value::Date` implicitly converts the value into UTC.
+    If this is not desired do the time zone conversion yourself and use `time::PrimitiveDateTime`
+    directly instead.
+- Add [histogram](https://github.com/quickwit-oss/tantivy/pull/1306) aggregation (@PSeitz)
+- Add support for fastfield on text fields (@PSeitz)
+- Add terms aggregation (@PSeitz)
+
 Tantivy 0.17
 ================================
 - LogMergePolicy now triggers merges if the ratio of deleted documents reaches a threshold (@shikhar @fulmicoton) [#115](https://github.com/quickwit-oss/tantivy/issues/115)
@@ -8,7 +22,7 @@ Tantivy 0.17
 - Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-oss/tantivy/issues/922)
 - Reduce the number of fsync calls [#1225](https://github.com/quickwit-oss/tantivy/issues/1225)
 - Fix opening bytes index with dynamic codec (@PSeitz) [#1278](https://github.com/quickwit-oss/tantivy/issues/1278)
-- Added an aggregation collector compatible with Elasticsearch (@PSeitz)
+- Added an aggregation collector for range, average and stats compatible with Elasticsearch. (@PSeitz)
 - Added a JSON schema type @fulmicoton [#1251](https://github.com/quickwit-oss/tantivy/issues/1251)
 - Added support for slop in phrase queries @halvorboe [#1068](https://github.com/quickwit-oss/tantivy/issues/1068)
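The `chrono` to `time` migration above changes how zoned timestamps should be prepared before indexing. Below is a minimal sketch of the explicit conversion recommended in the changelog entry, assuming the re-exported `tantivy::time` module and the `Value::Date(PrimitiveDateTime)` variant it describes (exact import paths may differ between versions):

```rust
use tantivy::schema::Value;
use tantivy::time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};

fn to_date_value(zoned: OffsetDateTime) -> Value {
    // Convert explicitly to UTC before dropping the offset; the implicit
    // `OffsetDateTime -> Value::Date` conversion does the same thing internally.
    let utc = zoned.to_offset(UtcOffset::UTC);
    Value::Date(PrimitiveDateTime::new(utc.date(), utc.time()))
}

fn main() {
    let now = OffsetDateTime::now_utc();
    println!("{:?}", to_date_value(now));
}
```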
Cargo.toml

@@ -13,6 +13,7 @@ keywords = ["search", "information", "retrieval"]
 edition = "2018"

 [dependencies]
+oneshot = "0.1"
 base64 = "0.13"
 byteorder = "1.4.3"
 crc32fast = "1.2.1"
@@ -30,9 +31,8 @@ serde_json = "1.0.64"
 num_cpus = "1.13"
 fs2={ version = "0.4.3", optional = true }
 levenshtein_automata = "0.2"
-uuid = { version = "0.8.2", features = ["v4", "serde"] }
+uuid = { version = "1.0.0", features = ["v4", "serde"] }
 crossbeam = "0.8.1"
-futures = { version = "0.3.15", features = ["thread-pool"] }
 tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
 tantivy-bitpacker = { version="0.1", path="./bitpacker" }
 common = { version = "0.2", path = "./common/", package = "tantivy-common" }
@@ -48,7 +48,7 @@ thiserror = "1.0.24"
 htmlescape = "0.3.1"
 fail = "0.5"
 murmurhash32 = "0.2"
-chrono = "0.4.19"
+time = { version = "0.3.7", features = ["serde-well-known"] }
 smallvec = "1.6.1"
 rayon = "1.5"
 lru = "0.7.0"
@@ -70,7 +70,8 @@ proptest = "1.0"
 criterion = "0.3.5"
 test-log = "0.2.8"
 env_logger = "0.9.0"
-pprof = {version= "0.6", features=["flamegraph", "criterion"]}
+pprof = {version= "0.8", features=["flamegraph", "criterion"]}
+futures = "0.3.15"

 [dev-dependencies.fail]
 version = "0.5"
55  README.md

@@ -5,9 +5,10 @@
[](https://opensource.org/licenses/MIT)
[](https://crates.io/crates/tantivy)

**Tantivy** is a **full text search engine library** written in Rust.
**Tantivy** is a **full-text search engine library** written in Rust.

It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
@@ -15,19 +16,23 @@ to build such a search engine.

Tantivy is, in fact, strongly inspired by Lucene's design.

If you are looking for an alternative to Elasticsearch or Apache Solr, check out [Quickwit](https://github.com/quickwit-oss/quickwit), our search engine built on top of Tantivy.

# Benchmark

The following [benchmark](https://tantivy-search.github.io/bench/) break downs
performance for different type of queries / collection.
The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
performance for different types of queries/collections.

Your mileage WILL vary depending on the nature of queries and their load.

<img src="doc/assets/images/searchbenchmark.png">

# Features

- Full-text search
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command line tools
- Tiny startup time (<10ms), perfect for command-line tools
- BM25 scoring (the same as Lucene)
- Natural query language (e.g. `(michael AND jackson) OR "king of pop"`)
- Phrase queries search (e.g. `"michael jackson"`)
@@ -42,23 +47,25 @@ Your mileage WILL vary depending on the nature of queries and their load.
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
- JSON Field
- Aggregation Collector: range buckets, average, and stats metrics
- LogMergePolicy with deletes
- Searcher Warmer API
- Cheesy logo with a horse

## Non-features

- Distributed search is out of the scope of Tantivy. That being said, Tantivy is a
  library upon which one could build a distributed search. Serializable/mergeable collector state for instance,
  are within the scope of Tantivy.
Distributed search is out of the scope of Tantivy, but if you are looking for this feature, check out [Quickwit](https://github.com/quickwit-oss/quickwit/).

# Getting started

Tantivy works on stable Rust (>= 1.27) and supports Linux, MacOS, and Windows.
Tantivy works on stable Rust (>= 1.27) and supports Linux, macOS, and Windows.

- [Tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
- [tantivy-cli and its tutorial](https://github.com/quickwit-oss/tantivy-cli) - `tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
- [tantivy-cli and its tutorial](https://github.com/quickwit-oss/tantivy-cli) - `tantivy-cli` is an actual command-line interface that makes it easy for you to create a search engine,
  index documents, and search via the CLI or a small server with a REST API.
  It walks you through getting a wikipedia search engine up and running in a few minutes.
  It walks you through getting a Wikipedia search engine up and running in a few minutes.
- [Reference doc for the last released version](https://docs.rs/tantivy/)
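To complement the linked examples above, here is a compressed sketch in the spirit of the basic search example (not taken from this PR; the field name, document, query string, and 50 MB writer budget are illustrative only):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // Define a schema with a single indexed, stored text field.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    // Index one document and commit.
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(title => "The Old Man and the Sea"));
    writer.commit()?;

    // Parse a query against the title field and fetch the top hits.
    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("sea")?;
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    println!("{} hit(s)", top_docs.len());
    Ok(())
}
```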
# How can I support this project?

@@ -118,3 +125,31 @@ By default, `rustc` compiles everything in the `examples/` directory in debug mo
    rust-gdb target/debug/examples/$EXAMPLE_NAME
    $ gdb run
```
# Companies Using Tantivy

<p align="left">
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" />
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />
<img align="center" src="doc/assets/images/nuclia-dark-theme.png#gh-dark-mode-only" alt="Nuclia" height="35" width="auto" />
<img align="center" src="doc/assets/images/humanfirst.ai-dark-theme.png#gh-dark-mode-only" alt="Humanfirst.ai" height="25" width="auto" />
<img align="center" src="doc/assets/images/element-dark-theme.png#gh-dark-mode-only" alt="Element.io" height="25" width="auto" />
</p>

# FAQ
### Can I use Tantivy in other languages?
- Python → [tantivy-py](https://github.com/quickwit-oss/tantivy-py)
- Ruby → [tantiny](https://github.com/baygeldin/tantiny)

You can also find other bindings on [GitHub](https://github.com/search?q=tantivy) but they may be less maintained.

### What are some examples of Tantivy use?

- [seshat](https://github.com/matrix-org/seshat/): A matrix message database/indexer
- [tantiny](https://github.com/baygeldin/tantiny): Tiny full-text search for Ruby
- [lnx](https://github.com/lnx-search/lnx): adaptable, typo tolerant search engine with a REST API
- and [more](https://github.com/search?q=tantivy)!

### On average, how much faster is Tantivy compared to Lucene?
- According to our [search latency benchmark](https://tantivy-search.github.io/bench/), Tantivy is approximately 2x faster than Lucene.
BIN  doc/assets/images/Nuclia.png  (new file, 3.1 KiB)
BIN  doc/assets/images/element-dark-theme.png  (new file, 56 KiB)
8  doc/assets/images/element.io.svg  (new file, 5.2 KiB)
@@ -0,0 +1,8 @@
<svg width="518" height="112" viewBox="0 0 518 112" fill="none" xmlns="http://www.w3.org/2000/svg">
<path fill-rule="evenodd" clip-rule="evenodd" d="M56 112C86.9279 112 112 86.9279 112 56C112 25.0721 86.9279 0 56 0C25.0721 0 0 25.0721 0 56C0 86.9279 25.0721 112 56 112Z" fill="#0DBD8B"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M45.7615 26.093C45.7615 23.8325 47.5977 22.0001 49.8629 22.0001C65.2154 22.0001 77.6611 34.4199 77.6611 49.7406C77.6611 52.001 75.8248 53.8335 73.5597 53.8335C71.2945 53.8335 69.4583 52.001 69.4583 49.7406C69.4583 38.9408 60.6851 30.1859 49.8629 30.1859C47.5977 30.1859 45.7615 28.3534 45.7615 26.093Z" fill="white"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M85.8986 45.6477C88.1637 45.6477 89.9999 47.4801 89.9999 49.7406C89.9999 65.0612 77.5543 77.4811 62.2017 77.4811C59.9366 77.4811 58.1003 75.6486 58.1003 73.3882C58.1003 71.1277 59.9366 69.2953 62.2017 69.2953C73.024 69.2953 81.7972 60.5403 81.7972 49.7406C81.7972 47.4801 83.6334 45.6477 85.8986 45.6477Z" fill="white"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M66.3031 85.907C66.3031 88.1675 64.4668 89.9999 62.2017 89.9999C46.8492 89.9999 34.4035 77.58 34.4035 62.2594C34.4035 59.9989 36.2398 58.1665 38.5049 58.1665C40.77 58.1665 42.6063 59.9989 42.6063 62.2594C42.6063 73.0592 51.3795 81.8141 62.2017 81.8141C64.4668 81.8141 66.3031 83.6466 66.3031 85.907Z" fill="white"/>
<path fill-rule="evenodd" clip-rule="evenodd" d="M26.1014 66.3523C23.8363 66.3523 22.0001 64.5199 22.0001 62.2594C22 46.9388 34.4457 34.5189 49.7983 34.5189C52.0634 34.5189 53.8997 36.3514 53.8997 38.6118C53.8997 40.8723 52.0634 42.7047 49.7983 42.7047C38.976 42.7047 30.2028 51.4597 30.2028 62.2594C30.2028 64.5199 28.3666 66.3523 26.1014 66.3523Z" fill="white"/>
<path d="M197 63.5H157.5C157.967 67.6333 159.467 70.9333 162 73.4C164.533 75.8 167.867 77 172 77C174.733 77 177.2 76.3333 179.4 75C181.6 73.6667 183.167 71.8667 184.1 69.6H196.1C194.5 74.8667 191.5 79.1333 187.1 82.4C182.767 85.6 177.633 87.2 171.7 87.2C163.967 87.2 157.7 84.6333 152.9 79.5C148.167 74.3667 145.8 67.8667 145.8 60C145.8 52.3333 148.2 45.9 153 40.7C157.8 35.5 164 32.9 171.6 32.9C179.2 32.9 185.333 35.4667 190 40.6C194.733 45.6667 197.1 52.0667 197.1 59.8L197 63.5ZM171.6 42.6C167.867 42.6 164.767 43.7 162.3 45.9C159.833 48.1 158.3 51.0333 157.7 54.7H185.3C184.767 51.0333 183.3 48.1 180.9 45.9C178.5 43.7 175.4 42.6 171.6 42.6ZM205.289 70.5V11H217.189V70.7C217.189 73.3667 218.656 74.7 221.589 74.7L223.689 74.6V85.9C222.556 86.1 221.356 86.2 220.089 86.2C214.956 86.2 211.189 84.9 208.789 82.3C206.456 79.7 205.289 75.7667 205.289 70.5ZM279.109 63.5H239.609C240.076 67.6333 241.576 70.9333 244.109 73.4C246.643 75.8 249.976 77 254.109 77C256.843 77 259.309 76.3333 261.509 75C263.709 73.6667 265.276 71.8667 266.209 69.6H278.209C276.609 74.8667 273.609 79.1333 269.209 82.4C264.876 85.6 259.743 87.2 253.809 87.2C246.076 87.2 239.809 84.6333 235.009 79.5C230.276 74.3667 227.909 67.8667 227.909 60C227.909 52.3333 230.309 45.9 235.109 40.7C239.909 35.5 246.109 32.9 253.709 32.9C261.309 32.9 267.443 35.4667 272.109 40.6C276.843 45.6667 279.209 52.0667 279.209 59.8L279.109 63.5ZM253.709 42.6C249.976 42.6 246.876 43.7 244.409 45.9C241.943 48.1 240.409 51.0333 239.809 54.7H267.409C266.876 51.0333 265.409 48.1 263.009 45.9C260.609 43.7 257.509 42.6 253.709 42.6ZM332.798 56.2V86H320.898V54.9C320.898 47.0333 317.632 43.1 311.098 43.1C307.565 43.1 304.732 44.2333 302.598 46.5C300.532 48.7667 299.498 51.8667 299.498 55.8V86H287.598V34.1H298.598V41C299.865 38.6667 301.798 36.7333 304.398 35.2C306.998 33.6667 310.232 32.9 314.098 32.9C321.298 32.9 326.498 35.6333 329.698 41.1C334.098 35.6333 339.965 32.9 347.298 32.9C353.365 32.9 358.032 34.8 361.298 38.6C364.565 42.3333 366.198 47.2667 366.198 53.4V86H354.298V54.9C354.298 47.0333 351.032 43.1 344.498 43.1C340.898 43.1 338.032 44.2667 335.898 46.6C333.832 48.8667 332.798 52.0667 332.798 56.2ZM425.379 63.5H385.879C386.346 67.6333 387.846 70.9333 390.379 73.4C392.912 75.8 396.246 77 400.379 77C403.112 77 405.579 76.3333 407.779 75C409.979 73.6667 411.546 71.8667 412.479 69.6H424.479C422.879 74.8667 419.879 79.1333 415.479 82.4C411.146 85.6 406.012 87.2 400.079 87.2C392.346 87.2 386.079 84.6333 381.279 79.5C376.546 74.3667 374.179 67.8667 374.179 60C374.179 52.3333 376.579 45.9 381.379 40.7C386.179 35.5 392.379 32.9 399.979 32.9C407.579 32.9 413.712 35.4667 418.379 40.6C423.112 45.6667 425.479 52.0667 425.479 59.8L425.379 63.5ZM399.979 42.6C396.246 42.6 393.146 43.7 390.679 45.9C388.212 48.1 386.679 51.0333 386.079 54.7H413.679C413.146 51.0333 411.679 48.1 409.279 45.9C406.879 43.7 403.779 42.6 399.979 42.6ZM444.868 34.1V41C446.068 38.7333 448.035 36.8333 450.768 35.3C453.568 33.7 456.935 32.9 460.868 32.9C467.001 32.9 471.735 34.7667 475.068 38.5C478.468 42.2333 480.168 47.2 480.168 53.4V86H468.268V54.9C468.268 51.2333 467.401 48.3667 465.668 46.3C464.001 44.1667 461.435 43.1 457.968 43.1C454.168 43.1 451.168 44.2333 448.968 46.5C446.835 48.7667 445.768 51.9 445.768 55.9V86H433.868V34.1H444.868ZM514.922 75.4V85.7C513.455 86.1 511.389 86.3 508.722 86.3C498.589 86.3 493.522 81.2 493.522 71V43.6H485.622V34.1H493.522V20.6H505.422V34.1H515.122V43.6H505.422V69.8C505.422 73.8667 507.355 75.9 511.222 75.9L514.922 75.4Z" fill="black"/>
</svg>
BIN  doc/assets/images/humanfirst.ai-dark-theme.png  (new file, 23 KiB)
BIN  doc/assets/images/humanfirst.png  (new file, 102 KiB)
BIN  doc/assets/images/nuclia-dark-theme.png  (new file, 7.8 KiB)
BIN  doc/assets/images/searchbenchmark.png  (new file, 653 KiB)
@@ -20,9 +20,7 @@ fn main() -> tantivy::Result<()> {
     let mut schema_builder = Schema::builder();
     let text_fieldtype = schema::TextOptions::default()
         .set_indexing_options(
-            TextFieldIndexing::default()
-                .set_tokenizer("default")
-                .set_index_option(IndexRecordOption::WithFreqs),
+            TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
         )
         .set_stored();
     let text_field = schema_builder.add_text_field("text", text_fieldtype);
@@ -124,7 +122,7 @@ fn main() -> tantivy::Result<()> {
     let searcher = reader.searcher();
     let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();

-    let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
+    let res: Value = serde_json::to_value(&agg_res)?;
     println!("{}", serde_json::to_string_pretty(&res)?);

     Ok(())
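The example diff above converts the aggregation result straight to a `serde_json::Value`. As a rough sketch of how such a request can be built and executed end to end, the snippet below uses the Elasticsearch-style JSON layout documented elsewhere in this PR; the `AggregationCollector::from_aggs` constructor name and the `score` field are assumptions, since the constructor is not shown in this diff:

```rust
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::Searcher;

fn run_range_agg(searcher: &Searcher) -> tantivy::Result<AggregationResults> {
    // Range buckets over the fast field "score", as in the range doc-comment example.
    let agg_req: Aggregations = serde_json::from_str(
        r#"{ "my_ranges": { "range": { "field": "score",
              "ranges": [ { "to": 3.0 }, { "from": 3.0, "to": 7.0 }, { "from": 7.0 } ] } } }"#,
    )
    .expect("valid aggregation request");
    // Hypothetical constructor name; the diff only shows the collector's fields.
    let collector = AggregationCollector::from_aggs(agg_req);
    searcher.search(&AllQuery, &collector)
}
```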
@@ -67,7 +67,7 @@ fn word<'a>() -> impl Parser<&'a str, Output = String> {
 ///
 /// NOTE: also accepts 999999-99-99T99:99:99.266051969+99:99
 /// We delegate rejecting such invalid dates to the logical AST computation code
-/// which invokes chrono::DateTime::parse_from_rfc3339 on the value to actually parse
+/// which invokes time::OffsetDateTime::parse(..., &Rfc3339) on the value to actually parse
 /// it (instead of merely extracting the datetime value as string as done here).
 fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
     let two_digits = || recognize::<String, _, _>((digit(), digit()));
@@ -49,7 +49,9 @@ use std::collections::{HashMap, HashSet};
use serde::{Deserialize, Serialize};

pub use super::bucket::RangeAggregation;
use super::bucket::{HistogramAggregation, TermsAggregation};
use super::metric::{AverageAggregation, StatsAggregation};
use super::VecWithNames;

/// The top-level aggregation request structure, which contains [Aggregation] and their user defined
/// names. It is also used in [buckets](BucketAggregation) to define sub-aggregations.
@@ -57,6 +59,70 @@ use super::metric::{AverageAggregation, StatsAggregation};
/// The key is the user defined name of the aggregation.
pub type Aggregations = HashMap<String, Aggregation>;

/// Like Aggregations, but optimized to work with the aggregation result
#[derive(Clone, Debug)]
pub(crate) struct AggregationsInternal {
    pub(crate) metrics: VecWithNames<MetricAggregation>,
    pub(crate) buckets: VecWithNames<BucketAggregationInternal>,
}

impl From<Aggregations> for AggregationsInternal {
    fn from(aggs: Aggregations) -> Self {
        let mut metrics = vec![];
        let mut buckets = vec![];
        for (key, agg) in aggs {
            match agg {
                Aggregation::Bucket(bucket) => buckets.push((
                    key,
                    BucketAggregationInternal {
                        bucket_agg: bucket.bucket_agg,
                        sub_aggregation: bucket.sub_aggregation.into(),
                    },
                )),
                Aggregation::Metric(metric) => metrics.push((key, metric)),
            }
        }
        Self {
            metrics: VecWithNames::from_entries(metrics),
            buckets: VecWithNames::from_entries(buckets),
        }
    }
}

#[derive(Clone, Debug)]
// Like BucketAggregation, but optimized to work with the result
pub(crate) struct BucketAggregationInternal {
    /// Bucket aggregation strategy to group documents.
    pub bucket_agg: BucketAggregationType,
    /// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
    /// bucket.
    pub sub_aggregation: AggregationsInternal,
}

impl BucketAggregationInternal {
    pub(crate) fn as_histogram(&self) -> Option<&HistogramAggregation> {
        match &self.bucket_agg {
            BucketAggregationType::Histogram(histogram) => Some(histogram),
            _ => None,
        }
    }
    pub(crate) fn as_term(&self) -> Option<&TermsAggregation> {
        match &self.bucket_agg {
            BucketAggregationType::Terms(terms) => Some(terms),
            _ => None,
        }
    }
}

/// Extract all fields, where the term directory is used in the tree.
pub fn get_term_dict_field_names(aggs: &Aggregations) -> HashSet<String> {
    let mut term_dict_field_names = Default::default();
    for el in aggs.values() {
        el.get_term_dict_field_names(&mut term_dict_field_names)
    }
    term_dict_field_names
}

/// Extract all fast field names used in the tree.
pub fn get_fast_field_names(aggs: &Aggregations) -> HashSet<String> {
    let mut fast_field_names = Default::default();
@@ -79,6 +145,12 @@ pub enum Aggregation {
}

impl Aggregation {
    fn get_term_dict_field_names(&self, term_field_names: &mut HashSet<String>) {
        if let Aggregation::Bucket(bucket) = self {
            bucket.get_term_dict_field_names(term_field_names)
        }
    }

    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
        match self {
            Aggregation::Bucket(bucket) => bucket.get_fast_field_names(fast_field_names),
@@ -111,6 +183,12 @@ pub struct BucketAggregation {
}

impl BucketAggregation {
    fn get_term_dict_field_names(&self, term_dict_field_names: &mut HashSet<String>) {
        if let BucketAggregationType::Terms(terms) = &self.bucket_agg {
            term_dict_field_names.insert(terms.field.to_string());
        }
        term_dict_field_names.extend(get_term_dict_field_names(&self.sub_aggregation));
    }
    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
        self.bucket_agg.get_fast_field_names(fast_field_names);
        fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
@@ -123,12 +201,22 @@ pub enum BucketAggregationType {
    /// Put data into buckets of user-defined ranges.
    #[serde(rename = "range")]
    Range(RangeAggregation),
    /// Put data into buckets of user-defined ranges.
    #[serde(rename = "histogram")]
    Histogram(HistogramAggregation),
    /// Put data into buckets of terms.
    #[serde(rename = "terms")]
    Terms(TermsAggregation),
}

impl BucketAggregationType {
    fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
        match self {
            BucketAggregationType::Terms(terms) => fast_field_names.insert(terms.field.to_string()),
            BucketAggregationType::Range(range) => fast_field_names.insert(range.field.to_string()),
            BucketAggregationType::Histogram(histogram) => {
                fast_field_names.insert(histogram.field.to_string())
            }
        };
    }
}
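A small sketch of how the field-name helpers added above could be used to inspect a request before executing it. It assumes `get_term_dict_field_names` and `get_fast_field_names` are reachable under `tantivy::aggregation::agg_req` (their visibility outside the crate is not shown here) and that the `terms`/`stats` JSON keys follow the Elasticsearch-compatible naming used in this module:

```rust
use std::collections::HashSet;

use tantivy::aggregation::agg_req::{get_fast_field_names, get_term_dict_field_names, Aggregations};

fn main() -> serde_json::Result<()> {
    let aggs: Aggregations = serde_json::from_str(
        r#"{
            "by_tag":      { "terms": { "field": "tag" } },
            "score_stats": { "stats": { "field": "score" } }
        }"#,
    )?;
    // Terms aggregations read the term dictionary of their field...
    let term_dict_fields: HashSet<String> = get_term_dict_field_names(&aggs);
    // ...and aggregations read the fast field of the field they run on.
    let fast_fields: HashSet<String> = get_fast_field_names(&aggs);
    println!("term dict fields: {:?}", term_dict_fields);
    println!("fast fields: {:?}", fast_fields);
    Ok(())
}
```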
@@ -1,12 +1,16 @@
|
||||
//! This will enhance the request tree with access to the fastfield and metadata.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
|
||||
use super::bucket::RangeAggregation;
|
||||
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
|
||||
use super::metric::{AverageAggregation, StatsAggregation};
|
||||
use super::VecWithNames;
|
||||
use crate::fastfield::{type_and_cardinality, DynamicFastFieldReader, FastType};
|
||||
use crate::fastfield::{
|
||||
type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
|
||||
};
|
||||
use crate::schema::{Cardinality, Type};
|
||||
use crate::{SegmentReader, TantivyError};
|
||||
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub(crate) struct AggregationsWithAccessor {
|
||||
@@ -27,11 +31,32 @@ impl AggregationsWithAccessor {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum FastFieldAccessor {
|
||||
Multi(MultiValuedFastFieldReader<u64>),
|
||||
Single(DynamicFastFieldReader<u64>),
|
||||
}
|
||||
impl FastFieldAccessor {
|
||||
pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(_) => None,
|
||||
FastFieldAccessor::Single(reader) => Some(reader),
|
||||
}
|
||||
}
|
||||
pub fn as_multi(&self) -> Option<&MultiValuedFastFieldReader<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(reader) => Some(reader),
|
||||
FastFieldAccessor::Single(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BucketAggregationWithAccessor {
|
||||
/// In general there can be buckets without fast field access, e.g. buckets that are created
|
||||
/// based on search terms. So eventually this needs to be Option or moved.
|
||||
pub(crate) accessor: DynamicFastFieldReader<u64>,
|
||||
pub(crate) accessor: FastFieldAccessor,
|
||||
pub(crate) inverted_index: Option<Arc<InvertedIndexReader>>,
|
||||
pub(crate) field_type: Type,
|
||||
pub(crate) bucket_agg: BucketAggregationType,
|
||||
pub(crate) sub_aggregation: AggregationsWithAccessor,
|
||||
@@ -43,11 +68,25 @@ impl BucketAggregationWithAccessor {
|
||||
sub_aggregation: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<BucketAggregationWithAccessor> {
|
||||
let mut inverted_index = None;
|
||||
let (accessor, field_type) = match &bucket {
|
||||
BucketAggregationType::Range(RangeAggregation {
|
||||
field: field_name,
|
||||
ranges: _,
|
||||
}) => get_ff_reader_and_validate(reader, field_name)?,
|
||||
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
|
||||
BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: field_name, ..
|
||||
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
|
||||
BucketAggregationType::Terms(TermsAggregation {
|
||||
field: field_name, ..
|
||||
}) => {
|
||||
let field = reader
|
||||
.schema()
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
|
||||
inverted_index = Some(reader.inverted_index(field)?);
|
||||
get_ff_reader_and_validate(reader, field_name, Cardinality::MultiValues)?
|
||||
}
|
||||
};
|
||||
let sub_aggregation = sub_aggregation.clone();
|
||||
Ok(BucketAggregationWithAccessor {
|
||||
@@ -55,6 +94,7 @@ impl BucketAggregationWithAccessor {
|
||||
field_type,
|
||||
sub_aggregation: get_aggs_with_accessor_and_validate(&sub_aggregation, reader)?,
|
||||
bucket_agg: bucket.clone(),
|
||||
inverted_index,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -75,10 +115,14 @@ impl MetricAggregationWithAccessor {
|
||||
match &metric {
|
||||
MetricAggregation::Average(AverageAggregation { field: field_name })
|
||||
| MetricAggregation::Stats(StatsAggregation { field: field_name }) => {
|
||||
let (accessor, field_type) = get_ff_reader_and_validate(reader, field_name)?;
|
||||
let (accessor, field_type) =
|
||||
get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?;
|
||||
|
||||
Ok(MetricAggregationWithAccessor {
|
||||
accessor,
|
||||
accessor: accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinality")
|
||||
.clone(),
|
||||
field_type,
|
||||
metric: metric.clone(),
|
||||
})
|
||||
@@ -115,32 +159,45 @@ pub(crate) fn get_aggs_with_accessor_and_validate(
|
||||
))
|
||||
}
|
||||
|
||||
/// Get fast field reader with the given cardinality.
|
||||
fn get_ff_reader_and_validate(
|
||||
reader: &SegmentReader,
|
||||
field_name: &str,
|
||||
) -> crate::Result<(DynamicFastFieldReader<u64>, Type)> {
|
||||
cardinality: Cardinality,
|
||||
) -> crate::Result<(FastFieldAccessor, Type)> {
|
||||
let field = reader
|
||||
.schema()
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
|
||||
let field_type = reader.schema().get_field_entry(field).field_type();
|
||||
|
||||
if let Some((ff_type, cardinality)) = type_and_cardinality(field_type) {
|
||||
if cardinality == Cardinality::MultiValues || ff_type == FastType::Date {
|
||||
if let Some((ff_type, field_cardinality)) = type_and_cardinality(field_type) {
|
||||
if ff_type == FastType::Date {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"Unsupported field type date in aggregation".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if cardinality != field_cardinality {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Invalid field type in aggregation {:?}, only Cardinality::SingleValue supported",
|
||||
field_type.value_type()
|
||||
"Invalid field cardinality on field {} expected {:?}, but got {:?}",
|
||||
field_name, cardinality, field_cardinality
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Only single value fast fields of type f64, u64, i64 are supported, but got {:?} ",
|
||||
"Only fast fields of type f64, u64, i64 are supported, but got {:?} ",
|
||||
field_type.value_type()
|
||||
)));
|
||||
};
|
||||
|
||||
let ff_fields = reader.fast_fields();
|
||||
ff_fields
|
||||
.u64_lenient(field)
|
||||
.map(|field| (field, field_type.value_type()))
|
||||
match cardinality {
|
||||
Cardinality::SingleValue => ff_fields
|
||||
.u64_lenient(field)
|
||||
.map(|field| (FastFieldAccessor::Single(field), field_type.value_type())),
|
||||
Cardinality::MultiValues => ff_fields
|
||||
.u64s_lenient(field)
|
||||
.map(|field| (FastFieldAccessor::Multi(field), field_type.value_type())),
|
||||
}
|
||||
}
|
||||
|
||||
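An in-crate sketch of how the new `FastFieldAccessor` introduced above is meant to be consumed; `FastFieldAccessor` is `pub(crate)`, so this only makes sense inside the aggregation module, and the multi-valued `get_vals` call is assumed from tantivy's existing fast field API:

```rust
use crate::fastfield::FastFieldReader;
use crate::DocId;

// Reads the value(s) of one document through either cardinality.
fn doc_values(accessor: &FastFieldAccessor, doc: DocId) -> Vec<u64> {
    match accessor {
        // Single-valued fast fields: exactly one u64 per document.
        FastFieldAccessor::Single(reader) => vec![reader.get(doc)],
        // Multi-valued fast fields (e.g. the text fast fields used by the terms
        // aggregation): all ordinals stored for the document.
        FastFieldAccessor::Multi(reader) => {
            let mut vals = Vec::new();
            reader.get_vals(doc, &mut vals);
            vals
        }
    }
}
```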
@@ -7,29 +7,132 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::agg_req::{
|
||||
Aggregations, AggregationsInternal, BucketAggregationInternal, MetricAggregation,
|
||||
};
|
||||
use super::bucket::{intermediate_buckets_to_final_buckets, GetDocCount};
|
||||
use super::intermediate_agg_result::{
|
||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
|
||||
IntermediateMetricResult, IntermediateRangeBucketEntry,
|
||||
};
|
||||
use super::metric::{SingleMetricResult, Stats};
|
||||
use super::Key;
|
||||
use super::{Key, VecWithNames};
|
||||
use crate::TantivyError;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// The final aggregation result.
|
||||
pub struct AggregationResults(pub HashMap<String, AggregationResult>);
|
||||
|
||||
impl From<IntermediateAggregationResults> for AggregationResults {
|
||||
fn from(tree: IntermediateAggregationResults) -> Self {
|
||||
Self(
|
||||
tree.0
|
||||
.into_iter()
|
||||
.map(|(key, agg)| (key, agg.into()))
|
||||
.collect(),
|
||||
)
|
||||
impl AggregationResults {
|
||||
pub(crate) fn get_value_from_aggregation(
|
||||
&self,
|
||||
name: &str,
|
||||
agg_property: &str,
|
||||
) -> crate::Result<Option<f64>> {
|
||||
if let Some(agg) = self.0.get(name) {
|
||||
agg.get_value_from_aggregation(name, agg_property)
|
||||
} else {
|
||||
// Validation is done during request parsing, so we can't reach this state.
|
||||
Err(TantivyError::InternalError(format!(
|
||||
"Can't find aggregation {:?} in sub_aggregations",
|
||||
name
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert an intermediate result and its aggregation request to the final result
|
||||
pub fn from_intermediate_and_req(
|
||||
results: IntermediateAggregationResults,
|
||||
agg: Aggregations,
|
||||
) -> crate::Result<Self> {
|
||||
AggregationResults::from_intermediate_and_req_internal(results, &(agg.into()))
|
||||
}
|
||||
|
||||
/// Convert an intermediate result and its aggregation request to the final result
|
||||
///
|
||||
/// Internal function: AggregationsInternal is used instead of Aggregations, since it is optimized
|
||||
/// for internal processing by splitting metrics and buckets into separate groups.
|
||||
pub(crate) fn from_intermediate_and_req_internal(
|
||||
intermediate_results: IntermediateAggregationResults,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<Self> {
|
||||
// Important assumption:
|
||||
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
|
||||
// request
|
||||
let mut results: HashMap<String, AggregationResult> = HashMap::new();
|
||||
|
||||
if let Some(buckets) = intermediate_results.buckets {
|
||||
add_coverted_final_buckets_to_result(&mut results, buckets, &req.buckets)?
|
||||
} else {
|
||||
// When there are no buckets, we create empty buckets, so that the serialized json
|
||||
// format is constant
|
||||
add_empty_final_buckets_to_result(&mut results, &req.buckets)?
|
||||
};
|
||||
|
||||
if let Some(metrics) = intermediate_results.metrics {
|
||||
add_converted_final_metrics_to_result(&mut results, metrics);
|
||||
} else {
|
||||
// When there are no metrics, we create empty metric results, so that the serialized
|
||||
// json format is constant
|
||||
add_empty_final_metrics_to_result(&mut results, &req.metrics)?;
|
||||
}
|
||||
Ok(Self(results))
|
||||
}
|
||||
}
|
||||
|
||||
fn add_converted_final_metrics_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
metrics: VecWithNames<IntermediateMetricResult>,
|
||||
) {
|
||||
results.extend(
|
||||
metrics
|
||||
.into_iter()
|
||||
.map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))),
|
||||
);
|
||||
}
|
||||
|
||||
fn add_empty_final_metrics_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
req_metrics: &VecWithNames<MetricAggregation>,
|
||||
) -> crate::Result<()> {
|
||||
results.extend(req_metrics.iter().map(|(key, req)| {
|
||||
let empty_bucket = IntermediateMetricResult::empty_from_req(req);
|
||||
(
|
||||
key.to_string(),
|
||||
AggregationResult::MetricResult(empty_bucket.into()),
|
||||
)
|
||||
}));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn add_empty_final_buckets_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
req_buckets: &VecWithNames<BucketAggregationInternal>,
|
||||
) -> crate::Result<()> {
|
||||
let requested_buckets = req_buckets.iter();
|
||||
for (key, req) in requested_buckets {
|
||||
let empty_bucket = AggregationResult::BucketResult(BucketResult::empty_from_req(req)?);
|
||||
results.insert(key.to_string(), empty_bucket);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn add_coverted_final_buckets_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
buckets: VecWithNames<IntermediateBucketResult>,
|
||||
req_buckets: &VecWithNames<BucketAggregationInternal>,
|
||||
) -> crate::Result<()> {
|
||||
assert_eq!(buckets.len(), req_buckets.len());
|
||||
|
||||
let buckets_with_request = buckets.into_iter().zip(req_buckets.values());
|
||||
for ((key, bucket), req) in buckets_with_request {
|
||||
let result =
|
||||
AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(bucket, req)?);
|
||||
results.insert(key, result);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -41,15 +144,20 @@ pub enum AggregationResult {
|
||||
/// Metric result variant.
|
||||
MetricResult(MetricResult),
|
||||
}
|
||||
impl From<IntermediateAggregationResult> for AggregationResult {
|
||||
fn from(tree: IntermediateAggregationResult) -> Self {
|
||||
match tree {
|
||||
IntermediateAggregationResult::Bucket(bucket) => {
|
||||
AggregationResult::BucketResult(bucket.into())
|
||||
}
|
||||
IntermediateAggregationResult::Metric(metric) => {
|
||||
AggregationResult::MetricResult(metric.into())
|
||||
}
|
||||
|
||||
impl AggregationResult {
|
||||
pub(crate) fn get_value_from_aggregation(
|
||||
&self,
|
||||
_name: &str,
|
||||
agg_property: &str,
|
||||
) -> crate::Result<Option<f64>> {
|
||||
match self {
|
||||
AggregationResult::BucketResult(_bucket) => Err(TantivyError::InternalError(
|
||||
"Tried to retrieve value from bucket aggregation. This is not supported and \
|
||||
should not happen during collection, but should be catched during validation"
|
||||
.to_string(),
|
||||
)),
|
||||
AggregationResult::MetricResult(metric) => metric.get_value(agg_property),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -64,6 +172,14 @@ pub enum MetricResult {
|
||||
Stats(Stats),
|
||||
}
|
||||
|
||||
impl MetricResult {
|
||||
fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
|
||||
match self {
|
||||
MetricResult::Average(avg) => Ok(avg.value),
|
||||
MetricResult::Stats(stats) => stats.get_value(agg_property),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl From<IntermediateMetricResult> for MetricResult {
|
||||
fn from(metric: IntermediateMetricResult) -> Self {
|
||||
match metric {
|
||||
@@ -81,35 +197,148 @@ impl From<IntermediateMetricResult> for MetricResult {
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum BucketResult {
|
||||
/// This is the default entry for a bucket, which contains a key, count, and optionally
|
||||
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
|
||||
/// sub_aggregations.
|
||||
Range {
|
||||
/// The range buckets sorted by range.
|
||||
buckets: Vec<RangeBucketEntry>,
|
||||
},
|
||||
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
Histogram {
|
||||
/// The buckets.
|
||||
///
|
||||
/// If there are holes depends on the request, if min_doc_count is 0, then there are no
|
||||
/// holes between the first and last bucket.
|
||||
/// See [HistogramAggregation](super::bucket::HistogramAggregation)
|
||||
buckets: Vec<BucketEntry>,
|
||||
},
|
||||
/// This is the term result
|
||||
Terms {
|
||||
/// The buckets.
|
||||
///
|
||||
/// See [TermsAggregation](super::bucket::TermsAggregation)
|
||||
buckets: Vec<BucketEntry>,
|
||||
/// The number of documents that didn't make it into the top N due to shard_size or size
|
||||
sum_other_doc_count: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// The upper bound error for the doc count of each term.
|
||||
doc_count_error_upper_bound: Option<u64>,
|
||||
},
|
||||
}
|
||||
|
||||
impl From<IntermediateBucketResult> for BucketResult {
|
||||
fn from(result: IntermediateBucketResult) -> Self {
|
||||
match result {
|
||||
IntermediateBucketResult::Range(range_map) => {
|
||||
let mut buckets: Vec<RangeBucketEntry> = range_map
|
||||
.into_iter()
|
||||
.map(|(_, bucket)| bucket.into())
|
||||
.collect_vec();
|
||||
impl BucketResult {
|
||||
pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result<Self> {
|
||||
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
|
||||
|
||||
buckets.sort_by(|a, b| {
|
||||
a.from
|
||||
Ok(BucketResult::from_intermediate_and_req(empty_bucket, req)?)
|
||||
}
|
||||
|
||||
fn from_intermediate_and_req(
|
||||
bucket_result: IntermediateBucketResult,
|
||||
req: &BucketAggregationInternal,
|
||||
) -> crate::Result<Self> {
|
||||
match bucket_result {
|
||||
IntermediateBucketResult::Range(range_res) => {
|
||||
let mut buckets: Vec<RangeBucketEntry> = range_res
|
||||
.buckets
|
||||
.into_iter()
|
||||
.map(|(_, bucket)| {
|
||||
RangeBucketEntry::from_intermediate_and_req(bucket, &req.sub_aggregation)
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
buckets.sort_by(|left, right| {
|
||||
// TODO use total_cmp next stable rust release
|
||||
left.from
|
||||
.unwrap_or(f64::MIN)
|
||||
.partial_cmp(&b.from.unwrap_or(f64::MIN))
|
||||
.partial_cmp(&right.from.unwrap_or(f64::MIN))
|
||||
.unwrap_or(Ordering::Equal)
|
||||
});
|
||||
BucketResult::Range { buckets }
|
||||
Ok(BucketResult::Range { buckets })
|
||||
}
|
||||
IntermediateBucketResult::Histogram { buckets } => {
|
||||
let buckets = intermediate_buckets_to_final_buckets(
|
||||
buckets,
|
||||
req.as_histogram()
|
||||
.expect("unexpected aggregation, expected histogram aggregation"),
|
||||
&req.sub_aggregation,
|
||||
)?;
|
||||
|
||||
Ok(BucketResult::Histogram { buckets })
|
||||
}
|
||||
IntermediateBucketResult::Terms(terms) => terms.into_final_result(
|
||||
req.as_term()
|
||||
.expect("unexpected aggregation, expected term aggregation"),
|
||||
&req.sub_aggregation,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the default entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// ...
|
||||
/// "my_histogram": {
|
||||
/// "buckets": [
|
||||
/// {
|
||||
/// "key": "2.0",
|
||||
/// "doc_count": 5
|
||||
/// },
|
||||
/// {
|
||||
/// "key": "4.0",
|
||||
/// "doc_count": 2
|
||||
/// },
|
||||
/// {
|
||||
/// "key": "6.0",
|
||||
/// "doc_count": 3
|
||||
/// }
|
||||
/// ]
|
||||
/// }
|
||||
/// ...
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct BucketEntry {
|
||||
/// The identifier of the bucket.
|
||||
pub key: Key,
|
||||
/// Number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
#[serde(flatten)]
|
||||
/// Sub-aggregations in this bucket.
|
||||
pub sub_aggregation: AggregationResults,
|
||||
}
|
||||
|
||||
impl BucketEntry {
|
||||
pub(crate) fn from_intermediate_and_req(
|
||||
entry: IntermediateHistogramBucketEntry,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<Self> {
|
||||
Ok(BucketEntry {
|
||||
key: Key::F64(entry.key),
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
|
||||
entry.sub_aggregation,
|
||||
req,
|
||||
)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
impl GetDocCount for &BucketEntry {
|
||||
fn doc_count(&self) -> u64 {
|
||||
self.doc_count
|
||||
}
|
||||
}
|
||||
impl GetDocCount for BucketEntry {
|
||||
fn doc_count(&self) -> u64 {
|
||||
self.doc_count
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the range entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
///
|
||||
@@ -139,7 +368,7 @@ impl From<IntermediateBucketResult> for BucketResult {
|
||||
/// }
|
||||
/// ...
|
||||
/// }
|
||||
/// ```
|
||||
/// ```
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RangeBucketEntry {
|
||||
/// The identifier of the bucket.
|
||||
@@ -157,14 +386,20 @@ pub struct RangeBucketEntry {
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl From<IntermediateRangeBucketEntry> for RangeBucketEntry {
|
||||
fn from(entry: IntermediateRangeBucketEntry) -> Self {
|
||||
RangeBucketEntry {
|
||||
impl RangeBucketEntry {
|
||||
fn from_intermediate_and_req(
|
||||
entry: IntermediateRangeBucketEntry,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<Self> {
|
||||
Ok(RangeBucketEntry {
|
||||
key: entry.key,
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: entry.sub_aggregation.into(),
|
||||
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
|
||||
entry.sub_aggregation,
|
||||
req,
|
||||
)?,
|
||||
to: entry.to,
|
||||
from: entry.from,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
1367  src/aggregation/bucket/histogram/histogram.rs  (new file)

2  src/aggregation/bucket/histogram/mod.rs  (new file)
@@ -0,0 +1,2 @@
+mod histogram;
+pub use histogram::*;
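The new histogram module above backs the histogram aggregation announced in the changelog. A hedged sketch of a request, following the Elasticsearch-compatible JSON layout used by the other bucket aggregations in this PR (the `price` field and the interval are illustrative, and optional parameters are omitted):

```rust
use tantivy::aggregation::agg_req::Aggregations;

fn histogram_request() -> serde_json::Result<Aggregations> {
    // Fixed-width buckets of 10.0 over the fast field "price".
    serde_json::from_str(
        r#"{ "price_histogram": { "histogram": { "field": "price", "interval": 10.0 } } }"#,
    )
}
```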
@@ -7,7 +7,134 @@
|
||||
//! Results of intermediate buckets are
|
||||
//! [IntermediateBucketResult](super::intermediate_agg_result::IntermediateBucketResult)
|
||||
|
||||
mod histogram;
|
||||
mod range;
|
||||
mod term_agg;
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub(crate) use histogram::SegmentHistogramCollector;
|
||||
pub use histogram::*;
|
||||
pub(crate) use range::SegmentRangeCollector;
|
||||
pub use range::*;
|
||||
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
|
||||
pub use term_agg::*;
|
||||
|
||||
/// Order for buckets in a bucket aggregation.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum Order {
|
||||
/// Asc order
|
||||
#[serde(rename = "asc")]
|
||||
Asc,
|
||||
/// Desc order
|
||||
#[serde(rename = "desc")]
|
||||
Desc,
|
||||
}
|
||||
|
||||
impl Default for Order {
|
||||
fn default() -> Self {
|
||||
Order::Desc
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
/// Order property by which to apply the order
|
||||
pub enum OrderTarget {
|
||||
/// The key of the bucket
|
||||
Key,
|
||||
/// The doc count of the bucket
|
||||
Count,
|
||||
/// Order by the value of the sub-aggregation metric identified by the given `String`.
|
||||
///
|
||||
/// Only single value metrics are supported currently
|
||||
SubAggregation(String),
|
||||
}
|
||||
|
||||
impl Default for OrderTarget {
|
||||
fn default() -> Self {
|
||||
OrderTarget::Count
|
||||
}
|
||||
}
|
||||
impl From<&str> for OrderTarget {
|
||||
fn from(val: &str) -> Self {
|
||||
match val {
|
||||
"_key" => OrderTarget::Key,
|
||||
"_count" => OrderTarget::Count,
|
||||
_ => OrderTarget::SubAggregation(val.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ToString for OrderTarget {
|
||||
fn to_string(&self) -> String {
|
||||
match self {
|
||||
OrderTarget::Key => "_key".to_string(),
|
||||
OrderTarget::Count => "_count".to_string(),
|
||||
OrderTarget::SubAggregation(agg) => agg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the order. target is either "_count", "_key", or the name of
|
||||
/// a metric sub_aggregation.
|
||||
///
|
||||
/// De/Serializes to elasticsearch compatible JSON.
|
||||
///
|
||||
/// Examples in JSON format:
|
||||
/// { "_count": "asc" }
|
||||
/// { "_key": "asc" }
|
||||
/// { "average_price": "asc" }
|
||||
#[derive(Clone, Default, Debug, PartialEq)]
|
||||
pub struct CustomOrder {
|
||||
/// The target property by which to sort by
|
||||
pub target: OrderTarget,
|
||||
/// The order asc or desc
|
||||
pub order: Order,
|
||||
}
|
||||
|
||||
impl Serialize for CustomOrder {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer {
|
||||
let map: HashMap<String, Order> =
|
||||
std::iter::once((self.target.to_string(), self.order)).collect();
|
||||
map.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for CustomOrder {
|
||||
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
|
||||
where D: Deserializer<'de> {
|
||||
HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
|
||||
if let Some((key, value)) = map.into_iter().next() {
|
||||
Ok(CustomOrder {
|
||||
target: key.as_str().into(),
|
||||
order: value,
|
||||
})
|
||||
} else {
|
||||
Err(de::Error::custom(
|
||||
"unexpected empty map in order".to_string(),
|
||||
))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn custom_order_serde_test() {
|
||||
let order = CustomOrder {
|
||||
target: OrderTarget::Key,
|
||||
order: Order::Desc,
|
||||
};
|
||||
|
||||
let order_str = serde_json::to_string(&order).unwrap();
|
||||
assert_eq!(order_str, "{\"_key\":\"desc\"}");
|
||||
let order_deser = serde_json::from_str(&order_str).unwrap();
|
||||
|
||||
assert_eq!(order, order_deser);
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
|
||||
assert!(order_deser.is_err());
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
|
||||
assert!(order_deser.is_err());
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Range;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -5,10 +6,10 @@ use serde::{Deserialize, Serialize};
|
||||
use crate::aggregation::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor,
|
||||
};
|
||||
use crate::aggregation::intermediate_agg_result::IntermediateBucketResult;
|
||||
use crate::aggregation::segment_agg_result::{
|
||||
SegmentAggregationResultsCollector, SegmentRangeBucketEntry,
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Type;
|
||||
@@ -30,20 +31,25 @@ use crate::{DocId, TantivyError};
|
||||
/// [crate::aggregation::intermediate_agg_result::IntermediateRangeBucketEntry] on the
|
||||
/// DistributedAggregationCollector.
|
||||
///
|
||||
/// # Limitations/Compatibility
|
||||
/// Overlapping ranges are not yet supported.
|
||||
///
|
||||
/// The keyed parameter (elasticsearch) is not yet supported.
|
||||
///
|
||||
/// # Request JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "range": {
|
||||
/// "my_ranges": {
|
||||
/// "field": "score",
|
||||
/// "ranges": [
|
||||
/// { "to": 3.0 },
|
||||
/// { "from": 3.0, "to": 7.0 },
|
||||
/// { "from": 7.0, "to": 20.0 }
|
||||
/// { "from": 7.0, "to": 20.0 },
|
||||
/// { "from": 20.0 }
|
||||
/// ]
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RangeAggregation {
|
||||
/// The field to aggregate on.
|
||||
@@ -97,22 +103,72 @@ pub struct SegmentRangeCollector {
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentRangeBucketEntry {
|
||||
pub key: Key,
|
||||
pub doc_count: u64,
|
||||
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None. Open interval, `to` is not
|
||||
/// inclusive.
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl Debug for SegmentRangeBucketEntry {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentRangeBucketEntry")
|
||||
.field("key", &self.key)
|
||||
.field("doc_count", &self.doc_count)
|
||||
.field("from", &self.from)
|
||||
.field("to", &self.to)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
impl SegmentRangeBucketEntry {
|
||||
pub(crate) fn into_intermediate_bucket_entry(
|
||||
self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<IntermediateRangeBucketEntry> {
|
||||
let sub_aggregation = if let Some(sub_aggregation) = self.sub_aggregation {
|
||||
sub_aggregation.into_intermediate_aggregations_result(agg_with_accessor)?
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
Ok(IntermediateRangeBucketEntry {
|
||||
key: self.key,
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation,
|
||||
from: self.from,
|
||||
to: self.to,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentRangeCollector {
|
||||
pub fn into_intermediate_bucket_result(self) -> IntermediateBucketResult {
|
||||
pub fn into_intermediate_bucket_result(
|
||||
self,
|
||||
agg_with_accessor: &BucketAggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
let field_type = self.field_type;
|
||||
|
||||
let buckets = self
|
||||
.buckets
|
||||
.into_iter()
|
||||
.map(move |range_bucket| {
|
||||
(
|
||||
Ok((
|
||||
range_to_string(&range_bucket.range, &field_type),
|
||||
range_bucket.bucket.into(),
|
||||
)
|
||||
range_bucket
|
||||
.bucket
|
||||
.into_intermediate_bucket_entry(&agg_with_accessor.sub_aggregation)?,
|
||||
))
|
||||
})
|
||||
.collect();
|
||||
.collect::<crate::Result<_>>()?;
|
||||
|
||||
IntermediateBucketResult::Range(buckets)
|
||||
Ok(IntermediateBucketResult::Range(
|
||||
IntermediateRangeBucketResult { buckets },
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(
|
||||
@@ -170,11 +226,15 @@ impl SegmentRangeCollector {
|
||||
force_flush: bool,
|
||||
) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
let accessor = bucket_with_accessor
|
||||
.accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinatility");
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = bucket_with_accessor.accessor.get(docs[0]);
|
||||
let val2 = bucket_with_accessor.accessor.get(docs[1]);
|
||||
let val3 = bucket_with_accessor.accessor.get(docs[2]);
|
||||
let val4 = bucket_with_accessor.accessor.get(docs[3]);
|
||||
let val1 = accessor.get(docs[0]);
|
||||
let val2 = accessor.get(docs[1]);
|
||||
let val3 = accessor.get(docs[2]);
|
||||
let val4 = accessor.get(docs[3]);
|
||||
let bucket_pos1 = self.get_bucket_pos(val1);
|
||||
let bucket_pos2 = self.get_bucket_pos(val2);
|
||||
let bucket_pos3 = self.get_bucket_pos(val3);
|
||||
@@ -186,7 +246,7 @@ impl SegmentRangeCollector {
|
||||
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation);
|
||||
}
|
||||
for doc in iter.remainder() {
|
||||
let val = bucket_with_accessor.accessor.get(*doc);
|
||||
let val = accessor.get(*doc);
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation);
|
||||
}
|
||||
@@ -341,7 +401,8 @@ mod tests {
|
||||
ranges,
|
||||
};
|
||||
|
||||
SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type).unwrap()
|
||||
SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type)
|
||||
.expect("unexpected error")
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -482,11 +543,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_f64() {
|
||||
let ranges = vec![
|
||||
//(f64::MIN..10.0).into(),
|
||||
(10.0..100.0).into(),
|
||||
//(100.0..f64::MAX).into(),
|
||||
];
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
|
||||
let collector = get_collector_from_ranges(ranges, Type::F64);
|
||||
let search = |val: u64| collector.get_bucket_pos(val);
|
||||
|
||||
1274  src/aggregation/bucket/term_agg.rs  (new file)
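src/aggregation/bucket/term_agg.rs is a new 1,274-line file whose body is not shown here. As an illustration of how the terms aggregation composes with sub-aggregations, the sketch below assumes the Elasticsearch-style `aggs` and `avg` key names; those exact names are not confirmed by this diff:

```rust
use tantivy::aggregation::agg_req::Aggregations;

fn terms_with_sub_agg() -> serde_json::Result<Aggregations> {
    // One terms bucket per "genre" value, with an average metric computed inside each bucket.
    serde_json::from_str(
        r#"{
            "by_genre": {
                "terms": { "field": "genre" },
                "aggs": { "avg_score": { "avg": { "field": "score" } } }
            }
        }"#,
    )
}
```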
@@ -5,7 +5,7 @@ use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use super::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_validate;
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::{SegmentReader, TantivyError};
|
||||
use crate::SegmentReader;
|
||||
|
||||
/// Collector for aggregations.
|
||||
///
|
||||
@@ -75,13 +75,7 @@ impl Collector for AggregationCollector {
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
let aggs_with_accessor = get_aggs_with_accessor_and_validate(&self.agg, reader)?;
|
||||
let result =
|
||||
SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?;
|
||||
Ok(AggregationSegmentCollector {
|
||||
aggs: aggs_with_accessor,
|
||||
result,
|
||||
})
|
||||
AggregationSegmentCollector::from_agg_req_and_reader(&self.agg, reader)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
@@ -92,28 +86,28 @@ impl Collector for AggregationCollector {
|
||||
&self,
|
||||
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<Self::Fruit> {
|
||||
merge_fruits(segment_fruits).map(|res| res.into())
|
||||
let res = merge_fruits(segment_fruits)?;
|
||||
AggregationResults::from_intermediate_and_req(res, self.agg.clone())
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
mut segment_fruits: Vec<IntermediateAggregationResults>,
|
||||
mut segment_fruits: Vec<crate::Result<IntermediateAggregationResults>>,
|
||||
) -> crate::Result<IntermediateAggregationResults> {
|
||||
if let Some(mut fruit) = segment_fruits.pop() {
|
||||
if let Some(fruit) = segment_fruits.pop() {
|
||||
let mut fruit = fruit?;
|
||||
for next_fruit in segment_fruits {
|
||||
fruit.merge_fruits(&next_fruit);
|
||||
fruit.merge_fruits(next_fruit?);
|
||||
}
|
||||
Ok(fruit)
|
||||
} else {
|
||||
Err(TantivyError::InvalidArgument(
|
||||
"no fruits provided in merge_fruits".to_string(),
|
||||
))
|
||||
Ok(IntermediateAggregationResults::default())
|
||||
}
|
||||
}
|
||||
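// Std-only sketch of the new merge behaviour: an empty fruit list now yields a default
// value instead of an error, and any per-segment error is propagated with `?`.
fn merge_counts(mut per_segment: Vec<Result<u64, String>>) -> Result<u64, String> {
    if let Some(first) = per_segment.pop() {
        let mut total = first?;
        for next in per_segment {
            total += next?;
        }
        Ok(total)
    } else {
        // mirrors returning IntermediateAggregationResults::default()
        Ok(0)
    }
}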
|
||||
/// AggregationSegmentCollector does the aggregation collection on a segment.
|
||||
pub struct AggregationSegmentCollector {
|
||||
aggs: AggregationsWithAccessor,
|
||||
aggs_with_accessor: AggregationsWithAccessor,
|
||||
result: SegmentAggregationResultsCollector,
|
||||
}
|
||||
|
||||
@@ -128,22 +122,24 @@ impl AggregationSegmentCollector {
|
||||
let result =
|
||||
SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?;
|
||||
Ok(AggregationSegmentCollector {
|
||||
aggs: aggs_with_accessor,
|
||||
aggs_with_accessor,
|
||||
result,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for AggregationSegmentCollector {
|
||||
type Fruit = IntermediateAggregationResults;
|
||||
type Fruit = crate::Result<IntermediateAggregationResults>;
|
||||
|
||||
#[inline]
|
||||
fn collect(&mut self, doc: crate::DocId, _score: crate::Score) {
|
||||
self.result.collect(doc, &self.aggs);
|
||||
self.result.collect(doc, &self.aggs_with_accessor);
|
||||
}
|
||||
|
||||
fn harvest(mut self) -> Self::Fruit {
|
||||
self.result.flush_staged_docs(&self.aggs, true);
|
||||
self.result.into()
|
||||
self.result
|
||||
.flush_staged_docs(&self.aggs_with_accessor, true);
|
||||
self.result
|
||||
.into_intermediate_aggregations_result(&self.aggs_with_accessor)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,43 +2,88 @@
|
||||
//! Intermediate aggregation results can be used to merge results between segments or between
|
||||
//! indices.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::metric::{IntermediateAverage, IntermediateStats};
|
||||
use super::segment_agg_result::{
|
||||
SegmentAggregationResultsCollector, SegmentBucketResultCollector, SegmentMetricResultCollector,
|
||||
SegmentRangeBucketEntry,
|
||||
use super::agg_req::{AggregationsInternal, BucketAggregationType, MetricAggregation};
|
||||
use super::agg_result::BucketResult;
|
||||
use super::bucket::{
|
||||
cut_off_buckets, get_agg_name_and_property, GetDocCount, Order, OrderTarget,
|
||||
SegmentHistogramBucketEntry, TermsAggregation,
|
||||
};
|
||||
use super::metric::{IntermediateAverage, IntermediateStats};
|
||||
use super::segment_agg_result::SegmentMetricResultCollector;
|
||||
use super::{Key, SerializedKey, VecWithNames};
|
||||
use crate::aggregation::agg_result::{AggregationResults, BucketEntry};
|
||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||
|
||||
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
||||
/// intermediate results.
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateAggregationResults(pub(crate) VecWithNames<IntermediateAggregationResult>);
|
||||
|
||||
impl From<SegmentAggregationResultsCollector> for IntermediateAggregationResults {
|
||||
fn from(tree: SegmentAggregationResultsCollector) -> Self {
|
||||
let mut data = vec![];
|
||||
for (key, bucket) in tree.buckets.into_iter() {
|
||||
data.push((key, IntermediateAggregationResult::Bucket(bucket.into())));
|
||||
}
|
||||
for (key, metric) in tree.metrics.into_iter() {
|
||||
data.push((key, IntermediateAggregationResult::Metric(metric.into())));
|
||||
}
|
||||
Self(VecWithNames::from_entries(data))
|
||||
}
|
||||
pub struct IntermediateAggregationResults {
|
||||
pub(crate) metrics: Option<VecWithNames<IntermediateMetricResult>>,
|
||||
pub(crate) buckets: Option<VecWithNames<IntermediateBucketResult>>,
|
||||
}
|
||||
|
||||
impl IntermediateAggregationResults {
|
||||
pub(crate) fn empty_from_req(req: &AggregationsInternal) -> Self {
|
||||
let metrics = if req.metrics.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let metrics = req
|
||||
.metrics
|
||||
.iter()
|
||||
.map(|(key, req)| {
|
||||
(
|
||||
key.to_string(),
|
||||
IntermediateMetricResult::empty_from_req(req),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
Some(VecWithNames::from_entries(metrics))
|
||||
};
|
||||
|
||||
let buckets = if req.buckets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let buckets = req
|
||||
.buckets
|
||||
.iter()
|
||||
.map(|(key, req)| {
|
||||
(
|
||||
key.to_string(),
|
||||
IntermediateBucketResult::empty_from_req(&req.bucket_agg),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
Some(VecWithNames::from_entries(buckets))
|
||||
};
|
||||
|
||||
Self { metrics, buckets }
|
||||
}
|
||||
|
||||
/// Merge an other intermediate aggregation result into this result.
|
||||
///
|
||||
    /// The order of the values needs to be the same on both results. This is ensured when the
    /// same keys are present on the underlying VecWithNames struct.
|
||||
pub fn merge_fruits(&mut self, other: &IntermediateAggregationResults) {
|
||||
for (tree_left, tree_right) in self.0.values_mut().zip(other.0.values()) {
|
||||
tree_left.merge_fruits(tree_right);
|
||||
pub fn merge_fruits(&mut self, other: IntermediateAggregationResults) {
|
||||
if let (Some(buckets_left), Some(buckets_right)) = (&mut self.buckets, other.buckets) {
|
||||
for (bucket_left, bucket_right) in
|
||||
buckets_left.values_mut().zip(buckets_right.into_values())
|
||||
{
|
||||
bucket_left.merge_fruits(bucket_right);
|
||||
}
|
||||
}
|
||||
|
||||
if let (Some(metrics_left), Some(metrics_right)) = (&mut self.metrics, other.metrics) {
|
||||
for (metric_left, metric_right) in
|
||||
metrics_left.values_mut().zip(metrics_right.into_values())
|
||||
{
|
||||
metric_left.merge_fruits(metric_right);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
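// Std-only sketch (hypothetical data) of why the positional zip above is sound: both sides
// are built from the same aggregation request, and VecWithNames keeps entries sorted by key,
// so the n-th entry on the left lines up with the n-th entry on the right.
fn merge_doc_counts(left: &mut Vec<(String, u64)>, right: Vec<(String, u64)>) {
    for (l, r) in left.iter_mut().zip(right.into_iter()) {
        debug_assert_eq!(l.0, r.0, "entries must be aligned by key");
        l.1 += r.1;
    }
}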
@@ -52,28 +97,6 @@ pub enum IntermediateAggregationResult {
|
||||
Metric(IntermediateMetricResult),
|
||||
}
|
||||
|
||||
impl IntermediateAggregationResult {
|
||||
fn merge_fruits(&mut self, other: &IntermediateAggregationResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateAggregationResult::Bucket(res_left),
|
||||
IntermediateAggregationResult::Bucket(res_right),
|
||||
) => {
|
||||
res_left.merge_fruits(res_right);
|
||||
}
|
||||
(
|
||||
IntermediateAggregationResult::Metric(res_left),
|
||||
IntermediateAggregationResult::Metric(res_right),
|
||||
) => {
|
||||
res_left.merge_fruits(res_right);
|
||||
}
|
||||
_ => {
|
||||
panic!("incompatible types in aggregation tree on merge fruits");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds the intermediate data for metric results
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateMetricResult {
|
||||
@@ -97,7 +120,17 @@ impl From<SegmentMetricResultCollector> for IntermediateMetricResult {
|
||||
}
|
||||
|
||||
impl IntermediateMetricResult {
|
||||
fn merge_fruits(&mut self, other: &IntermediateMetricResult) {
|
||||
pub(crate) fn empty_from_req(req: &MetricAggregation) -> Self {
|
||||
match req {
|
||||
MetricAggregation::Average(_) => {
|
||||
IntermediateMetricResult::Average(IntermediateAverage::default())
|
||||
}
|
||||
MetricAggregation::Stats(_) => {
|
||||
IntermediateMetricResult::Stats(IntermediateStats::default())
|
||||
}
|
||||
}
|
||||
}
|
||||
fn merge_fruits(&mut self, other: IntermediateMetricResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateMetricResult::Average(avg_data_left),
|
||||
@@ -112,7 +145,7 @@ impl IntermediateMetricResult {
|
||||
stats_left.merge_fruits(stats_right);
|
||||
}
|
||||
_ => {
|
||||
panic!("incompatible fruit types in tree {:?}", other);
|
||||
panic!("incompatible fruit types in tree");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -124,36 +157,227 @@ impl IntermediateMetricResult {
|
||||
pub enum IntermediateBucketResult {
|
||||
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
|
||||
/// sub_aggregations.
|
||||
Range(HashMap<SerializedKey, IntermediateRangeBucketEntry>),
|
||||
Range(IntermediateRangeBucketResult),
|
||||
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
Histogram {
|
||||
/// The buckets
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
},
|
||||
/// Term aggregation
|
||||
Terms(IntermediateTermBucketResult),
|
||||
}
|
||||
|
||||
impl From<SegmentBucketResultCollector> for IntermediateBucketResult {
|
||||
fn from(collector: SegmentBucketResultCollector) -> Self {
|
||||
match collector {
|
||||
SegmentBucketResultCollector::Range(range) => range.into_intermediate_bucket_result(),
|
||||
impl IntermediateBucketResult {
|
||||
pub(crate) fn empty_from_req(req: &BucketAggregationType) -> Self {
|
||||
match req {
|
||||
BucketAggregationType::Terms(_) => IntermediateBucketResult::Terms(Default::default()),
|
||||
BucketAggregationType::Range(_) => IntermediateBucketResult::Range(Default::default()),
|
||||
BucketAggregationType::Histogram(_) => {
|
||||
IntermediateBucketResult::Histogram { buckets: vec![] }
|
||||
}
|
||||
}
|
||||
}
|
||||
fn merge_fruits(&mut self, other: IntermediateBucketResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateBucketResult::Terms(term_res_left),
|
||||
IntermediateBucketResult::Terms(term_res_right),
|
||||
) => {
|
||||
merge_maps(&mut term_res_left.entries, term_res_right.entries);
|
||||
term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count;
|
||||
term_res_left.doc_count_error_upper_bound +=
|
||||
term_res_right.doc_count_error_upper_bound;
|
||||
}
|
||||
|
||||
(
|
||||
IntermediateBucketResult::Range(range_res_left),
|
||||
IntermediateBucketResult::Range(range_res_right),
|
||||
) => {
|
||||
merge_maps(&mut range_res_left.buckets, range_res_right.buckets);
|
||||
}
|
||||
(
|
||||
IntermediateBucketResult::Histogram {
|
||||
buckets: buckets_left,
|
||||
..
|
||||
},
|
||||
IntermediateBucketResult::Histogram {
|
||||
buckets: buckets_right,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
let buckets = buckets_left
|
||||
.drain(..)
|
||||
.merge_join_by(buckets_right.into_iter(), |left, right| {
|
||||
left.key.partial_cmp(&right.key).unwrap_or(Ordering::Equal)
|
||||
})
|
||||
.map(|either| match either {
|
||||
itertools::EitherOrBoth::Both(mut left, right) => {
|
||||
left.merge_fruits(right);
|
||||
left
|
||||
}
|
||||
itertools::EitherOrBoth::Left(left) => left,
|
||||
itertools::EitherOrBoth::Right(right) => right,
|
||||
})
|
||||
.collect();
|
||||
|
||||
*buckets_left = buckets;
|
||||
}
|
||||
(IntermediateBucketResult::Range(_), _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
(IntermediateBucketResult::Histogram { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
(IntermediateBucketResult::Terms { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
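// Std-only sketch of the sorted merge used for histogram buckets above: both lists are
// ordered by key, equal keys are combined, and keys present on only one side are kept
// unchanged (the role played by merge_join_by / EitherOrBoth in the real code).
fn merge_sorted(left: Vec<(f64, u64)>, right: Vec<(f64, u64)>) -> Vec<(f64, u64)> {
    let mut merged = Vec::with_capacity(left.len() + right.len());
    let mut l = left.into_iter().peekable();
    let mut r = right.into_iter().peekable();
    loop {
        match (l.peek(), r.peek()) {
            (Some(&(lk, _)), Some(&(rk, _))) if lk == rk => {
                let (key, count_left) = l.next().unwrap();
                let (_, count_right) = r.next().unwrap();
                merged.push((key, count_left + count_right));
            }
            (Some(&(lk, _)), Some(&(rk, _))) if lk < rk => merged.push(l.next().unwrap()),
            (Some(_), Some(_)) => merged.push(r.next().unwrap()),
            (Some(_), None) => merged.push(l.next().unwrap()),
            (None, Some(_)) => merged.push(r.next().unwrap()),
            (None, None) => break,
        }
    }
    merged
}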
|
||||
impl IntermediateBucketResult {
|
||||
fn merge_fruits(&mut self, other: &IntermediateBucketResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateBucketResult::Range(entries_left),
|
||||
IntermediateBucketResult::Range(entries_right),
|
||||
) => {
|
||||
for (name, entry_left) in entries_left.iter_mut() {
|
||||
if let Some(entry_right) = entries_right.get(name) {
|
||||
entry_left.merge_fruits(entry_right);
|
||||
}
|
||||
}
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// Intermediate result of a range aggregation, with buckets keyed by their serialized range.
|
||||
pub struct IntermediateRangeBucketResult {
|
||||
pub(crate) buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>,
|
||||
}
|
||||
|
||||
for (key, res) in entries_right.iter() {
|
||||
if !entries_left.contains_key(key) {
|
||||
entries_left.insert(key.clone(), res.clone());
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// Term aggregation including error counts
|
||||
pub struct IntermediateTermBucketResult {
|
||||
pub(crate) entries: FnvHashMap<String, IntermediateTermBucketEntry>,
|
||||
pub(crate) sum_other_doc_count: u64,
|
||||
pub(crate) doc_count_error_upper_bound: u64,
|
||||
}
|
||||
|
||||
impl IntermediateTermBucketResult {
|
||||
pub(crate) fn into_final_result(
|
||||
self,
|
||||
req: &TermsAggregation,
|
||||
sub_aggregation_req: &AggregationsInternal,
|
||||
) -> crate::Result<BucketResult> {
|
||||
let req = TermsAggregationInternal::from_req(req);
|
||||
let mut buckets: Vec<BucketEntry> = self
|
||||
.entries
|
||||
.into_iter()
|
||||
.filter(|bucket| bucket.1.doc_count >= req.min_doc_count)
|
||||
.map(|(key, entry)| {
|
||||
Ok(BucketEntry {
|
||||
key: Key::Str(key),
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
|
||||
entry.sub_aggregation,
|
||||
sub_aggregation_req,
|
||||
)?,
|
||||
})
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
|
||||
let order = req.order.order;
|
||||
match req.order.target {
|
||||
OrderTarget::Key => {
|
||||
buckets.sort_by(|left, right| {
|
||||
if req.order.order == Order::Desc {
|
||||
left.key.partial_cmp(&right.key)
|
||||
} else {
|
||||
right.key.partial_cmp(&left.key)
|
||||
}
|
||||
.expect("expected type string, which is always sortable")
|
||||
});
|
||||
}
|
||||
OrderTarget::Count => {
|
||||
if req.order.order == Order::Desc {
|
||||
buckets.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.doc_count()));
|
||||
} else {
|
||||
buckets.sort_unstable_by_key(|bucket| bucket.doc_count());
|
||||
}
|
||||
}
|
||||
OrderTarget::SubAggregation(name) => {
|
||||
let (agg_name, agg_property) = get_agg_name_and_property(&name);
|
||||
let mut buckets_with_val = buckets
|
||||
.into_iter()
|
||||
.map(|bucket| {
|
||||
let val = bucket
|
||||
.sub_aggregation
|
||||
.get_value_from_aggregation(agg_name, agg_property)?
|
||||
.unwrap_or(f64::NAN);
|
||||
Ok((bucket, val))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
buckets_with_val.sort_by(|(_, val1), (_, val2)| {
|
||||
// TODO use total_cmp in next rust stable release
|
||||
match &order {
|
||||
Order::Desc => val2.partial_cmp(val1).unwrap_or(std::cmp::Ordering::Equal),
|
||||
Order::Asc => val1.partial_cmp(val2).unwrap_or(std::cmp::Ordering::Equal),
|
||||
}
|
||||
});
|
||||
buckets = buckets_with_val
|
||||
.into_iter()
|
||||
.map(|(bucket, _val)| bucket)
|
||||
.collect_vec();
|
||||
}
|
||||
}
|
||||
|
||||
        // We ignore _term_doc_count_before_cutoff here, because it increases the upper-bound error
        // only for terms that didn't make it into the top N.
        //
        // That number can be interesting as a measure of result quality, but it is not suitable for
        // checking the actual error count of the returned terms.
|
||||
let (_term_doc_count_before_cutoff, sum_other_doc_count) =
|
||||
cut_off_buckets(&mut buckets, req.size as usize);
|
||||
|
||||
let doc_count_error_upper_bound = if req.show_term_doc_count_error {
|
||||
Some(self.doc_count_error_upper_bound)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(BucketResult::Terms {
|
||||
buckets,
|
||||
sum_other_doc_count: self.sum_other_doc_count + sum_other_doc_count,
|
||||
doc_count_error_upper_bound,
|
||||
})
|
||||
}
|
||||
}
|
||||
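// Hedged sketch of a terms request that exercises the ordering logic above; it mirrors the
// request construction used in the tests and benches below. "string_id" is an assumed fast
// text field, and all other TermsAggregation parameters are left at their defaults.
fn example_terms_request() -> Aggregations {
    vec![(
        "my_terms".to_string(),
        Aggregation::Bucket(BucketAggregation {
            bucket_agg: BucketAggregationType::Terms(TermsAggregation {
                field: "string_id".to_string(),
                ..Default::default()
            }),
            sub_aggregation: Default::default(),
        }),
    )]
    .into_iter()
    .collect()
}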
|
||||
trait MergeFruits {
|
||||
fn merge_fruits(&mut self, other: Self);
|
||||
}
|
||||
|
||||
fn merge_maps<V: MergeFruits + Clone>(
|
||||
entries_left: &mut FnvHashMap<SerializedKey, V>,
|
||||
mut entries_right: FnvHashMap<SerializedKey, V>,
|
||||
) {
|
||||
for (name, entry_left) in entries_left.iter_mut() {
|
||||
if let Some(entry_right) = entries_right.remove(name) {
|
||||
entry_left.merge_fruits(entry_right);
|
||||
}
|
||||
}
|
||||
|
||||
for (key, res) in entries_right.into_iter() {
|
||||
entries_left.entry(key).or_insert(res);
|
||||
}
|
||||
}
|
||||
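// Std-only sketch of the merge_maps semantics: entries with a shared key are merged, and
// entries present only on the right-hand side are moved over unchanged.
use std::collections::HashMap;

fn merge_count_maps(left: &mut HashMap<String, u64>, right: HashMap<String, u64>) {
    for (key, count) in right {
        *left.entry(key).or_insert(0) += count;
    }
}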
|
||||
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateHistogramBucketEntry {
|
||||
    /// The unique key the bucket is identified by.
|
||||
pub key: f64,
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
}
|
||||
|
||||
impl From<SegmentHistogramBucketEntry> for IntermediateHistogramBucketEntry {
|
||||
fn from(entry: SegmentHistogramBucketEntry) -> Self {
|
||||
IntermediateHistogramBucketEntry {
|
||||
key: entry.key,
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -166,7 +390,6 @@ pub struct IntermediateRangeBucketEntry {
|
||||
pub key: Key,
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
pub(crate) values: Option<Vec<u64>>,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
@@ -177,49 +400,54 @@ pub struct IntermediateRangeBucketEntry {
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl From<SegmentRangeBucketEntry> for IntermediateRangeBucketEntry {
|
||||
fn from(entry: SegmentRangeBucketEntry) -> Self {
|
||||
let sub_aggregation = if let Some(sub_aggregation) = entry.sub_aggregation {
|
||||
sub_aggregation.into()
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
// let sub_aggregation = entry.sub_aggregation.into();
|
||||
/// This is the term entry for a bucket, which contains a count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateTermBucketEntry {
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
}
|
||||
|
||||
IntermediateRangeBucketEntry {
|
||||
key: entry.key,
|
||||
doc_count: entry.doc_count,
|
||||
values: None,
|
||||
sub_aggregation,
|
||||
to: entry.to,
|
||||
from: entry.from,
|
||||
}
|
||||
impl MergeFruits for IntermediateTermBucketEntry {
|
||||
fn merge_fruits(&mut self, other: IntermediateTermBucketEntry) {
|
||||
self.doc_count += other.doc_count;
|
||||
self.sub_aggregation.merge_fruits(other.sub_aggregation);
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateRangeBucketEntry {
|
||||
fn merge_fruits(&mut self, other: &IntermediateRangeBucketEntry) {
|
||||
impl MergeFruits for IntermediateRangeBucketEntry {
|
||||
fn merge_fruits(&mut self, other: IntermediateRangeBucketEntry) {
|
||||
self.doc_count += other.doc_count;
|
||||
self.sub_aggregation.merge_fruits(&other.sub_aggregation);
|
||||
self.sub_aggregation.merge_fruits(other.sub_aggregation);
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeFruits for IntermediateHistogramBucketEntry {
|
||||
fn merge_fruits(&mut self, other: IntermediateHistogramBucketEntry) {
|
||||
self.doc_count += other.doc_count;
|
||||
self.sub_aggregation.merge_fruits(other.sub_aggregation);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults {
|
||||
let mut map = HashMap::new();
|
||||
let mut buckets = HashMap::new();
|
||||
let mut buckets = FnvHashMap::default();
|
||||
for (key, doc_count) in data {
|
||||
buckets.insert(
|
||||
key.to_string(),
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
values: None,
|
||||
sub_aggregation: Default::default(),
|
||||
from: None,
|
||||
to: None,
|
||||
@@ -228,21 +456,25 @@ mod tests {
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level2".to_string(),
|
||||
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(buckets)),
|
||||
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
|
||||
);
|
||||
IntermediateAggregationResults(VecWithNames::from_entries(map.into_iter().collect()))
|
||||
IntermediateAggregationResults {
|
||||
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
|
||||
metrics: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_test_tree(data: &[(String, u64, String, u64)]) -> IntermediateAggregationResults {
|
||||
fn get_intermediat_tree_with_ranges(
|
||||
data: &[(String, u64, String, u64)],
|
||||
) -> IntermediateAggregationResults {
|
||||
let mut map = HashMap::new();
|
||||
let mut buckets = HashMap::new();
|
||||
let mut buckets: FnvHashMap<_, _> = Default::default();
|
||||
for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data {
|
||||
buckets.insert(
|
||||
key.to_string(),
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
values: None,
|
||||
from: None,
|
||||
to: None,
|
||||
sub_aggregation: get_sub_test_tree(&[(
|
||||
@@ -254,25 +486,28 @@ mod tests {
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level1".to_string(),
|
||||
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(buckets)),
|
||||
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
|
||||
);
|
||||
IntermediateAggregationResults(VecWithNames::from_entries(map.into_iter().collect()))
|
||||
IntermediateAggregationResults {
|
||||
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
|
||||
metrics: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_1() {
|
||||
let mut tree_left = get_test_tree(&[
|
||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
let tree_right = get_test_tree(&[
|
||||
let tree_right = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 60, "1900".to_string(), 30),
|
||||
("blue".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
tree_left.merge_fruits(&tree_right);
|
||||
tree_left.merge_fruits(tree_right);
|
||||
|
||||
let tree_expected = get_test_tree(&[
|
||||
let tree_expected = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 110, "1900".to_string(), 55),
|
||||
("blue".to_string(), 55, "1900".to_string(), 80),
|
||||
]);
|
||||
@@ -282,18 +517,18 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_2() {
|
||||
let mut tree_left = get_test_tree(&[
|
||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
let tree_right = get_test_tree(&[
|
||||
let tree_right = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 60, "1900".to_string(), 30),
|
||||
("green".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
tree_left.merge_fruits(&tree_right);
|
||||
tree_left.merge_fruits(tree_right);
|
||||
|
||||
let tree_expected = get_test_tree(&[
|
||||
let tree_expected = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 110, "1900".to_string(), 55),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
("green".to_string(), 25, "1900".to_string(), 50),
|
||||
@@ -301,4 +536,18 @@ mod tests {
|
||||
|
||||
assert_eq!(tree_left, tree_expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_empty() {
|
||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
|
||||
let orig = tree_left.clone();
|
||||
|
||||
tree_left.merge_fruits(IntermediateAggregationResults::default());
|
||||
|
||||
assert_eq!(tree_left, orig);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,8 +19,8 @@ use crate::DocId;
|
||||
/// "avg": {
|
||||
/// "field": "score",
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
/// }
|
||||
/// ```
|
||||
pub struct AverageAggregation {
|
||||
/// The field name to compute the stats on.
|
||||
pub field: String,
|
||||
@@ -94,7 +94,7 @@ impl IntermediateAverage {
|
||||
}
|
||||
|
||||
/// Merge average data into this instance.
|
||||
pub fn merge_fruits(&mut self, other: &IntermediateAverage) {
|
||||
pub fn merge_fruits(&mut self, other: IntermediateAverage) {
|
||||
self.sum += other.sum;
|
||||
self.doc_count += other.doc_count;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::schema::Type;
|
||||
use crate::DocId;
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
/// A multi-value metric aggregation that computes stats of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
@@ -17,7 +17,7 @@ use crate::DocId;
|
||||
/// "field": "score",
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
/// ```
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct StatsAggregation {
|
||||
@@ -53,6 +53,23 @@ pub struct Stats {
|
||||
pub avg: Option<f64>,
|
||||
}
|
||||
|
||||
impl Stats {
|
||||
pub(crate) fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
|
||||
match agg_property {
|
||||
"count" => Ok(Some(self.count as f64)),
|
||||
"sum" => Ok(Some(self.sum)),
|
||||
"standard_deviation" => Ok(self.standard_deviation),
|
||||
"min" => Ok(self.min),
|
||||
"max" => Ok(self.max),
|
||||
"avg" => Ok(self.avg),
|
||||
_ => Err(TantivyError::InvalidArgument(format!(
|
||||
"unknown property {} on stats metric aggregation",
|
||||
agg_property
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
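// Hedged sketch of how the property lookup above is used for "order by sub-aggregation"
// targets such as "my_stats.avg"; the helper name is hypothetical and the concrete Stats
// values come from whatever was collected.
fn example_stats_lookup(stats: &Stats) -> crate::Result<()> {
    assert_eq!(stats.get_value("count")?, Some(stats.count as f64));
    assert!(stats.get_value("median").is_err());
    Ok(())
}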
|
||||
/// IntermediateStats contains the mergeable version for stats.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateStats {
|
||||
@@ -62,9 +79,8 @@ pub struct IntermediateStats {
|
||||
min: f64,
|
||||
max: f64,
|
||||
}
|
||||
|
||||
impl IntermediateStats {
|
||||
fn new() -> Self {
|
||||
impl Default for IntermediateStats {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
count: 0,
|
||||
sum: 0.0,
|
||||
@@ -73,7 +89,9 @@ impl IntermediateStats {
|
||||
max: f64::MIN,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateStats {
|
||||
pub(crate) fn avg(&self) -> Option<f64> {
|
||||
if self.count == 0 {
|
||||
None
|
||||
@@ -92,7 +110,7 @@ impl IntermediateStats {
|
||||
}
|
||||
|
||||
/// Merge data from other stats into this instance.
|
||||
pub fn merge_fruits(&mut self, other: &IntermediateStats) {
|
||||
pub fn merge_fruits(&mut self, other: IntermediateStats) {
|
||||
self.count += other.count;
|
||||
self.sum += other.sum;
|
||||
self.squared_sum += other.squared_sum;
|
||||
@@ -142,7 +160,7 @@ impl SegmentStatsCollector {
|
||||
pub fn from_req(field_type: Type) -> Self {
|
||||
Self {
|
||||
field_type,
|
||||
stats: IntermediateStats::new(),
|
||||
stats: IntermediateStats::default(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
|
||||
@@ -182,12 +200,50 @@ mod tests {
|
||||
};
|
||||
use crate::aggregation::agg_result::AggregationResults;
|
||||
use crate::aggregation::metric::StatsAggregation;
|
||||
use crate::aggregation::tests::get_test_index_2_segments;
|
||||
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values};
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::query::TermQuery;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::Term;
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_stats_empty_index() -> crate::Result<()> {
|
||||
// test index without segments
|
||||
let values = vec![];
|
||||
|
||||
let index = get_test_index_from_values(false, &values)?;
|
||||
|
||||
let agg_req_1: Aggregations = vec![(
|
||||
"stats".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score".to_string(),
|
||||
))),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
assert_eq!(
|
||||
res["stats"],
|
||||
json!({
|
||||
"avg": Value::Null,
|
||||
"count": 0,
|
||||
"max": Value::Null,
|
||||
"min": Value::Null,
|
||||
"standard_deviation": Value::Null,
|
||||
"sum": 0.0
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_stats() -> crate::Result<()> {
|
||||
let index = get_test_index_2_segments(false)?;
|
||||
|
||||
@@ -18,6 +18,10 @@
|
||||
//! Create an [AggregationCollector] from this request. AggregationCollector implements the
|
||||
//! `Collector` trait and can be passed as collector into `searcher.search()`.
|
||||
//!
|
||||
//! #### Limitations
|
||||
//!
|
||||
//! Currently aggregations work only on single value fast fields of type u64, f64 and i64.
|
||||
//!
|
||||
//! # JSON Format
|
||||
//! Aggregations request and result structures de/serialize into elasticsearch compatible JSON.
|
||||
//!
|
||||
@@ -28,9 +32,15 @@
|
||||
//! let agg_res = searcher.search(&term_query, &collector).unwrap_err();
|
||||
//! let json_response_string: String = &serde_json::to_string(&agg_res)?;
|
||||
//! ```
|
||||
//! # Limitations
|
||||
//!
|
||||
//! Currently aggregations work only on single value fast fields of type u64, f64 and i64.
|
||||
//! # Supported Aggregations
|
||||
//! - [Bucket](bucket)
|
||||
//! - [Histogram](bucket::HistogramAggregation)
|
||||
//! - [Range](bucket::RangeAggregation)
|
||||
//! - [Terms](bucket::TermsAggregation)
|
||||
//! - [Metric](metric)
|
||||
//! - [Average](metric::AverageAggregation)
|
||||
//! - [Stats](metric::StatsAggregation)
|
||||
//!
|
||||
//! # Example
|
||||
//! Compute the average metric, by building [agg_req::Aggregations], which is built from an (String,
|
||||
@@ -90,7 +100,8 @@
|
||||
//! }
|
||||
//! }
|
||||
//! "#;
|
||||
//! let agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
|
||||
//! let agg_req: Aggregations =
|
||||
//! serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
|
||||
//! ```
|
||||
//! # Code Organization
|
||||
//!
|
||||
@@ -101,8 +112,7 @@
|
||||
//! Buckets can contain sub-aggregations. In this example we create buckets with the range
|
||||
//! aggregation and then calculate the average on each bucket.
|
||||
//! ```
|
||||
//! use tantivy::aggregation::agg_req::{Aggregations, Aggregation, BucketAggregation,
|
||||
//! MetricAggregation, BucketAggregationType};
|
||||
//! use tantivy::aggregation::agg_req::*;
|
||||
//! use tantivy::aggregation::metric::AverageAggregation;
|
||||
//! use tantivy::aggregation::bucket::RangeAggregation;
|
||||
//! let sub_agg_req_1: Aggregations = vec
//! ...
//! method to
|
||||
//! merge multiple results. The merged result can then be converted into
|
||||
//! [agg_result::AggregationResults] via the [Into] trait.
|
||||
//! [agg_result::AggregationResults] via the
|
||||
//! [agg_result::AggregationResults::from_intermediate_and_req] method.
|
||||
|
||||
pub mod agg_req;
|
||||
mod agg_req_with_accessor;
|
||||
@@ -167,6 +178,7 @@ pub(crate) struct VecWithNames<T: Clone> {
|
||||
values: Vec<T>,
|
||||
keys: Vec<String>,
|
||||
}
|
||||
|
||||
impl<T: Clone> Default for VecWithNames<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
@@ -189,6 +201,14 @@ impl<T: Clone> From<HashMap<String, T>> for VecWithNames<T> {
|
||||
}
|
||||
|
||||
impl<T: Clone> VecWithNames<T> {
|
||||
fn from_other<K: Clone + Into<T>>(entries: VecWithNames<K>) -> Self {
|
||||
let values = entries.values.into_iter().map(Into::into).collect();
|
||||
Self {
|
||||
keys: entries.keys,
|
||||
values,
|
||||
}
|
||||
}
|
||||
|
||||
fn from_entries(mut entries: Vec<(String, T)>) -> Self {
|
||||
        // Sort to ensure the order of elements matches across multiple instances
|
||||
entries.sort_by(|left, right| left.0.cmp(&right.0));
|
||||
@@ -212,6 +232,9 @@ impl<T: Clone> VecWithNames<T> {
|
||||
fn keys(&self) -> impl Iterator<Item = &str> + '_ {
|
||||
self.keys.iter().map(|key| key.as_str())
|
||||
}
|
||||
fn into_values(self) -> impl Iterator<Item = T> {
|
||||
self.values.into_iter()
|
||||
}
|
||||
fn values(&self) -> impl Iterator<Item = &T> + '_ {
|
||||
self.values.iter()
|
||||
}
|
||||
@@ -224,6 +247,14 @@ impl<T: Clone> VecWithNames<T> {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.keys.is_empty()
|
||||
}
|
||||
fn len(&self) -> usize {
|
||||
self.keys.len()
|
||||
}
|
||||
fn get(&self, name: &str) -> Option<&T> {
|
||||
self.keys()
|
||||
.position(|key| key == name)
|
||||
.map(|pos| &self.values[pos])
|
||||
}
|
||||
}
|
||||
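// Std-only sketch of the invariant from_entries establishes: entries are sorted by key, so
// two VecWithNames built from the same request have identical ordering and can be merged
// positionally (see merge_fruits above).
fn sort_entries(mut entries: Vec<(String, u64)>) -> Vec<(String, u64)> {
    entries.sort_by(|left, right| left.0.cmp(&right.0));
    entries
}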
|
||||
/// The serialized key is used in a HashMap.
|
||||
@@ -284,21 +315,22 @@ pub(crate) fn f64_to_fastfield_u64(val: f64, field_type: &Type) -> Option<u64> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use futures::executor::block_on;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::agg_req::{Aggregation, Aggregations, BucketAggregation};
|
||||
use super::bucket::RangeAggregation;
|
||||
use super::collector::AggregationCollector;
|
||||
use super::metric::AverageAggregation;
|
||||
use crate::aggregation::agg_req::{BucketAggregationType, MetricAggregation};
|
||||
use crate::aggregation::agg_req::{
|
||||
get_term_dict_field_names, BucketAggregationType, MetricAggregation,
|
||||
};
|
||||
use crate::aggregation::agg_result::AggregationResults;
|
||||
use crate::aggregation::bucket::TermsAggregation;
|
||||
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use crate::aggregation::segment_agg_result::DOC_BLOCK_SIZE;
|
||||
use crate::aggregation::DistributedAggregationCollector;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
|
||||
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
|
||||
use crate::{Index, Term};
|
||||
|
||||
fn get_avg_req(field_name: &str) -> Aggregation {
|
||||
@@ -310,16 +342,87 @@ mod tests {
|
||||
pub fn get_test_index_with_num_docs(
|
||||
merge_segments: bool,
|
||||
num_docs: usize,
|
||||
) -> crate::Result<Index> {
|
||||
get_test_index_from_values(
|
||||
merge_segments,
|
||||
&(0..num_docs).map(|el| el as f64).collect::<Vec<f64>>(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn exec_request(agg_req: Aggregations, index: &Index) -> crate::Result<Value> {
|
||||
exec_request_with_query(agg_req, index, None)
|
||||
}
|
||||
pub fn exec_request_with_query(
|
||||
agg_req: Aggregations,
|
||||
index: &Index,
|
||||
query: Option<(&str, &str)>,
|
||||
) -> crate::Result<Value> {
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let agg_res = if let Some((field, term)) = query {
|
||||
let text_field = reader.searcher().schema().get_field(field).unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, term),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
searcher.search(&term_query, &collector)?
|
||||
} else {
|
||||
searcher.search(&AllQuery, &collector)?
|
||||
};
|
||||
|
||||
        // Test serialization/deserialization roundtrip
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
Ok(res)
|
||||
}
|
||||
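    // Hedged usage sketch of the test helpers above; the function name, request contents and
    // queried term are assumptions for illustration only.
    fn example_exec_request() -> crate::Result<()> {
        let index = get_test_index_from_terms(false, &[vec!["terma", "terma", "termb"]])?;
        let agg_req: Aggregations = vec![("avg".to_string(), get_avg_req("score"))]
            .into_iter()
            .collect();
        let res = exec_request_with_query(agg_req, &index, Some(("string_id", "terma")))?;
        assert!(res.get("avg").is_some());
        Ok(())
    }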
|
||||
pub fn get_test_index_from_values(
|
||||
merge_segments: bool,
|
||||
values: &[f64],
|
||||
) -> crate::Result<Index> {
|
||||
// Every value gets its own segment
|
||||
let mut segment_and_values = vec![];
|
||||
for value in values {
|
||||
segment_and_values.push(vec![(*value, value.to_string())]);
|
||||
}
|
||||
get_test_index_from_values_and_terms(merge_segments, &segment_and_values)
|
||||
}
|
||||
|
||||
pub fn get_test_index_from_terms(
|
||||
merge_segments: bool,
|
||||
values: &[Vec<&str>],
|
||||
) -> crate::Result<Index> {
|
||||
        // Every inner Vec of terms gets its own segment
|
||||
let segment_and_values = values
|
||||
.iter()
|
||||
.map(|terms| {
|
||||
terms
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, term)| (i as f64, term.to_string()))
|
||||
.collect()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
get_test_index_from_values_and_terms(merge_segments, &segment_and_values)
|
||||
}
|
||||
|
||||
pub fn get_test_index_from_values_and_terms(
|
||||
merge_segments: bool,
|
||||
segment_and_values: &[Vec<(f64, String)>],
|
||||
) -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_fieldtype = crate::schema::TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_tokenizer("default")
|
||||
.set_index_option(IndexRecordOption::WithFreqs),
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_fast()
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
|
||||
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
|
||||
let string_field_id = schema_builder.add_text_field("string_id", STRING | FAST);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
@@ -332,25 +435,29 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
for i in 0..num_docs {
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
score_field => i as u64,
|
||||
score_field_f64 => i as f64,
|
||||
score_field_i64 => i as i64,
|
||||
fraction_field => i as f64/100.0,
|
||||
))?;
|
||||
for values in segment_and_values {
|
||||
for (i, term) in values {
|
||||
let i = *i;
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
text_field_id => term.to_string(),
|
||||
string_field_id => term.to_string(),
|
||||
score_field => i as u64,
|
||||
score_field_f64 => i as f64,
|
||||
score_field_i64 => i as i64,
|
||||
fraction_field => i as f64/100.0,
|
||||
))?;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
index_writer.commit()?;
|
||||
}
|
||||
if merge_segments {
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
|
||||
@@ -362,22 +469,20 @@ mod tests {
|
||||
merge_segments: bool,
|
||||
use_distributed_collector: bool,
|
||||
) -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(merge_segments, 300)?;
|
||||
let mut values_and_terms = (0..80)
|
||||
.map(|val| vec![(val as f64, "terma".to_string())])
|
||||
.collect::<Vec<_>>();
|
||||
values_and_terms.last_mut().unwrap()[0].1 = "termb".to_string();
|
||||
let index = get_test_index_from_values_and_terms(merge_segments, &values_and_terms)?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
assert_eq!(DOC_BLOCK_SIZE, 256);
|
||||
assert_eq!(DOC_BLOCK_SIZE, 64);
|
||||
        // In the tree we cache up to DOC_BLOCK_SIZE documents before passing them down as one block.
|
||||
//
|
||||
// Build a request so that on the first level we have one full cache, which is then flushed.
|
||||
// The same cache should have some residue docs at the end, which are flushed (Range 0-266)
|
||||
// -> 266 docs
|
||||
// The same cache should have some residue docs at the end, which are flushed (Range 0-70)
|
||||
// -> 70 docs
|
||||
//
|
||||
// The second level should also have some residue docs in the cache that are flushed at the
|
||||
// end.
|
||||
@@ -385,38 +490,70 @@ mod tests {
|
||||
// A second bucket on the first level should have the cache unfilled
|
||||
|
||||
// let elasticsearch_compatible_json_req = r#"
|
||||
let elasticsearch_compatible_json_req = r#"
|
||||
let elasticsearch_compatible_json = json!(
|
||||
{
|
||||
"bucketsL1": {
|
||||
"range": {
|
||||
"field": "score",
|
||||
"ranges": [ { "to": 3.0 }, { "from": 3.0, "to": 266.0 }, { "from": 266.0 } ]
|
||||
"ranges": [ { "to": 3.0f64 }, { "from": 3.0f64, "to": 70.0f64 }, { "from": 70.0f64 } ]
|
||||
},
|
||||
"aggs": {
|
||||
"bucketsL2": {
|
||||
"range": {
|
||||
"field": "score",
|
||||
"ranges": [ { "to": 100.0 }, { "from": 100.0, "to": 266.0 }, { "from": 266.0 } ]
|
||||
"ranges": [ { "to": 30.0f64 }, { "from": 30.0f64, "to": 70.0f64 }, { "from": 70.0f64 } ]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"histogram_test":{
|
||||
"histogram": {
|
||||
"field": "score",
|
||||
"interval": 70.0,
|
||||
"offset": 3.0,
|
||||
},
|
||||
"aggs": {
|
||||
"bucketsL2": {
|
||||
"histogram": {
|
||||
"field": "score",
|
||||
"interval": 70.0
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"term_agg_test":{
|
||||
"terms": {
|
||||
"field": "string_id"
|
||||
},
|
||||
"aggs": {
|
||||
"bucketsL2": {
|
||||
"histogram": {
|
||||
"field": "score",
|
||||
"interval": 70.0
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"#;
|
||||
});
|
||||
|
||||
let agg_req: Aggregations =
|
||||
serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
|
||||
serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
|
||||
.unwrap();
|
||||
|
||||
let agg_res: AggregationResults = if use_distributed_collector {
|
||||
let collector = DistributedAggregationCollector::from_aggs(agg_req);
|
||||
let collector = DistributedAggregationCollector::from_aggs(agg_req.clone());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&term_query, &collector).unwrap().into()
|
||||
AggregationResults::from_intermediate_and_req(
|
||||
searcher.search(&AllQuery, &collector).unwrap(),
|
||||
agg_req,
|
||||
)
|
||||
.unwrap()
|
||||
} else {
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
};
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
@@ -426,15 +563,15 @@ mod tests {
|
||||
res["bucketsL1"]["buckets"][0]["bucketsL2"]["buckets"][0]["doc_count"],
|
||||
3
|
||||
);
|
||||
assert_eq!(res["bucketsL1"]["buckets"][1]["key"], "3-266");
|
||||
assert_eq!(res["bucketsL1"]["buckets"][1]["doc_count"], 266 - 3);
|
||||
assert_eq!(res["bucketsL1"]["buckets"][1]["key"], "3-70");
|
||||
assert_eq!(res["bucketsL1"]["buckets"][1]["doc_count"], 70 - 3);
|
||||
assert_eq!(
|
||||
res["bucketsL1"]["buckets"][1]["bucketsL2"]["buckets"][0]["doc_count"],
|
||||
97
|
||||
27
|
||||
);
|
||||
assert_eq!(
|
||||
res["bucketsL1"]["buckets"][1]["bucketsL2"]["buckets"][1]["doc_count"],
|
||||
166
|
||||
40
|
||||
);
|
||||
assert_eq!(
|
||||
res["bucketsL1"]["buckets"][1]["bucketsL2"]["buckets"][2]["doc_count"],
|
||||
@@ -442,9 +579,49 @@ mod tests {
|
||||
);
|
||||
assert_eq!(
|
||||
res["bucketsL1"]["buckets"][2]["bucketsL2"]["buckets"][2]["doc_count"],
|
||||
300 - 266
|
||||
80 - 70
|
||||
);
|
||||
assert_eq!(res["bucketsL1"]["buckets"][2]["doc_count"], 80 - 70);
|
||||
|
||||
assert_eq!(
|
||||
res["term_agg_test"],
|
||||
json!(
|
||||
{
|
||||
"buckets": [
|
||||
{
|
||||
"bucketsL2": {
|
||||
"buckets": [
|
||||
{
|
||||
"doc_count": 70,
|
||||
"key": 0.0
|
||||
},
|
||||
{
|
||||
"doc_count": 9,
|
||||
"key": 70.0
|
||||
}
|
||||
]
|
||||
},
|
||||
"doc_count": 79,
|
||||
"key": "terma"
|
||||
},
|
||||
{
|
||||
"bucketsL2": {
|
||||
"buckets": [
|
||||
{
|
||||
"doc_count": 1,
|
||||
"key": 70.0
|
||||
}
|
||||
]
|
||||
},
|
||||
"doc_count": 1,
|
||||
"key": "termb"
|
||||
}
|
||||
],
|
||||
"doc_count_error_upper_bound": 0,
|
||||
"sum_other_doc_count": 0
|
||||
}
|
||||
)
|
||||
);
|
||||
assert_eq!(res["bucketsL1"]["buckets"][2]["doc_count"], 300 - 266);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -461,12 +638,12 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_fieldtype = crate::schema::TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_tokenizer("default")
|
||||
.set_index_option(IndexRecordOption::WithFreqs),
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_fast()
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
schema_builder.add_text_field("dummy_text", STRING);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
@@ -553,7 +730,7 @@ mod tests {
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
|
||||
@@ -671,10 +848,21 @@ mod tests {
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let sub_agg_req: Aggregations =
|
||||
vec![("average_in_range".to_string(), get_avg_req("score"))]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let sub_agg_req: Aggregations = vec![
|
||||
("average_in_range".to_string(), get_avg_req("score")),
|
||||
(
|
||||
"term_agg".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "text".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let agg_req: Aggregations = if use_elastic_json_req {
|
||||
let elasticsearch_compatible_json_req = r#"
|
||||
{
|
||||
@@ -690,7 +878,8 @@ mod tests {
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
"average_in_range": { "avg": { "field": "score" } },
|
||||
"term_agg": { "terms": { "field": "text" } }
|
||||
}
|
||||
},
|
||||
"rangei64": {
|
||||
@@ -705,7 +894,8 @@ mod tests {
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
"average_in_range": { "avg": { "field": "score" } },
|
||||
"term_agg": { "terms": { "field": "text" } }
|
||||
}
|
||||
},
|
||||
"average": {
|
||||
@@ -723,7 +913,8 @@ mod tests {
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
"average_in_range": { "avg": { "field": "score" } },
|
||||
"term_agg": { "terms": { "field": "text" } }
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -782,6 +973,9 @@ mod tests {
|
||||
agg_req
|
||||
};
|
||||
|
||||
let field_names = get_term_dict_field_names(&agg_req);
|
||||
assert_eq!(field_names, vec!["text".to_string()].into_iter().collect());
|
||||
|
||||
let agg_res: AggregationResults = if use_distributed_collector {
|
||||
let collector = DistributedAggregationCollector::from_aggs(agg_req.clone());
|
||||
|
||||
@@ -790,7 +984,7 @@ mod tests {
|
||||
// Test de/serialization roundtrip on intermediate_agg_result
|
||||
let res: IntermediateAggregationResults =
|
||||
serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
|
||||
res.into()
|
||||
AggregationResults::from_intermediate_and_req(res, agg_req.clone()).unwrap()
|
||||
} else {
|
||||
let collector = AggregationCollector::from_aggs(agg_req.clone());
|
||||
|
||||
@@ -922,10 +1116,10 @@ mod tests {
|
||||
searcher.search(&AllQuery, &collector).unwrap_err()
|
||||
};
|
||||
|
||||
let agg_res = avg_on_field("text");
|
||||
let agg_res = avg_on_field("dummy_text");
|
||||
assert_eq!(
|
||||
format!("{:?}", agg_res),
|
||||
r#"InvalidArgument("Only single value fast fields of type f64, u64, i64 are supported, but got Str ")"#
|
||||
r#"InvalidArgument("Only fast fields of type f64, u64, i64 are supported, but got Str ")"#
|
||||
);
|
||||
|
||||
let agg_res = avg_on_field("not_exist_field");
|
||||
@@ -937,7 +1131,7 @@ mod tests {
|
||||
let agg_res = avg_on_field("scores_i64");
|
||||
assert_eq!(
|
||||
format!("{:?}", agg_res),
|
||||
r#"InvalidArgument("Invalid field type in aggregation I64, only Cardinality::SingleValue supported")"#
|
||||
r#"InvalidArgument("Invalid field cardinality on field scores_i64 expected SingleValue, but got MultiValues")"#
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -946,10 +1140,12 @@ mod tests {
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::{thread_rng, Rng};
|
||||
use test::{self, Bencher};
|
||||
|
||||
use super::*;
|
||||
use crate::aggregation::bucket::{HistogramAggregation, HistogramBounds, TermsAggregation};
|
||||
use crate::aggregation::metric::StatsAggregation;
|
||||
use crate::query::AllQuery;
|
||||
|
||||
@@ -957,12 +1153,14 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_fieldtype = crate::schema::TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_tokenizer("default")
|
||||
.set_index_option(IndexRecordOption::WithFreqs),
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let text_field_many_terms =
|
||||
schema_builder.add_text_field("text_many_terms", STRING | FAST);
|
||||
let text_field_few_terms =
|
||||
schema_builder.add_text_field("text_few_terms", STRING | FAST);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
@@ -970,6 +1168,10 @@ mod tests {
|
||||
schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
||||
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
|
||||
let index = Index::create_from_tempdir(schema_builder.build())?;
|
||||
let few_terms_data = vec!["INFO", "ERROR", "WARN", "DEBUG"];
|
||||
let many_terms_data = (0..15_000)
|
||||
.map(|num| format!("author{}", num))
|
||||
.collect::<Vec<_>>();
|
||||
{
|
||||
let mut rng = thread_rng();
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
@@ -978,6 +1180,8 @@ mod tests {
|
||||
let val: f64 = rng.gen_range(0.0..1_000_000.0);
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||
score_field => val as u64,
|
||||
score_field_f64 => val as f64,
|
||||
score_field_i64 => val as i64,
|
||||
@@ -990,7 +1194,7 @@ mod tests {
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
|
||||
@@ -1129,6 +1333,64 @@ mod tests {
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_aggregation_terms_few(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = vec![(
|
||||
"my_texts".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "text_few_terms".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
searcher.search(&AllQuery, &collector).unwrap().into();
|
||||
|
||||
agg_res
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_aggregation_terms_many(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = vec![(
|
||||
"my_texts".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "text_many_terms".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
searcher.search(&AllQuery, &collector).unwrap().into();
|
||||
|
||||
agg_res
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_aggregation_range_only(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
@@ -1165,6 +1427,110 @@ mod tests {
|
||||
});
|
||||
}
|
||||
|
||||
    // hard_bounds uses a different algorithm, because it actually limits the collection range
|
||||
#[bench]
|
||||
fn bench_aggregation_histogram_only_hard_bounds(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req_1: Aggregations = vec![(
|
||||
"rangef64".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: "score_f64".to_string(),
|
||||
interval: 100f64,
|
||||
hard_bounds: Some(HistogramBounds {
|
||||
min: 1000.0,
|
||||
max: 300_000.0,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
searcher.search(&AllQuery, &collector).unwrap().into();
|
||||
|
||||
agg_res
|
||||
});
|
||||
}
|
||||
|
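The `hard_bounds` variant above matters because bounded collection skips out-of-range values instead of bucketing them. An illustrative sketch of that bucketing rule (not the collector's actual code; the rule shown is the usual `floor(value / interval) * interval` histogram key):

```rust
/// Histogram bucketing rule, illustrative only: a value lands in the bucket
/// keyed by `floor(value / interval) * interval`; with hard bounds, values
/// outside `[min, max]` are skipped entirely instead of clamped.
fn bucket_key(value: f64, interval: f64, hard_bounds: Option<(f64, f64)>) -> Option<f64> {
    if let Some((min, max)) = hard_bounds {
        if value < min || value > max {
            return None;
        }
    }
    Some((value / interval).floor() * interval)
}
```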
||||
#[bench]
|
||||
fn bench_aggregation_histogram_with_avg(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let sub_agg_req: Aggregations = vec![(
|
||||
"average_f64".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Average(
|
||||
AverageAggregation::from_field_name("score_f64".to_string()),
|
||||
)),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let agg_req_1: Aggregations = vec![(
|
||||
"rangef64".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: "score_f64".to_string(),
|
||||
interval: 100f64, // 1000 buckets
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: sub_agg_req,
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
searcher.search(&AllQuery, &collector).unwrap().into();
|
||||
|
||||
agg_res
|
||||
});
|
||||
}
|
||||
|
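To see the effect of the nested `average_f64` sub-aggregation, the final results have to be inspected per bucket. A hedged sketch that round-trips the results through JSON; the module path and the exact JSON shape (`buckets`, `value`) are assumptions:

```rust
use tantivy::aggregation::agg_result::AggregationResults;

/// Pull the average out of the first histogram bucket via the serialized form.
fn first_bucket_average(agg_res: &AggregationResults) -> Option<f64> {
    let json = serde_json::to_value(agg_res).ok()?;
    json["rangef64"]["buckets"][0]["average_f64"]["value"].as_f64()
}
```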
||||
#[bench]
|
||||
fn bench_aggregation_histogram_only(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req_1: Aggregations = vec![(
|
||||
"rangef64".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: "score_f64".to_string(),
|
||||
interval: 100f64, // 1000 buckets
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
searcher.search(&AllQuery, &collector).unwrap().into();
|
||||
|
||||
agg_res
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_aggregation_sub_tree(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
|
||||
@@ -9,25 +9,37 @@ use super::agg_req::MetricAggregation;
|
||||
use super::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor, MetricAggregationWithAccessor,
|
||||
};
|
||||
use super::bucket::SegmentRangeCollector;
|
||||
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
|
||||
use super::intermediate_agg_result::{IntermediateAggregationResults, IntermediateBucketResult};
|
||||
use super::metric::{
|
||||
AverageAggregation, SegmentAverageCollector, SegmentStatsCollector, StatsAggregation,
|
||||
};
|
||||
use super::{Key, VecWithNames};
|
||||
use super::VecWithNames;
|
||||
use crate::aggregation::agg_req::BucketAggregationType;
|
||||
use crate::DocId;
|
||||
|
||||
pub(crate) const DOC_BLOCK_SIZE: usize = 256;
|
||||
pub(crate) const DOC_BLOCK_SIZE: usize = 64;
|
||||
pub(crate) type DocBlock = [DocId; DOC_BLOCK_SIZE];
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentAggregationResultsCollector {
|
||||
pub(crate) metrics: VecWithNames<SegmentMetricResultCollector>,
|
||||
pub(crate) buckets: VecWithNames<SegmentBucketResultCollector>,
|
||||
pub(crate) metrics: Option<VecWithNames<SegmentMetricResultCollector>>,
|
||||
pub(crate) buckets: Option<VecWithNames<SegmentBucketResultCollector>>,
|
||||
staged_docs: DocBlock,
|
||||
num_staged_docs: usize,
|
||||
}
|
||||
|
||||
impl Default for SegmentAggregationResultsCollector {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
metrics: Default::default(),
|
||||
buckets: Default::default(),
|
||||
staged_docs: [0; DOC_BLOCK_SIZE],
|
||||
num_staged_docs: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for SegmentAggregationResultsCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentAggregationResultsCollector")
|
||||
@@ -40,6 +52,25 @@ impl Debug for SegmentAggregationResultsCollector {
|
||||
}
|
||||
|
||||
impl SegmentAggregationResultsCollector {
|
||||
pub fn into_intermediate_aggregations_result(
|
||||
self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<IntermediateAggregationResults> {
|
||||
let buckets = if let Some(buckets) = self.buckets {
|
||||
let entries = buckets
|
||||
.into_iter()
|
||||
.zip(agg_with_accessor.buckets.values())
|
||||
.map(|((key, bucket), acc)| Ok((key, bucket.into_intermediate_bucket_result(acc)?)))
|
||||
.collect::<crate::Result<Vec<(String, _)>>>()?;
|
||||
Some(VecWithNames::from_entries(entries))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let metrics = self.metrics.map(VecWithNames::from_other);
|
||||
|
||||
Ok(IntermediateAggregationResults { metrics, buckets })
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(req: &AggregationsWithAccessor) -> crate::Result<Self> {
|
||||
let buckets = req
|
||||
.buckets
|
||||
@@ -50,7 +81,7 @@ impl SegmentAggregationResultsCollector {
|
||||
SegmentBucketResultCollector::from_req_and_validate(req)?,
|
||||
))
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
.collect::<crate::Result<Vec<(String, _)>>>()?;
|
||||
let metrics = req
|
||||
.metrics
|
||||
.entries()
|
||||
@@ -60,10 +91,20 @@ impl SegmentAggregationResultsCollector {
|
||||
SegmentMetricResultCollector::from_req_and_validate(req)?,
|
||||
))
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
.collect::<crate::Result<Vec<(String, _)>>>()?;
|
||||
let metrics = if metrics.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(VecWithNames::from_entries(metrics))
|
||||
};
|
||||
let buckets = if buckets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(VecWithNames::from_entries(buckets))
|
||||
};
|
||||
Ok(SegmentAggregationResultsCollector {
|
||||
metrics: VecWithNames::from_entries(metrics),
|
||||
buckets: VecWithNames::from_entries(buckets),
|
||||
metrics,
|
||||
buckets,
|
||||
staged_docs: [0; DOC_BLOCK_SIZE],
|
||||
num_staged_docs: 0,
|
||||
})
|
||||
@@ -82,29 +123,33 @@ impl SegmentAggregationResultsCollector {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
pub(crate) fn flush_staged_docs(
|
||||
&mut self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
force_flush: bool,
|
||||
) {
|
||||
for (agg_with_accessor, collector) in agg_with_accessor
|
||||
.metrics
|
||||
.values()
|
||||
.zip(self.metrics.values_mut())
|
||||
{
|
||||
collector.collect_block(&self.staged_docs[..self.num_staged_docs], agg_with_accessor);
|
||||
if self.num_staged_docs == 0 {
|
||||
return;
|
||||
}
|
||||
for (agg_with_accessor, collector) in agg_with_accessor
|
||||
.buckets
|
||||
.values()
|
||||
.zip(self.buckets.values_mut())
|
||||
{
|
||||
collector.collect_block(
|
||||
&self.staged_docs[..self.num_staged_docs],
|
||||
agg_with_accessor,
|
||||
force_flush,
|
||||
);
|
||||
if let Some(metrics) = &mut self.metrics {
|
||||
for (collector, agg_with_accessor) in
|
||||
metrics.values_mut().zip(agg_with_accessor.metrics.values())
|
||||
{
|
||||
collector
|
||||
.collect_block(&self.staged_docs[..self.num_staged_docs], agg_with_accessor);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(buckets) = &mut self.buckets {
|
||||
for (collector, agg_with_accessor) in
|
||||
buckets.values_mut().zip(agg_with_accessor.buckets.values())
|
||||
{
|
||||
collector.collect_block(
|
||||
&self.staged_docs[..self.num_staged_docs],
|
||||
agg_with_accessor,
|
||||
force_flush,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
self.num_staged_docs = 0;
|
||||
@@ -151,11 +196,40 @@ impl SegmentMetricResultCollector {
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) enum SegmentBucketResultCollector {
|
||||
Range(SegmentRangeCollector),
|
||||
Histogram(Box<SegmentHistogramCollector>),
|
||||
Terms(Box<SegmentTermCollector>),
|
||||
}
|
||||
|
||||
impl SegmentBucketResultCollector {
|
||||
pub fn into_intermediate_bucket_result(
|
||||
self,
|
||||
agg_with_accessor: &BucketAggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
match self {
|
||||
SegmentBucketResultCollector::Terms(terms) => {
|
||||
terms.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
SegmentBucketResultCollector::Range(range) => {
|
||||
range.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
SegmentBucketResultCollector::Histogram(histogram) => {
|
||||
histogram.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_req_and_validate(req: &BucketAggregationWithAccessor) -> crate::Result<Self> {
|
||||
match &req.bucket_agg {
|
||||
BucketAggregationType::Terms(terms_req) => Ok(Self::Terms(Box::new(
|
||||
SegmentTermCollector::from_req_and_validate(
|
||||
terms_req,
|
||||
&req.sub_aggregation,
|
||||
req.field_type,
|
||||
req.accessor
|
||||
.as_multi()
|
||||
.expect("unexpected fast field cardinality"),
|
||||
)?,
|
||||
))),
|
||||
BucketAggregationType::Range(range_req) => {
|
||||
Ok(Self::Range(SegmentRangeCollector::from_req_and_validate(
|
||||
range_req,
|
||||
@@ -163,6 +237,16 @@ impl SegmentBucketResultCollector {
|
||||
req.field_type,
|
||||
)?))
|
||||
}
|
||||
BucketAggregationType::Histogram(histogram) => Ok(Self::Histogram(Box::new(
|
||||
SegmentHistogramCollector::from_req_and_validate(
|
||||
histogram,
|
||||
&req.sub_aggregation,
|
||||
req.field_type,
|
||||
req.accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinality"),
|
||||
)?,
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -177,28 +261,12 @@ impl SegmentBucketResultCollector {
|
||||
SegmentBucketResultCollector::Range(range) => {
|
||||
range.collect_block(doc, bucket_with_accessor, force_flush);
|
||||
}
|
||||
SegmentBucketResultCollector::Histogram(histogram) => {
|
||||
histogram.collect_block(doc, bucket_with_accessor, force_flush)
|
||||
}
|
||||
SegmentBucketResultCollector::Terms(terms) => {
|
||||
terms.collect_block(doc, bucket_with_accessor, force_flush)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentRangeBucketEntry {
|
||||
pub key: Key,
|
||||
pub doc_count: u64,
|
||||
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None.
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl Debug for SegmentRangeBucketEntry {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentRangeBucketEntry")
|
||||
.field("key", &self.key)
|
||||
.field("doc_count", &self.doc_count)
|
||||
.field("from", &self.from)
|
||||
.field("to", &self.to)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,9 +152,9 @@ mod tests {
|
||||
use query::AllQuery;
|
||||
|
||||
use super::{add_vecs, HistogramCollector, HistogramComputer};
|
||||
use crate::chrono::{TimeZone, Utc};
|
||||
use crate::schema::{Schema, FAST};
|
||||
use crate::{doc, query, Index};
|
||||
use crate::time::{Date, Month};
|
||||
use crate::{doc, query, DateTime, Index};
|
||||
|
||||
#[test]
|
||||
fn test_add_histograms_simple() {
|
||||
@@ -273,16 +273,20 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
|
||||
writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)))?;
|
||||
writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)))?;
|
||||
writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)))?;
|
||||
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.add_document(
|
||||
doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1986, Month::March, 9)?.with_hms(0, 0, 0)?)),
|
||||
)?;
|
||||
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let all_query = AllQuery;
|
||||
let week_histogram_collector = HistogramCollector::new(
|
||||
date_field,
|
||||
Utc.ymd(1980, 1, 1).and_hms(0, 0, 0),
|
||||
DateTime::from_primitive(
|
||||
Date::from_calendar_date(1980, Month::January, 1)?.with_hms(0, 0, 0)?,
|
||||
),
|
||||
3600 * 24 * 365, // it is just for a unit test... sorry leap years.
|
||||
10,
|
||||
);
|
||||
|
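The test above replaces chrono's `Utc.ymd(..).and_hms(..)` with constructions through the re-exported `time` crate. A compact sketch of the two constructors it uses (the helper name and the RFC 3339 string are illustrative):

```rust
use tantivy::time::format_description::well_known::Rfc3339;
use tantivy::time::{Date, Month, OffsetDateTime};
use tantivy::DateTime;

/// Two equivalent ways to build the 1982-09-17 value indexed above.
fn example_dates() -> Result<(DateTime, DateTime), Box<dyn std::error::Error>> {
    // From calendar components, no time zone involved.
    let from_components = DateTime::from_primitive(
        Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?,
    );
    // From an RFC 3339 string (the +00:00 offset here is already UTC).
    let from_rfc3339 =
        DateTime::from_utc(OffsetDateTime::parse("1982-09-17T00:00:00+00:00", &Rfc3339)?);
    Ok((from_components, from_rfc3339))
}
```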
||||
@@ -1,11 +1,11 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::*;
|
||||
use crate::collector::{Count, FilterCollector, TopDocs};
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::query::{AllQuery, QueryParser};
|
||||
use crate::schema::{Field, Schema, FAST, TEXT};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{doc, DateTime, DocAddress, DocId, Document, Index, Score, Searcher, SegmentOrdinal};
|
||||
|
||||
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
|
||||
@@ -26,11 +26,11 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()))?;
|
||||
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()))?;
|
||||
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_utc(OffsetDateTime::parse("1898-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2020-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-20T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_utc(OffsetDateTime::parse("2018-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
@@ -55,7 +55,9 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
||||
assert_eq!(filtered_top_docs.len(), 0);
|
||||
|
||||
fn date_filter(value: DateTime) -> bool {
|
||||
(value - DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()).num_weeks() > 0
|
||||
(value.into_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
|
||||
.whole_weeks()
|
||||
> 0
|
||||
}
|
||||
|
||||
let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
|
||||
|
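The rewritten `date_filter` also shows the replacement for chrono's `num_weeks()`: convert the tantivy `DateTime` back to an `OffsetDateTime` with `into_utc()`, subtract, and query the resulting `time::Duration`. Condensed into a standalone helper (name and reference date are illustrative):

```rust
use tantivy::time::format_description::well_known::Rfc3339;
use tantivy::time::OffsetDateTime;
use tantivy::DateTime;

/// True if `value` lies more than a whole week after the reference date.
fn more_than_a_week_after(value: DateTime, reference_rfc3339: &str) -> bool {
    let reference = OffsetDateTime::parse(reference_rfc3339, &Rfc3339).expect("valid RFC 3339");
    (value.into_utc() - reference).whole_weeks() > 0
}
```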
||||
@@ -714,7 +714,9 @@ mod tests {
|
||||
use crate::collector::Collector;
|
||||
use crate::query::{AllQuery, Query, QueryParser};
|
||||
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
||||
use crate::{DocAddress, DocId, Index, IndexWriter, Score, SegmentReader};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Score, SegmentReader};
|
||||
|
||||
fn make_index() -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -890,28 +892,32 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_top_field_collector_datetime() -> crate::Result<()> {
|
||||
use std::str::FromStr;
|
||||
let mut schema_builder = Schema::builder();
|
||||
let name = schema_builder.add_text_field("name", TEXT);
|
||||
let birthday = schema_builder.add_date_field("birthday", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let pr_birthday = crate::DateTime::from_str("1898-04-09T00:00:00+00:00")?;
|
||||
let pr_birthday = DateTime::from_utc(OffsetDateTime::parse(
|
||||
"1898-04-09T00:00:00+00:00",
|
||||
&Rfc3339,
|
||||
)?);
|
||||
index_writer.add_document(doc!(
|
||||
name => "Paul Robeson",
|
||||
birthday => pr_birthday
|
||||
birthday => pr_birthday,
|
||||
))?;
|
||||
let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?;
|
||||
let mr_birthday = DateTime::from_utc(OffsetDateTime::parse(
|
||||
"1947-11-08T00:00:00+00:00",
|
||||
&Rfc3339,
|
||||
)?);
|
||||
index_writer.add_document(doc!(
|
||||
name => "Minnie Riperton",
|
||||
birthday => mr_birthday
|
||||
birthday => mr_birthday,
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
|
||||
let top_docs: Vec<(crate::DateTime, DocAddress)> =
|
||||
searcher.search(&AllQuery, &top_collector)?;
|
||||
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
&[
|
||||
|
||||
@@ -781,24 +781,24 @@ mod tests {
|
||||
for i in 0u64..8_000u64 {
|
||||
writer.add_document(doc!(field => i))?;
|
||||
}
|
||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||
let _handle = directory.watch(WatchCallback::new(move || {
|
||||
let _ = sender.send(());
|
||||
}));
|
||||
|
||||
writer.commit()?;
|
||||
let mem_right_after_commit = directory.total_mem_usage();
|
||||
assert!(receiver.recv().is_ok());
|
||||
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()?;
|
||||
|
||||
assert_eq!(reader.searcher().num_docs(), 8_000);
|
||||
assert_eq!(reader.searcher().segment_readers().len(), 8);
|
||||
|
||||
writer.wait_merging_threads()?;
|
||||
|
||||
let mem_right_after_merge_finished = directory.total_mem_usage();
|
||||
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.num_docs(), 8_000);
|
||||
assert!(
|
||||
mem_right_after_merge_finished < mem_right_after_commit,
|
||||
|
||||
@@ -239,7 +239,7 @@ impl InnerSegmentMeta {
|
||||
///
|
||||
/// Contains settings which are applied on the whole
|
||||
/// index, like presort documents.
|
||||
#[derive(Clone, Default, Serialize, Deserialize, Eq, PartialEq)]
|
||||
#[derive(Clone, Debug, Default, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub struct IndexSettings {
|
||||
/// Sorts the documents by information
|
||||
/// provided in `IndexSortByField`
|
||||
@@ -254,7 +254,7 @@ pub struct IndexSettings {
|
||||
/// Presorting documents can greatly improve performance
|
||||
/// in some scenarios, by applying top n
|
||||
/// optimizations.
|
||||
#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub struct IndexSortByField {
|
||||
/// The field to sort the documents by
|
||||
pub field: String,
|
||||
@@ -262,7 +262,7 @@ pub struct IndexSortByField {
|
||||
pub order: Order,
|
||||
}
|
||||
/// The order to sort by
|
||||
#[derive(Clone, Serialize, Deserialize, Eq, PartialEq)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub enum Order {
|
||||
/// Ascending Order
|
||||
Asc,
|
||||
@@ -298,12 +298,12 @@ pub struct IndexMeta {
|
||||
pub schema: Schema,
|
||||
/// Opstamp associated to the last `commit` operation.
|
||||
pub opstamp: Opstamp,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// Payload associated to the last commit.
|
||||
///
|
||||
/// Upon commit, clients can optionally add a small `String` payload to their commit
|
||||
/// to help identify this commit.
|
||||
/// This payload is entirely unused by tantivy.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub payload: Option<String>,
|
||||
}
|
||||
|
||||
@@ -374,6 +374,7 @@ impl fmt::Debug for IndexMeta {
|
||||
mod tests {
|
||||
|
||||
use super::IndexMeta;
|
||||
use crate::core::index_meta::UntrackedIndexMeta;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::{IndexSettings, IndexSortByField, Order};
|
||||
|
||||
@@ -400,7 +401,12 @@ mod tests {
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||
);
|
||||
|
||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(index_metas.index_settings, deser_meta.index_settings);
|
||||
assert_eq!(index_metas.schema, deser_meta.schema);
|
||||
assert_eq!(index_metas.opstamp, deser_meta.opstamp);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ const ZERO_ARRAY: [u8; 8] = [0u8; 8];
|
||||
#[cfg(test)]
|
||||
fn create_uuid() -> Uuid {
|
||||
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
|
||||
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY).unwrap()
|
||||
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY)
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
@@ -57,7 +57,7 @@ impl SegmentId {
|
||||
/// Picking the first 8 chars is ok to identify
|
||||
/// segments in a display message (e.g. a5c4dfcb).
|
||||
pub fn short_uuid_string(&self) -> String {
|
||||
(&self.0.to_simple_ref().to_string()[..8]).to_string()
|
||||
(&self.0.as_simple().to_string()[..8]).to_string()
|
||||
}
|
||||
|
||||
/// Returns a segment uuid string.
|
||||
@@ -65,7 +65,7 @@ impl SegmentId {
|
||||
/// It consists of 32 lowercase hexadecimal chars
|
||||
/// (e.g. a5c4dfcbdfe645089129e308e26d5523)
|
||||
pub fn uuid_string(&self) -> String {
|
||||
self.0.to_simple_ref().to_string()
|
||||
self.0.as_simple().to_string()
|
||||
}
|
||||
|
||||
/// Build a `SegmentId` string from the full uuid string.
|
||||
|
||||
@@ -169,7 +169,7 @@ impl SegmentReader {
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_field_readers =
|
||||
let fast_fields_readers =
|
||||
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
|
||||
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
@@ -196,7 +196,7 @@ impl SegmentReader {
|
||||
max_doc,
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_readers: fast_field_readers,
|
||||
fast_fields_readers,
|
||||
fieldnorm_readers,
|
||||
segment_id: segment.id(),
|
||||
delete_opstamp: segment.meta().delete_opstamp(),
|
||||
|
||||
@@ -53,7 +53,9 @@ impl FileWatcher {
|
||||
if metafile_has_changed {
|
||||
info!("Meta file {:?} was modified", path);
|
||||
current_checksum_opt = Some(checksum);
|
||||
futures::executor::block_on(callbacks.broadcast());
|
||||
// We actually ignore callbacks failing here.
|
||||
// We just wait for the end of their execution.
|
||||
let _ = callbacks.broadcast().wait();
|
||||
}
|
||||
}
|
||||
|
||||
|
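`broadcast()` now returns a `FutureResult<()>` instead of an `impl Future`, so callers choose between dropping the handle and blocking with `.wait()`, without an async executor. A hedged sketch of both styles, assuming `WatchCallbackList`/`WatchCallback` are reachable under `tantivy::directory` as in the test imports below and that the list implements `Default`:

```rust
use tantivy::directory::{WatchCallback, WatchCallbackList};

/// Register a callback, then trigger it in the two supported ways.
fn notify_watchers() {
    let callbacks = WatchCallbackList::default();
    let _handle = callbacks.subscribe(WatchCallback::new(|| println!("meta.json changed")));

    // Fire and forget: drop the handle without waiting.
    let _ = callbacks.broadcast();

    // Or block until every callback has run, ignoring failures
    // the same way `FileWatcher` does above.
    let _ = callbacks.broadcast().wait();
}
```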
||||
@@ -6,9 +6,6 @@ use std::sync::atomic::{AtomicBool, AtomicUsize};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::channel::oneshot;
|
||||
use futures::executor::block_on;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
@@ -249,8 +246,8 @@ fn test_lock_blocking(directory: &dyn Directory) {
|
||||
std::thread::spawn(move || {
|
||||
//< lock_a_res is sent to the thread.
|
||||
in_thread_clone.store(true, SeqCst);
|
||||
let _just_sync = block_on(receiver);
|
||||
// explicitely droping lock_a_res. It would have been sufficient to just force it
|
||||
let _just_sync = receiver.recv();
|
||||
// explicitly dropping lock_a_res. It would have been sufficient to just force it
|
||||
// to be part of the move, but the intent seems clearer that way.
|
||||
drop(lock_a_res);
|
||||
});
|
||||
@@ -273,7 +270,7 @@ fn test_lock_blocking(directory: &dyn Directory) {
|
||||
assert!(in_thread.load(SeqCst));
|
||||
assert!(lock_a_res.is_ok());
|
||||
});
|
||||
assert!(block_on(receiver2).is_ok());
|
||||
assert!(receiver2.recv().is_ok());
|
||||
assert!(sender.send(()).is_ok());
|
||||
assert!(join_handle.join().is_ok());
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
|
||||
use futures::channel::oneshot;
|
||||
use futures::{Future, TryFutureExt};
|
||||
use crate::FutureResult;
|
||||
|
||||
/// Cloneable wrapper for callbacks registered when watching files of a `Directory`.
|
||||
#[derive(Clone)]
|
||||
@@ -74,12 +73,11 @@ impl WatchCallbackList {
|
||||
}
|
||||
|
||||
/// Triggers all callbacks
|
||||
pub fn broadcast(&self) -> impl Future<Output = ()> {
|
||||
pub fn broadcast(&self) -> FutureResult<()> {
|
||||
let callbacks = self.list_callback();
|
||||
let (sender, receiver) = oneshot::channel();
|
||||
let result = receiver.unwrap_or_else(|_| ());
|
||||
let (result, sender) = FutureResult::create("One of the callback panicked.");
|
||||
if callbacks.is_empty() {
|
||||
let _ = sender.send(());
|
||||
let _ = sender.send(Ok(()));
|
||||
return result;
|
||||
}
|
||||
let spawn_res = std::thread::Builder::new()
|
||||
@@ -88,7 +86,7 @@ impl WatchCallbackList {
|
||||
for callback in callbacks {
|
||||
callback.call();
|
||||
}
|
||||
let _ = sender.send(());
|
||||
let _ = sender.send(Ok(()));
|
||||
});
|
||||
if let Err(err) = spawn_res {
|
||||
error!(
|
||||
@@ -106,8 +104,6 @@ mod tests {
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use futures::executor::block_on;
|
||||
|
||||
use crate::directory::{WatchCallback, WatchCallbackList};
|
||||
|
||||
#[test]
|
||||
@@ -118,22 +114,18 @@ mod tests {
|
||||
let inc_callback = WatchCallback::new(move || {
|
||||
counter_clone.fetch_add(1, Ordering::SeqCst);
|
||||
});
|
||||
block_on(watch_event_router.broadcast());
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
let handle_a = watch_event_router.subscribe(inc_callback);
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
block_on(watch_event_router.broadcast());
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
assert_eq!(1, counter.load(Ordering::SeqCst));
|
||||
block_on(async {
|
||||
(
|
||||
watch_event_router.broadcast().await,
|
||||
watch_event_router.broadcast().await,
|
||||
watch_event_router.broadcast().await,
|
||||
)
|
||||
});
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
assert_eq!(4, counter.load(Ordering::SeqCst));
|
||||
mem::drop(handle_a);
|
||||
block_on(watch_event_router.broadcast());
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
assert_eq!(4, counter.load(Ordering::SeqCst));
|
||||
}
|
||||
|
||||
@@ -150,19 +142,15 @@ mod tests {
|
||||
let handle_a = watch_event_router.subscribe(inc_callback(1));
|
||||
let handle_a2 = watch_event_router.subscribe(inc_callback(10));
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
block_on(async {
|
||||
futures::join!(
|
||||
watch_event_router.broadcast(),
|
||||
watch_event_router.broadcast()
|
||||
)
|
||||
});
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
assert_eq!(22, counter.load(Ordering::SeqCst));
|
||||
mem::drop(handle_a);
|
||||
block_on(watch_event_router.broadcast());
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
assert_eq!(32, counter.load(Ordering::SeqCst));
|
||||
mem::drop(handle_a2);
|
||||
block_on(watch_event_router.broadcast());
|
||||
block_on(watch_event_router.broadcast());
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
assert_eq!(32, counter.load(Ordering::SeqCst));
|
||||
}
|
||||
|
||||
@@ -176,15 +164,12 @@ mod tests {
|
||||
});
|
||||
let handle_a = watch_event_router.subscribe(inc_callback);
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
block_on(async {
|
||||
let future1 = watch_event_router.broadcast();
|
||||
let future2 = watch_event_router.broadcast();
|
||||
futures::join!(future1, future2)
|
||||
});
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
assert_eq!(2, counter.load(Ordering::SeqCst));
|
||||
mem::drop(handle_a);
|
||||
let _ = watch_event_router.broadcast();
|
||||
block_on(watch_event_router.broadcast());
|
||||
watch_event_router.broadcast().wait().unwrap();
|
||||
assert_eq!(2, counter.load(Ordering::SeqCst));
|
||||
}
|
||||
}
|
||||
|
||||
22
src/error.rs
@@ -97,6 +97,10 @@ pub enum TantivyError {
|
||||
/// Index incompatible with current version of Tantivy.
|
||||
#[error("{0:?}")]
|
||||
IncompatibleIndex(Incompatibility),
|
||||
/// An internal error occurred. These are internal states that should not be reached.
|
||||
/// e.g. a datastructure is incorrectly initialized.
|
||||
#[error("Internal error: '{0}'")]
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
@@ -149,9 +153,21 @@ impl<Guard> From<PoisonError<Guard>> for TantivyError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<chrono::ParseError> for TantivyError {
|
||||
fn from(err: chrono::ParseError) -> TantivyError {
|
||||
TantivyError::InvalidArgument(err.to_string())
|
||||
impl From<time::error::Format> for TantivyError {
|
||||
fn from(err: time::error::Format) -> TantivyError {
|
||||
TantivyError::InvalidArgument(format!("Date formatting error: {err}"))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<time::error::Parse> for TantivyError {
|
||||
fn from(err: time::error::Parse) -> TantivyError {
|
||||
TantivyError::InvalidArgument(format!("Date parsing error: {err}"))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<time::error::ComponentRange> for TantivyError {
|
||||
fn from(err: time::error::ComponentRange) -> TantivyError {
|
||||
TantivyError::InvalidArgument(format!("Date range error: {err}"))
|
||||
}
|
||||
}
|
||||
|
||||
|
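With these conversions in place, `?` works directly on `time` parsing and formatting calls, and the failure surfaces as `TantivyError::InvalidArgument`. A small sketch (helper name illustrative; it uses the `tantivy::time` re-export so the error types line up with the impls above):

```rust
use tantivy::time::format_description::well_known::Rfc3339;
use tantivy::time::OffsetDateTime;
use tantivy::{DateTime, TantivyError};

/// Parse an RFC 3339 string into a tantivy `DateTime`; a malformed input is
/// mapped through the new `From<time::error::Parse>` impl above.
fn parse_rfc3339(input: &str) -> Result<DateTime, TantivyError> {
    let parsed = OffsetDateTime::parse(input, &Rfc3339)?;
    Ok(DateTime::from_utc(parsed))
}
```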
||||
@@ -188,14 +188,14 @@ mod bench {
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
|
||||
fn bench_alive_bitset_iter_deser_on_fly(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_access(bench: &mut Bencher) {
|
||||
fn bench_alive_bitset_access(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| {
|
||||
@@ -206,14 +206,14 @@ mod bench {
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
|
||||
fn bench_alive_bitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
|
||||
fn bench_alive_bitset_access_1_8_alive(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| {
|
||||
|
||||
@@ -30,9 +30,8 @@ pub use self::readers::FastFieldReaders;
|
||||
pub(crate) use self::readers::{type_and_cardinality, FastType};
|
||||
pub use self::serializer::{CompositeFastFieldSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use crate::chrono::{NaiveDateTime, Utc};
|
||||
use crate::schema::{Cardinality, FieldType, Type, Value};
|
||||
use crate::DocId;
|
||||
use crate::{DateTime, DocId};
|
||||
|
||||
mod alive_bitset;
|
||||
mod bytes;
|
||||
@@ -161,14 +160,14 @@ impl FastValue for f64 {
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for crate::DateTime {
|
||||
impl FastValue for DateTime {
|
||||
fn from_u64(timestamp_u64: u64) -> Self {
|
||||
let timestamp_i64 = i64::from_u64(timestamp_u64);
|
||||
crate::DateTime::from_utc(NaiveDateTime::from_timestamp(timestamp_i64, 0), Utc)
|
||||
let unix_timestamp = i64::from_u64(timestamp_u64);
|
||||
Self::from_unix_timestamp(unix_timestamp)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
self.timestamp().to_u64()
|
||||
self.into_unix_timestamp().to_u64()
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
@@ -179,7 +178,7 @@ impl FastValue for crate::DateTime {
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
self.timestamp().as_u64()
|
||||
self.into_unix_timestamp().as_u64()
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
@@ -188,12 +187,32 @@ impl FastValue for crate::DateTime {
|
||||
}
|
||||
|
||||
fn value_to_u64(value: &Value) -> u64 {
|
||||
match *value {
|
||||
Value::U64(ref val) => *val,
|
||||
Value::I64(ref val) => common::i64_to_u64(*val),
|
||||
Value::F64(ref val) => common::f64_to_u64(*val),
|
||||
Value::Date(ref datetime) => common::i64_to_u64(datetime.timestamp()),
|
||||
_ => panic!("Expected a u64/i64/f64 field, got {:?} ", value),
|
||||
match value {
|
||||
Value::U64(val) => val.to_u64(),
|
||||
Value::I64(val) => val.to_u64(),
|
||||
Value::F64(val) => val.to_u64(),
|
||||
Value::Date(val) => val.to_u64(),
|
||||
_ => panic!("Expected a u64/i64/f64/date field, got {:?} ", value),
|
||||
}
|
||||
}
|
||||
|
||||
/// The fast field type
|
||||
pub enum FastFieldType {
|
||||
/// Numeric type, e.g. f64.
|
||||
Numeric,
|
||||
/// Fast field stores string ids.
|
||||
String,
|
||||
/// Fast field stores string ids for facets.
|
||||
Facet,
|
||||
}
|
||||
|
||||
impl FastFieldType {
|
||||
fn is_storing_term_ids(&self) -> bool {
|
||||
matches!(self, FastFieldType::String | FastFieldType::Facet)
|
||||
}
|
||||
|
||||
fn is_facet(&self) -> bool {
|
||||
matches!(self, FastFieldType::Facet)
|
||||
}
|
||||
}
|
||||
|
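With chrono gone, a date fast field value is just the UTC unix timestamp pushed through the same order-preserving `i64 -> u64` mapping as plain integers, which is what keeps sorting and range queries on date fields correct. A small sketch of that invariant via the public `FastValue` trait (the function itself is illustrative):

```rust
use tantivy::fastfield::FastValue;
use tantivy::DateTime;

/// The fast-field encoding orders dates exactly like their unix timestamps.
fn encoding_preserves_order(a: DateTime, b: DateTime) -> bool {
    (a.into_unix_timestamp() <= b.into_unix_timestamp()) == (a.to_u64() <= b.to_u64())
}
```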
||||
@@ -201,6 +220,7 @@ fn value_to_u64(value: &Value) -> u64 {
|
||||
mod tests {
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::ops::Range;
|
||||
use std::path::Path;
|
||||
|
||||
use common::HasLen;
|
||||
@@ -212,7 +232,8 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::{Document, Field, NumericOptions, Schema, FAST};
|
||||
use crate::schema::{Document, Field, NumericOptions, Schema, FAST, STRING, TEXT};
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{Index, SegmentId, SegmentReader};
|
||||
|
||||
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
|
||||
@@ -233,7 +254,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield_i64_u64() {
|
||||
let datetime = crate::DateTime::from_utc(NaiveDateTime::from_timestamp(0i64, 0), Utc);
|
||||
let datetime = DateTime::from_utc(OffsetDateTime::UNIX_EPOCH);
|
||||
assert_eq!(i64::from_u64(datetime.to_u64()), 0i64);
|
||||
}
|
||||
|
||||
@@ -489,7 +510,8 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()))?;
|
||||
index_writer
|
||||
.add_document(doc!(date_field =>DateTime::from_utc(OffsetDateTime::now_utc())))?;
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
@@ -501,8 +523,7 @@ mod tests {
|
||||
.map(SegmentReader::segment_id)
|
||||
.collect();
|
||||
assert_eq!(segment_ids.len(), 2);
|
||||
let merge_future = index_writer.merge(&segment_ids[..]);
|
||||
futures::executor::block_on(merge_future)?;
|
||||
index_writer.merge(&segment_ids[..]).wait().unwrap();
|
||||
reader.reload()?;
|
||||
assert_eq!(reader.searcher().segment_readers().len(), 1);
|
||||
Ok(())
|
||||
@@ -510,7 +531,206 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_default_datetime() {
|
||||
assert_eq!(crate::DateTime::make_zero().timestamp(), 0i64);
|
||||
assert_eq!(0, DateTime::make_zero().into_unix_timestamp());
|
||||
}
|
||||
|
||||
fn get_vals_for_docs(ff: &MultiValuedFastFieldReader<u64>, docs: Range<u32>) -> Vec<u64> {
|
||||
let mut all = vec![];
|
||||
|
||||
for doc in docs {
|
||||
let mut out = vec![];
|
||||
ff.get_vals(doc, &mut out);
|
||||
all.extend(out);
|
||||
}
|
||||
all
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_text_fastfield() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
// first segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "BBBBB AAAAA", // term_ord 1,2
|
||||
))?;
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA BBBBB", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
get_vals_for_docs(&text_fast_field, 0..5),
|
||||
vec![1, 0, 0, 0, 1, 2]
|
||||
);
|
||||
|
||||
let mut out = vec![];
|
||||
text_fast_field.get_vals(3, &mut out);
|
||||
assert_eq!(out, vec![0, 1]);
|
||||
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 3);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
// default tokenizer applies lower case
|
||||
assert_eq!(bytes, "aaaaa".as_bytes());
|
||||
}
|
||||
|
||||
{
|
||||
// second segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "CCCCC AAAAA", // term_ord 1, after merge 2
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 2);
|
||||
let segment_reader = searcher.segment_reader(1);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(get_vals_for_docs(&text_fast_field, 0..3), vec![0, 1, 0]);
|
||||
}
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
get_vals_for_docs(&text_fast_field, 0..8),
|
||||
vec![1, 0, 0, 0, 1, 3 /* next segment */, 0, 2, 0]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
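The test above exercises the new fast field support for tokenized text: the fast field stores term ordinals per document, and the ordinals resolve back to terms through the segment's term dictionary. A condensed sketch of that read path (helper name, field and doc id are placeholders):

```rust
use tantivy::schema::Field;
use tantivy::{DocId, Index};

/// Read the term ordinals stored for one document of a `TEXT | FAST` field
/// and resolve the first one back to its term bytes.
fn read_text_fastfield(index: &Index, text_field: Field, doc: DocId) -> tantivy::Result<Vec<u8>> {
    let searcher = index.reader()?.searcher();
    let segment_reader = searcher.segment_reader(0);
    let term_ords = segment_reader
        .fast_fields()
        .u64s(text_field)
        .expect("text field must be declared with the FAST flag");

    let mut ords = Vec::new();
    term_ords.get_vals(doc, &mut ords);

    let mut term_bytes = Vec::new();
    if let Some(&ord) = ords.first() {
        // Ordinals are assigned in term order; the dictionary maps them back.
        segment_reader
            .inverted_index(text_field)?
            .terms()
            .ord_to_term(ord, &mut term_bytes)?;
    }
    Ok(term_bytes)
}
```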
||||
#[test]
|
||||
fn test_string_fastfield() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", STRING | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
// first segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "BBBBB", // term_ord 1
|
||||
))?;
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(get_vals_for_docs(&text_fast_field, 0..6), vec![1, 0, 0, 2]);
|
||||
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 3);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(bytes, "AAAAA".as_bytes());
|
||||
}
|
||||
|
||||
{
|
||||
// second segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "CCCCC", // term_ord 1, after merge 2
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 2);
|
||||
let segment_reader = searcher.segment_reader(1);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(get_vals_for_docs(&text_fast_field, 0..2), vec![0, 1]);
|
||||
}
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
get_vals_for_docs(&text_fast_field, 0..9),
|
||||
vec![1, 0, 0, 3 /* next segment */, 0, 2]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -527,16 +747,16 @@ mod tests {
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
date_field => crate::DateTime::from_u64(1i64.to_u64()),
|
||||
multi_date_field => crate::DateTime::from_u64(2i64.to_u64()),
|
||||
multi_date_field => crate::DateTime::from_u64(3i64.to_u64())
|
||||
date_field => DateTime::from_u64(1i64.to_u64()),
|
||||
multi_date_field => DateTime::from_u64(2i64.to_u64()),
|
||||
multi_date_field => DateTime::from_u64(3i64.to_u64())
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
date_field => crate::DateTime::from_u64(4i64.to_u64())
|
||||
date_field => DateTime::from_u64(4i64.to_u64())
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
multi_date_field => crate::DateTime::from_u64(5i64.to_u64()),
|
||||
multi_date_field => crate::DateTime::from_u64(6i64.to_u64())
|
||||
multi_date_field => DateTime::from_u64(5i64.to_u64()),
|
||||
multi_date_field => DateTime::from_u64(6i64.to_u64())
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
@@ -548,23 +768,23 @@ mod tests {
|
||||
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
|
||||
let mut dates = vec![];
|
||||
{
|
||||
assert_eq!(date_fast_field.get(0u32).timestamp(), 1i64);
|
||||
assert_eq!(date_fast_field.get(0u32).into_unix_timestamp(), 1i64);
|
||||
dates_fast_field.get_vals(0u32, &mut dates);
|
||||
assert_eq!(dates.len(), 2);
|
||||
assert_eq!(dates[0].timestamp(), 2i64);
|
||||
assert_eq!(dates[1].timestamp(), 3i64);
|
||||
assert_eq!(dates[0].into_unix_timestamp(), 2i64);
|
||||
assert_eq!(dates[1].into_unix_timestamp(), 3i64);
|
||||
}
|
||||
{
|
||||
assert_eq!(date_fast_field.get(1u32).timestamp(), 4i64);
|
||||
assert_eq!(date_fast_field.get(1u32).into_unix_timestamp(), 4i64);
|
||||
dates_fast_field.get_vals(1u32, &mut dates);
|
||||
assert!(dates.is_empty());
|
||||
}
|
||||
{
|
||||
assert_eq!(date_fast_field.get(2u32).timestamp(), 0i64);
|
||||
assert_eq!(date_fast_field.get(2u32).into_unix_timestamp(), 0i64);
|
||||
dates_fast_field.get_vals(2u32, &mut dates);
|
||||
assert_eq!(dates.len(), 2);
|
||||
assert_eq!(dates[0].timestamp(), 5i64);
|
||||
assert_eq!(dates[1].timestamp(), 6i64);
|
||||
assert_eq!(dates[0].into_unix_timestamp(), 5i64);
|
||||
assert_eq!(dates[1].into_unix_timestamp(), 6i64);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -6,9 +6,6 @@ pub use self::writer::MultiValuedFastFieldWriter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use chrono::Duration;
|
||||
use futures::executor::block_on;
|
||||
use proptest::strategy::Strategy;
|
||||
use proptest::{prop_oneof, proptest};
|
||||
use test_log::test;
|
||||
@@ -17,7 +14,9 @@ mod tests {
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
|
||||
use crate::{Document, Index, Term};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::{Duration, OffsetDateTime};
|
||||
use crate::{DateTime, Document, Index, Term};
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_u64() -> crate::Result<()> {
|
||||
@@ -70,22 +69,27 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let first_time_stamp = chrono::Utc::now();
|
||||
index_writer.add_document(
|
||||
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
|
||||
)?;
|
||||
index_writer.add_document(doc!(time_i=>0i64))?;
|
||||
let first_time_stamp = OffsetDateTime::now_utc();
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(first_time_stamp),
|
||||
date_field => DateTime::from_utc(first_time_stamp),
|
||||
time_i=>1i64))?;
|
||||
index_writer.add_document(doc!(time_i => 0i64))?;
|
||||
// add one second
|
||||
index_writer.add_document(
|
||||
doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64),
|
||||
)?;
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(1)),
|
||||
time_i => 2i64))?;
|
||||
// add another second
|
||||
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
|
||||
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64))?;
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(two_secs_ahead),
|
||||
date_field => DateTime::from_utc(two_secs_ahead),
|
||||
date_field => DateTime::from_utc(two_secs_ahead),
|
||||
time_i => 3i64))?;
|
||||
// add three seconds
|
||||
index_writer.add_document(
|
||||
doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64),
|
||||
)?;
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(3)),
|
||||
time_i => 4i64))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
@@ -97,7 +101,7 @@ mod tests {
|
||||
let parser = QueryParser::for_index(&index, vec![]);
|
||||
let query = parser.parse_query(&format!(
|
||||
"multi_date_field:\"{}\"",
|
||||
first_time_stamp.to_rfc3339()
|
||||
first_time_stamp.format(&Rfc3339)?,
|
||||
))?;
|
||||
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
|
||||
assert_eq!(results.len(), 1);
|
||||
@@ -108,9 +112,8 @@ mod tests {
|
||||
.get_first(date_field)
|
||||
.expect("cannot find value")
|
||||
.as_date()
|
||||
.unwrap()
|
||||
.timestamp(),
|
||||
first_time_stamp.timestamp()
|
||||
.unwrap(),
|
||||
DateTime::from_utc(first_time_stamp),
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
@@ -124,7 +127,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let query = parser.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))?;
|
||||
let query = parser.parse_query(&format!("\"{}\"", two_secs_ahead.format(&Rfc3339)?))?;
|
||||
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
|
||||
|
||||
assert_eq!(results.len(), 1);
|
||||
@@ -136,9 +139,8 @@ mod tests {
|
||||
.get_first(date_field)
|
||||
.expect("cannot find value")
|
||||
.as_date()
|
||||
.unwrap()
|
||||
.timestamp(),
|
||||
two_secs_ahead.timestamp()
|
||||
.unwrap(),
|
||||
DateTime::from_utc(two_secs_ahead)
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
@@ -154,8 +156,8 @@ mod tests {
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let range_q = format!(
|
||||
"multi_date_field:[{} TO {}}}",
|
||||
(first_time_stamp + Duration::seconds(1)).to_rfc3339(),
|
||||
(first_time_stamp + Duration::seconds(3)).to_rfc3339()
|
||||
(first_time_stamp + Duration::seconds(1)).format(&Rfc3339)?,
|
||||
(first_time_stamp + Duration::seconds(3)).format(&Rfc3339)?
|
||||
);
|
||||
let query = parser.parse_query(&range_q)?;
|
||||
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
|
||||
@@ -178,9 +180,8 @@ mod tests {
|
||||
.get_first(date_field)
|
||||
.expect("cannot find value")
|
||||
.as_date()
|
||||
.expect("value not of Date type")
|
||||
.timestamp(),
|
||||
(first_time_stamp + Duration::seconds(offset_sec)).timestamp()
|
||||
.expect("value not of Date type"),
|
||||
DateTime::from_utc(first_time_stamp + Duration::seconds(offset_sec)),
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
@@ -268,7 +269,7 @@ mod tests {
|
||||
IndexingOp::Merge => {
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
if segment_ids.len() >= 2 {
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.segment_updater().wait_merging_thread()?;
|
||||
}
|
||||
}
|
||||
@@ -283,7 +284,7 @@ mod tests {
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
if !segment_ids.is_empty() {
|
||||
block_on(index_writer.merge(&segment_ids)).unwrap();
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,22 +27,28 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `(start, stop)`, such that the values associated
|
||||
/// to the given document are `start..stop`.
|
||||
/// Returns `[start, end)`, such that the values associated
|
||||
/// to the given document are `start..end`.
|
||||
#[inline]
|
||||
fn range(&self, doc: DocId) -> Range<u64> {
|
||||
let start = self.idx_reader.get(doc);
|
||||
let stop = self.idx_reader.get(doc + 1);
|
||||
start..stop
|
||||
let end = self.idx_reader.get(doc + 1);
|
||||
start..end
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
#[inline]
|
||||
fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<Item>) {
|
||||
let len = (range.end - range.start) as usize;
|
||||
vals.resize(len, Item::make_zero());
|
||||
self.vals_reader.get_range(range.start, &mut vals[..]);
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
#[inline]
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
|
||||
let range = self.range(doc);
|
||||
let len = (range.end - range.start) as usize;
|
||||
vals.resize(len, Item::make_zero());
|
||||
self.vals_reader.get_range(range.start, &mut vals[..]);
|
||||
self.get_vals_for_range(range, vals);
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
|
||||
@@ -4,7 +4,7 @@ use fnv::FnvHashMap;
|
||||
use tantivy_bitpacker::minmax;
|
||||
|
||||
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
|
||||
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer};
|
||||
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field};
|
||||
@@ -38,17 +38,17 @@ pub struct MultiValuedFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<UnorderedTermId>,
|
||||
doc_index: Vec<u64>,
|
||||
is_facet: bool,
|
||||
fast_field_type: FastFieldType,
|
||||
}
|
||||
|
||||
impl MultiValuedFastFieldWriter {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub(crate) fn new(field: Field, is_facet: bool) -> Self {
|
||||
/// Creates a new `MultiValuedFastFieldWriter`
|
||||
pub(crate) fn new(field: Field, fast_field_type: FastFieldType) -> Self {
|
||||
MultiValuedFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
doc_index: Vec::new(),
|
||||
is_facet,
|
||||
fast_field_type,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,12 +77,13 @@ impl MultiValuedFastFieldWriter {
/// all of the matching field values present in the document.
pub fn add_document(&mut self, doc: &Document) {
self.next_doc();
// facets are indexed in the `SegmentWriter` as we encode their unordered id.
if !self.is_facet {
for field_value in doc.field_values() {
if field_value.field == self.field {
self.add_val(value_to_u64(field_value.value()));
}
// facets/texts are indexed in the `SegmentWriter` as we encode their unordered id.
if self.fast_field_type.is_storing_term_ids() {
return;
}
for field_value in doc.field_values() {
if field_value.field == self.field {
self.add_val(value_to_u64(field_value.value()));
}
}
}
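The `FastFieldType` passed to the writer replaces the old `is_facet` flag. Its definition is not shown in this diff; the following is a plausible reconstruction for illustration only, inferred from the variants (`Numeric`, `Facet`, `String`) and methods (`is_facet()`, `is_storing_term_ids()`) used in these hunks, and may differ from the real code:

```rust
// Reconstruction for illustration; the real definition lives in src/fastfield.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FastFieldType {
    /// Numeric fast field (u64/i64/f64/date): values are stored directly.
    Numeric,
    /// Facet fast field: values are unordered term ids resolved at serialization.
    Facet,
    /// Text fast field: like facets, values are term ids.
    String,
}

impl FastFieldType {
    fn is_facet(self) -> bool {
        matches!(self, FastFieldType::Facet)
    }

    /// Term-id-backed fields (facets and text) receive their values from the
    /// `SegmentWriter`, so `add_document` above skips them.
    fn is_storing_term_ids(self) -> bool {
        matches!(self, FastFieldType::Facet | FastFieldType::String)
    }
}
```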
@@ -158,15 +159,15 @@ impl MultiValuedFastFieldWriter {
|
||||
{
|
||||
// writing the values themselves.
|
||||
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
|
||||
match mapping_opt {
|
||||
Some(mapping) => {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
self.field,
|
||||
0u64,
|
||||
mapping.len() as u64,
|
||||
1,
|
||||
)?;
|
||||
if let Some(mapping) = mapping_opt {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
self.field,
|
||||
0u64,
|
||||
mapping.len() as u64,
|
||||
1,
|
||||
)?;
|
||||
|
||||
if self.fast_field_type.is_facet() {
|
||||
let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
doc_vals.clear();
|
||||
@@ -179,19 +180,27 @@ impl MultiValuedFastFieldWriter {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let val_min_max = minmax(self.vals.iter().cloned());
|
||||
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
} else {
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
// sort values in case of remapped doc_ids?
|
||||
for &val in vals {
|
||||
let remapped_vals = vals
|
||||
.iter()
|
||||
.map(|val| *mapping.get(val).expect("Missing term ordinal"));
|
||||
for val in remapped_vals {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let val_min_max = minmax(self.vals.iter().cloned());
|
||||
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
// sort values in case of remapped doc_ids?
|
||||
for &val in vals {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
value_serializer.close_field()?;
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::fastfield::{
|
||||
};
|
||||
use crate::schema::{Cardinality, Field, FieldType, Schema};
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use crate::TantivyError;
|
||||
use crate::{DateTime, TantivyError};
|
||||
|
||||
/// Provides access to all of the BitpackedFastFieldReader.
|
||||
///
|
||||
@@ -39,6 +39,9 @@ pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType,
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::Date, cardinality)),
|
||||
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
FieldType::Str(options) if options.is_fast() => {
|
||||
Some((FastType::U64, Cardinality::MultiValues))
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
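The new `FieldType::Str(options) if options.is_fast()` arm is what lets a text field act as a term-ordinal backed, multi-valued fast field. A sketch of the schema-level opt-in; the `set_fast()` setter on `TextOptions` is assumed here from the `is_fast()` accessor used in the diff, and the field name is illustrative:

```rust
use tantivy::schema::{Schema, STRING};
use tantivy::Index;

fn main() {
    let mut schema_builder = Schema::builder();
    // Assumed builder method: mark the raw-tokenized string field as fast.
    let category = schema_builder.add_text_field("category", STRING.set_fast());
    let index = Index::create_in_ram(schema_builder.build());
    // The reader side then exposes it via `u64s(category)` as term ordinals.
    let _ = (category, index);
}
```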
@@ -147,10 +150,10 @@ impl FastFieldReaders {
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader reader associated to `field`.
|
||||
/// Returns the `date` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns an Error.
|
||||
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<crate::DateTime>> {
|
||||
/// If `field` is not a date fast field, this method returns an Error.
|
||||
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<DateTime>> {
|
||||
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -195,13 +198,12 @@ impl FastFieldReaders {
|
||||
self.typed_fast_field_multi_reader(field)
|
||||
}
|
||||
|
||||
/// Returns a `crate::DateTime` multi-valued fast field reader reader associated to `field`.
|
||||
/// Returns a `time::OffsetDateTime` multi-valued fast field reader reader associated to
|
||||
/// `field`.
|
||||
///
|
||||
/// If `field` is not a `crate::DateTime` multi-valued fast field, this method returns an Error.
|
||||
pub fn dates(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> crate::Result<MultiValuedFastFieldReader<crate::DateTime>> {
|
||||
/// If `field` is not a `time::OffsetDateTime` multi-valued fast field, this method returns an
|
||||
/// Error.
|
||||
pub fn dates(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<DateTime>> {
|
||||
self.check_type(field, FastType::Date, Cardinality::MultiValues)?;
|
||||
self.typed_fast_field_multi_reader(field)
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use tantivy_bitpacker::BlockedBitpacker;
|
||||
|
||||
use super::multivalued::MultiValuedFastFieldWriter;
|
||||
use super::serializer::FastFieldStats;
|
||||
use super::FastFieldDataAccess;
|
||||
use super::{FastFieldDataAccess, FastFieldType};
|
||||
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
@@ -16,6 +16,7 @@ use crate::termdict::TermOrdinal;
|
||||
|
||||
/// The `FastFieldsWriter` groups all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
term_id_writers: Vec<MultiValuedFastFieldWriter>,
|
||||
single_value_writers: Vec<IntFastFieldWriter>,
|
||||
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
|
||||
bytes_value_writers: Vec<BytesFastFieldWriter>,
|
||||
@@ -33,6 +34,7 @@ impl FastFieldsWriter {
|
||||
/// Create all `FastFieldWriter` required by the schema.
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let mut single_value_writers = Vec::new();
|
||||
let mut term_id_writers = Vec::new();
|
||||
let mut multi_values_writers = Vec::new();
|
||||
let mut bytes_value_writers = Vec::new();
|
||||
|
||||
@@ -50,15 +52,22 @@ impl FastFieldsWriter {
|
||||
single_value_writers.push(fast_field_writer);
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(field, false);
|
||||
let fast_field_writer =
|
||||
MultiValuedFastFieldWriter::new(field, FastFieldType::Numeric);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
FieldType::Facet(_) => {
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
let fast_field_writer =
|
||||
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet);
|
||||
term_id_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Str(_) if field_entry.is_fast() => {
|
||||
let fast_field_writer =
|
||||
MultiValuedFastFieldWriter::new(field, FastFieldType::String);
|
||||
term_id_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Bytes(bytes_option) => {
|
||||
if bytes_option.is_fast() {
|
||||
@@ -70,6 +79,7 @@ impl FastFieldsWriter {
|
||||
}
|
||||
}
|
||||
FastFieldsWriter {
|
||||
term_id_writers,
|
||||
single_value_writers,
|
||||
multi_values_writers,
|
||||
bytes_value_writers,
|
||||
@@ -78,10 +88,15 @@ impl FastFieldsWriter {
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.single_value_writers
|
||||
self.term_id_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.single_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.multi_values_writers
|
||||
.iter()
|
||||
@@ -94,6 +109,14 @@ impl FastFieldsWriter {
|
||||
.sum::<usize>()
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_term_id_writer(&self, field: Field) -> Option<&MultiValuedFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.term_id_writers
|
||||
.iter()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_field_writer(&self, field: Field) -> Option<&IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
@@ -110,6 +133,17 @@ impl FastFieldsWriter {
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_term_id_writer_mut(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValuedFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.term_id_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Returns the fast field multi-value writer for the given field.
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
@@ -137,6 +171,9 @@ impl FastFieldsWriter {
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
for field_writer in &mut self.term_id_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
@@ -156,6 +193,10 @@ impl FastFieldsWriter {
|
||||
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
for field_writer in &self.term_id_writers {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
|
||||
}
|
||||
for field_writer in &self.single_value_writers {
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
@@ -244,6 +285,10 @@ impl IntFastFieldWriter {
|
||||
self.val_count += 1;
|
||||
}
|
||||
|
||||
/// Extract the fast field value from the document
|
||||
/// (or use the default value) and records it.
|
||||
///
|
||||
///
|
||||
/// Extract the value associated to the fast field for
|
||||
/// this document.
|
||||
///
|
||||
@@ -254,18 +299,17 @@ impl IntFastFieldWriter {
|
||||
/// instead.
|
||||
/// If the document has more than one value for the given field,
|
||||
/// only the first one is taken in account.
|
||||
fn extract_val(&self, doc: &Document) -> u64 {
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => super::value_to_u64(v),
|
||||
None => self.val_if_missing,
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the fast field value from the document
|
||||
/// (or use the default value) and records it.
|
||||
///
|
||||
/// Values for string fast fields are skipped.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
let val = self.extract_val(doc);
|
||||
self.add_val(val);
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => {
|
||||
self.add_val(super::value_to_u64(v));
|
||||
}
|
||||
None => {
|
||||
self.add_val(self.val_if_missing);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/// get iterator over the data
|
||||
@@ -284,6 +328,7 @@ impl IntFastFieldWriter {
|
||||
} else {
|
||||
(self.val_min, self.val_max)
|
||||
};
|
||||
|
||||
let fastfield_accessor = WriterFastFieldAccessProvider {
|
||||
doc_id_map,
|
||||
vals: &self.vals,
|
||||
|
||||
130
src/future_result.rs
Normal file
@@ -0,0 +1,130 @@
use std::future::Future;
use std::pin::Pin;
use std::task::Poll;

use crate::TantivyError;

/// `FutureResult` is a handle that makes it possible to wait for the completion
/// of an ongoing task.
///
/// Contrary to some `Future`, it does not need to be polled for the task to
/// progress. Dropping the `FutureResult` does not cancel the task being executed
/// either.
///
/// - In a sync context, you can call `FutureResult::wait()`. The function
/// does not rely on `block_on`.
/// - In an async context, you can call simply use `FutureResult` as a future.
pub struct FutureResult<T> {
    inner: Inner<T>,
}

enum Inner<T> {
    FailedBeforeStart(Option<TantivyError>),
    InProgress {
        receiver: oneshot::Receiver<crate::Result<T>>,
        error_msg_if_failure: &'static str,
    },
}

impl<T> From<TantivyError> for FutureResult<T> {
|
||||
fn from(err: TantivyError) -> Self {
|
||||
FutureResult {
|
||||
inner: Inner::FailedBeforeStart(Some(err)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> FutureResult<T> {
|
||||
pub(crate) fn create(
|
||||
error_msg_if_failure: &'static str,
|
||||
) -> (Self, oneshot::Sender<crate::Result<T>>) {
|
||||
let (sender, receiver) = oneshot::channel();
|
||||
let inner: Inner<T> = Inner::InProgress {
|
||||
receiver,
|
||||
error_msg_if_failure,
|
||||
};
|
||||
(FutureResult { inner }, sender)
|
||||
}
|
||||
|
||||
/// Blocks until the scheduled result is available.
|
||||
///
|
||||
/// In an async context, you should simply use `ScheduledResult` as a future.
|
||||
pub fn wait(self) -> crate::Result<T> {
|
||||
match self.inner {
|
||||
Inner::FailedBeforeStart(err) => Err(err.unwrap()),
|
||||
Inner::InProgress {
|
||||
receiver,
|
||||
error_msg_if_failure,
|
||||
} => receiver.recv().unwrap_or_else(|_| {
|
||||
Err(crate::TantivyError::SystemError(
|
||||
error_msg_if_failure.to_string(),
|
||||
))
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Future for FutureResult<T> {
|
||||
type Output = crate::Result<T>;
|
||||
|
||||
fn poll(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<Self::Output> {
|
||||
unsafe {
|
||||
match &mut Pin::get_unchecked_mut(self).inner {
|
||||
Inner::FailedBeforeStart(err) => Poll::Ready(Err(err.take().unwrap())),
|
||||
Inner::InProgress {
|
||||
receiver,
|
||||
error_msg_if_failure,
|
||||
} => match Future::poll(Pin::new_unchecked(receiver), cx) {
|
||||
Poll::Ready(oneshot_res) => {
|
||||
let res = oneshot_res.unwrap_or_else(|_| {
|
||||
Err(crate::TantivyError::SystemError(
|
||||
error_msg_if_failure.to_string(),
|
||||
))
|
||||
});
|
||||
Poll::Ready(res)
|
||||
}
|
||||
Poll::Pending => Poll::Pending,
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::executor::block_on;
|
||||
|
||||
use super::FutureResult;
|
||||
use crate::TantivyError;
|
||||
|
||||
#[test]
|
||||
fn test_scheduled_result_failed_to_schedule() {
|
||||
let scheduled_result: FutureResult<()> = FutureResult::from(TantivyError::Poisoned);
|
||||
let res = block_on(scheduled_result);
|
||||
assert!(matches!(res, Err(TantivyError::Poisoned)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
fn test_scheduled_result_error() {
|
||||
let (scheduled_result, tx): (FutureResult<()>, _) = FutureResult::create("failed");
|
||||
drop(tx);
|
||||
let res = block_on(scheduled_result);
|
||||
assert!(matches!(res, Err(TantivyError::SystemError(_))));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_scheduled_result_sent_success() {
|
||||
let (scheduled_result, tx): (FutureResult<u64>, _) = FutureResult::create("failed");
|
||||
tx.send(Ok(2u64)).unwrap();
|
||||
assert_eq!(block_on(scheduled_result).unwrap(), 2u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_scheduled_result_sent_error() {
|
||||
let (scheduled_result, tx): (FutureResult<u64>, _) = FutureResult::create("failed");
|
||||
tx.send(Err(TantivyError::Poisoned)).unwrap();
|
||||
let res = block_on(scheduled_result);
|
||||
assert!(matches!(res, Err(TantivyError::Poisoned)));
|
||||
}
|
||||
}
|
||||
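The tests above drive a `FutureResult` end to end. Inside the crate the handle is wired to a task through `FutureResult::create`, which is `pub(crate)`; a schematic of that internal pattern, matching what `SegmentUpdater::schedule_task` does further down in this diff. The rayon pool argument and the `schedule_answer` name are illustrative:

```rust
// Crate-internal sketch only: `FutureResult::create` is pub(crate).
fn schedule_answer(pool: &rayon::ThreadPool) -> crate::FutureResult<u64> {
    let (future_result, sender) = crate::FutureResult::create("answer task failed");
    pool.spawn(move || {
        // If `sender` were dropped without sending, the receiver side would
        // report a SystemError carrying the message passed to `create`.
        let _ = sender.send(Ok(42u64));
    });
    // Returned immediately; the caller picks `.wait()` or `.await`.
    future_result
}
```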
@@ -116,14 +116,14 @@ pub fn demux(
|
||||
) -> crate::Result<Vec<Index>> {
|
||||
let mut indices = vec![];
|
||||
for (target_segment_ord, output_directory) in output_directories.into_iter().enumerate() {
|
||||
let delete_bitsets = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
|
||||
let alive_bitset = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
|
||||
.into_iter()
|
||||
.map(Some)
|
||||
.collect_vec();
|
||||
let index = merge_filtered_segments(
|
||||
segments,
|
||||
target_settings.clone(),
|
||||
delete_bitsets,
|
||||
alive_bitset,
|
||||
output_directory,
|
||||
)?;
|
||||
indices.push(index);
|
||||
@@ -141,7 +141,7 @@ mod tests {
|
||||
use crate::{DocAddress, Term};
|
||||
|
||||
#[test]
|
||||
fn test_demux_map_to_deletebitset() {
|
||||
fn test_demux_map_to_alive_bitset() {
|
||||
let max_value = 2;
|
||||
let mut demux_mapping = DemuxMapping::default();
|
||||
// segment ordinal 0 mapping
|
||||
|
||||
@@ -5,8 +5,6 @@ use std::thread::JoinHandle;
|
||||
|
||||
use common::BitSet;
|
||||
use crossbeam::channel;
|
||||
use futures::executor::block_on;
|
||||
use futures::future::Future;
|
||||
use smallvec::smallvec;
|
||||
|
||||
use super::operation::{AddOperation, UserOperation};
|
||||
@@ -24,7 +22,7 @@ use crate::indexer::operation::DeleteOperation;
|
||||
use crate::indexer::stamper::Stamper;
|
||||
use crate::indexer::{MergePolicy, SegmentEntry, SegmentWriter};
|
||||
use crate::schema::{Document, IndexRecordOption, Term};
|
||||
use crate::Opstamp;
|
||||
use crate::{FutureResult, Opstamp};
|
||||
|
||||
// Size of the margin for the `memory_arena`. A segment is closed when the remaining memory
|
||||
// in the `memory_arena` goes below MARGIN_IN_BYTES.
|
||||
@@ -214,7 +212,7 @@ fn index_documents(
|
||||
meta.untrack_temp_docstore();
|
||||
// update segment_updater inventory to remove tempstore
|
||||
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
|
||||
block_on(segment_updater.schedule_add_segment(segment_entry))?;
|
||||
segment_updater.schedule_add_segment(segment_entry).wait()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -368,7 +366,9 @@ impl IndexWriter {
|
||||
pub fn add_segment(&self, segment_meta: SegmentMeta) -> crate::Result<()> {
|
||||
let delete_cursor = self.delete_queue.cursor();
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
|
||||
block_on(self.segment_updater.schedule_add_segment(segment_entry))
|
||||
self.segment_updater
|
||||
.schedule_add_segment(segment_entry)
|
||||
.wait()
|
||||
}
|
||||
|
||||
/// Creates a new segment.
|
||||
@@ -465,8 +465,8 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
/// Detects and removes the files that are not used by the index anymore.
|
||||
pub async fn garbage_collect_files(&self) -> crate::Result<GarbageCollectionResult> {
|
||||
self.segment_updater.schedule_garbage_collect().await
|
||||
pub fn garbage_collect_files(&self) -> FutureResult<GarbageCollectionResult> {
|
||||
self.segment_updater.schedule_garbage_collect()
|
||||
}
|
||||
|
||||
/// Deletes all documents from the index
|
||||
@@ -516,13 +516,10 @@ impl IndexWriter {
|
||||
/// Merges a given list of segments
|
||||
///
|
||||
/// `segment_ids` is required to be non-empty.
|
||||
pub fn merge(
|
||||
&mut self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Output = crate::Result<SegmentMeta>> {
|
||||
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> FutureResult<SegmentMeta> {
|
||||
let merge_operation = self.segment_updater.make_merge_operation(segment_ids);
|
||||
let segment_updater = self.segment_updater.clone();
|
||||
async move { segment_updater.start_merge(merge_operation)?.await }
|
||||
segment_updater.start_merge(merge_operation)
|
||||
}
|
||||
|
||||
/// Closes the current document channel send.
|
||||
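With `merge` now returning a `FutureResult<SegmentMeta>`, callers drop `block_on` and either block on the handle or await it. A small sketch of both call sites:

```rust
use tantivy::{IndexWriter, SegmentId, SegmentMeta};

// Synchronous caller: no executor needed anymore, just block on the handle.
fn merge_blocking(writer: &mut IndexWriter, ids: &[SegmentId]) -> tantivy::Result<SegmentMeta> {
    writer.merge(ids).wait()
}

// Async caller: `FutureResult` implements `Future`, so it can be awaited directly.
async fn merge_async(writer: &mut IndexWriter, ids: &[SegmentId]) -> tantivy::Result<SegmentMeta> {
    writer.merge(ids).await
}
```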
@@ -781,7 +778,6 @@ impl Drop for IndexWriter {
|
||||
mod tests {
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use futures::executor::block_on;
|
||||
use proptest::prelude::*;
|
||||
use proptest::prop_oneof;
|
||||
use proptest::strategy::Strategy;
|
||||
@@ -1456,7 +1452,7 @@ mod tests {
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
if segment_ids.len() >= 2 {
|
||||
block_on(index_writer.merge(&segment_ids)).unwrap();
|
||||
index_writer.merge(&segment_ids).wait().unwrap();
|
||||
assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
|
||||
}
|
||||
}
|
||||
@@ -1472,7 +1468,7 @@ mod tests {
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
if segment_ids.len() >= 2 {
|
||||
block_on(index_writer.merge(&segment_ids)).unwrap();
|
||||
index_writer.merge(&segment_ids).wait().unwrap();
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use chrono::Utc;
|
||||
use fnv::FnvHashMap;
|
||||
use murmurhash32::murmurhash2;
|
||||
|
||||
@@ -6,8 +5,10 @@ use crate::fastfield::FastValue;
|
||||
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
|
||||
use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
|
||||
use crate::schema::Type;
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::{OffsetDateTime, UtcOffset};
|
||||
use crate::tokenizer::TextAnalyzer;
|
||||
use crate::{DocId, Term};
|
||||
use crate::{DateTime, DocId, Term};
|
||||
|
||||
/// This object is a map storing the last position for a given path for the current document
|
||||
/// being indexed.
|
||||
@@ -56,7 +57,7 @@ struct IndexingPositionsPerPath {
|
||||
impl IndexingPositionsPerPath {
|
||||
fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
|
||||
self.positions_per_path
|
||||
.entry(murmurhash2(term.as_slice()))
|
||||
.entry(murmurhash2(term.value_bytes()))
|
||||
.or_insert_with(Default::default)
|
||||
}
|
||||
}
|
||||
@@ -148,10 +149,11 @@ fn index_json_value<'a>(
|
||||
json_term_writer.term_buffer,
|
||||
ctx,
|
||||
indexing_position,
|
||||
None,
|
||||
);
|
||||
}
|
||||
TextOrDateTime::DateTime(dt) => {
|
||||
json_term_writer.set_fast_value(dt);
|
||||
json_term_writer.set_fast_value(DateTime::from_utc(dt));
|
||||
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
|
||||
}
|
||||
},
|
||||
@@ -184,13 +186,13 @@ fn index_json_value<'a>(
|
||||
|
||||
enum TextOrDateTime<'a> {
|
||||
Text(&'a str),
|
||||
DateTime(crate::DateTime),
|
||||
DateTime(OffsetDateTime),
|
||||
}
|
||||
|
||||
fn infer_type_from_str(text: &str) -> TextOrDateTime {
|
||||
match chrono::DateTime::parse_from_rfc3339(text) {
|
||||
match OffsetDateTime::parse(text, &Rfc3339) {
|
||||
Ok(dt) => {
|
||||
let dt_utc = dt.with_timezone(&Utc);
|
||||
let dt_utc = dt.to_offset(UtcOffset::UTC);
|
||||
TextOrDateTime::DateTime(dt_utc)
|
||||
}
|
||||
Err(_) => TextOrDateTime::Text(text),
|
||||
@@ -206,7 +208,7 @@ impl<'a> JsonTermWriter<'a> {
|
||||
pub fn wrap(term_buffer: &'a mut Term) -> Self {
|
||||
term_buffer.clear_with_type(Type::Json);
|
||||
let mut path_stack = Vec::with_capacity(10);
|
||||
path_stack.push(5);
|
||||
path_stack.push(5); // magic number?
|
||||
Self {
|
||||
term_buffer,
|
||||
path_stack,
|
||||
@@ -248,8 +250,8 @@ impl<'a> JsonTermWriter<'a> {
|
||||
/// Returns the json path of the term being currently built.
|
||||
#[cfg(test)]
|
||||
pub(crate) fn path(&self) -> &[u8] {
|
||||
let end_of_path = self.path_stack.last().cloned().unwrap_or(6);
|
||||
&self.term().as_slice()[5..end_of_path - 1]
|
||||
let end_of_path = self.path_stack.last().cloned().unwrap_or(6); // TODO remove magic number
|
||||
&self.term().value_bytes()[..end_of_path - 1]
|
||||
}
|
||||
|
||||
pub fn set_fast_value<T: FastValue>(&mut self, val: T) {
|
||||
@@ -319,10 +321,7 @@ mod tests {
|
||||
let mut json_writer = JsonTermWriter::wrap(&mut term);
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00sred"
|
||||
)
|
||||
assert_eq!(json_writer.term().value_bytes(), b"color\x00sred")
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -334,8 +333,8 @@ mod tests {
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(-4i64);
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
|
||||
json_writer.term().value_bytes(),
|
||||
b"color\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
|
||||
)
|
||||
}
|
||||
|
||||
@@ -348,8 +347,8 @@ mod tests {
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(4u64);
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
|
||||
json_writer.term().value_bytes(),
|
||||
b"color\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
|
||||
)
|
||||
}
|
||||
|
||||
@@ -362,8 +361,8 @@ mod tests {
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_fast_value(4.0f64);
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
|
||||
json_writer.term().value_bytes(),
|
||||
b"color\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
|
||||
)
|
||||
}
|
||||
|
||||
@@ -378,8 +377,8 @@ mod tests {
|
||||
json_writer.push_path_segment("color");
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jattribute\x01color\x00sred"
|
||||
json_writer.term().value_bytes(),
|
||||
b"attribute\x01color\x00sred"
|
||||
)
|
||||
}
|
||||
|
||||
@@ -393,10 +392,7 @@ mod tests {
|
||||
json_writer.push_path_segment("hue");
|
||||
json_writer.pop_path_segment();
|
||||
json_writer.set_str("red");
|
||||
assert_eq!(
|
||||
json_writer.term().as_slice(),
|
||||
b"\x00\x00\x00\x01jcolor\x00sred"
|
||||
)
|
||||
assert_eq!(json_writer.term().value_bytes(), b"color\x00sred")
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -170,8 +170,8 @@ impl IndexMerger {
|
||||
index_settings: IndexSettings,
|
||||
segments: &[Segment],
|
||||
) -> crate::Result<IndexMerger> {
|
||||
let delete_bitsets = segments.iter().map(|_| None).collect_vec();
|
||||
Self::open_with_custom_alive_set(schema, index_settings, segments, delete_bitsets)
|
||||
let alive_bitset = segments.iter().map(|_| None).collect_vec();
|
||||
Self::open_with_custom_alive_set(schema, index_settings, segments, alive_bitset)
|
||||
}
|
||||
|
||||
// Create merge with a custom delete set.
|
||||
@@ -180,7 +180,7 @@ impl IndexMerger {
|
||||
// corresponds to the segment index.
|
||||
//
|
||||
// If `None` is provided for custom alive set, the regular alive set will be used.
|
||||
// If a delete_bitsets is provided, the union between the provided and regular
|
||||
// If a alive_bitset is provided, the union between the provided and regular
|
||||
// alive set will be used.
|
||||
//
|
||||
// This can be used to merge but also apply an additional filter.
|
||||
@@ -283,12 +283,12 @@ impl IndexMerger {
|
||||
for (field, field_entry) in self.schema.fields() {
|
||||
let field_type = field_entry.field_type();
|
||||
match field_type {
|
||||
FieldType::Facet(_) => {
|
||||
FieldType::Facet(_) | FieldType::Str(_) if field_type.is_fast() => {
|
||||
let term_ordinal_mapping = term_ord_mappings.remove(&field).expect(
|
||||
"Logic Error in Tantivy (Please report). Facet field should have required \
|
||||
a`term_ordinal_mapping`.",
|
||||
);
|
||||
self.write_hierarchical_facet_field(
|
||||
self.write_term_id_fast_field(
|
||||
field,
|
||||
&term_ordinal_mapping,
|
||||
fast_field_serializer,
|
||||
@@ -312,8 +312,8 @@ impl IndexMerger {
|
||||
self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::JsonObject(_) => {
|
||||
// We don't handle json / string fast field for the moment
|
||||
_ => {
|
||||
// We don't handle json fast field for the moment
|
||||
// They can be implemented using what is done
|
||||
// for facets in the future
|
||||
}
|
||||
@@ -590,14 +590,14 @@ impl IndexMerger {
|
||||
)
|
||||
}
|
||||
|
||||
fn write_hierarchical_facet_field(
|
||||
fn write_term_id_fast_field(
|
||||
&self,
|
||||
field: Field,
|
||||
term_ordinal_mappings: &TermOrdinalMapping,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<()> {
|
||||
debug_time!("write-hierarchical-facet-field");
|
||||
debug_time!("write-term-id-fast-field");
|
||||
|
||||
// Multifastfield consists of 2 fastfields.
|
||||
// The first serves as an index into the second one and is stricly increasing.
|
||||
@@ -848,6 +848,9 @@ impl IndexMerger {
|
||||
|
||||
let mut term_ord_mapping_opt = match field_type {
|
||||
FieldType::Facet(_) => Some(TermOrdinalMapping::new(max_term_ords)),
|
||||
FieldType::Str(options) if options.is_fast() => {
|
||||
Some(TermOrdinalMapping::new(max_term_ords))
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
|
||||
@@ -1133,7 +1136,6 @@ impl IndexMerger {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use futures::executor::block_on;
|
||||
use schema::FAST;
|
||||
|
||||
use crate::collector::tests::{
|
||||
@@ -1147,9 +1149,10 @@ mod tests {
|
||||
Cardinality, Document, Facet, FacetOptions, IndexRecordOption, NumericOptions, Term,
|
||||
TextFieldIndexing, INDEXED, TEXT,
|
||||
};
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{
|
||||
assert_nearly_equals, schema, DocAddress, DocSet, IndexSettings, IndexSortByField,
|
||||
IndexWriter, Order, Searcher, SegmentId,
|
||||
assert_nearly_equals, schema, DateTime, DocAddress, DocSet, IndexSettings,
|
||||
IndexSortByField, IndexWriter, Order, Searcher, SegmentId,
|
||||
};
|
||||
|
||||
#[test]
|
||||
@@ -1157,9 +1160,7 @@ mod tests {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_fieldtype = schema::TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_tokenizer("default")
|
||||
.set_index_option(IndexRecordOption::WithFreqs),
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
@@ -1169,14 +1170,14 @@ mod tests {
|
||||
let bytes_score_field = schema_builder.add_bytes_field("score_bytes", FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let reader = index.reader()?;
|
||||
let curr_time = chrono::Utc::now();
|
||||
let curr_time = OffsetDateTime::now_utc();
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "af b",
|
||||
score_field => 3u64,
|
||||
date_field => curr_time,
|
||||
date_field => DateTime::from_utc(curr_time),
|
||||
bytes_score_field => 3u32.to_be_bytes().as_ref()
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
@@ -1193,7 +1194,7 @@ mod tests {
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "af b",
|
||||
date_field => curr_time,
|
||||
date_field => DateTime::from_utc(curr_time),
|
||||
score_field => 11u64,
|
||||
bytes_score_field => 11u32.to_be_bytes().as_ref()
|
||||
))?;
|
||||
@@ -1209,7 +1210,7 @@ mod tests {
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
{
|
||||
@@ -1249,7 +1250,10 @@ mod tests {
|
||||
]
|
||||
);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)])?,
|
||||
get_doc_ids(vec![Term::from_field_date(
|
||||
date_field,
|
||||
DateTime::from_utc(curr_time)
|
||||
)])?,
|
||||
vec![DocAddress::new(0, 0), DocAddress::new(0, 3)]
|
||||
);
|
||||
}
|
||||
@@ -1458,7 +1462,7 @@ mod tests {
|
||||
{
|
||||
// merging the segments
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
@@ -1551,7 +1555,7 @@ mod tests {
|
||||
{
|
||||
// Test merging a single segment in order to remove deletes.
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
reader.reload()?;
|
||||
|
||||
let searcher = reader.searcher();
|
||||
@@ -1771,7 +1775,10 @@ mod tests {
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
|
||||
index_writer
|
||||
.merge(&segment_ids)
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
reader.reload().unwrap();
|
||||
test_searcher(
|
||||
@@ -1826,7 +1833,7 @@ mod tests {
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
reader.reload()?;
|
||||
// commit has not been called yet. The document should still be
|
||||
// there.
|
||||
@@ -1853,7 +1860,7 @@ mod tests {
|
||||
index_writer.commit()?;
|
||||
index_writer.delete_term(Term::from_field_u64(int_field, 1));
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
|
||||
// assert delete has not been committed
|
||||
reader.reload()?;
|
||||
@@ -1954,7 +1961,7 @@ mod tests {
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
reader.reload()?;
|
||||
@@ -2082,7 +2089,7 @@ mod tests {
|
||||
.iter()
|
||||
.map(|reader| reader.segment_id())
|
||||
.collect();
|
||||
block_on(writer.merge(&segment_ids[..]))?;
|
||||
writer.merge(&segment_ids[..]).wait()?;
|
||||
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::executor::block_on;
|
||||
|
||||
use crate::collector::TopDocs;
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::{AliveBitSet, FastFieldReader, MultiValuedFastFieldReader};
|
||||
@@ -50,7 +48,7 @@ mod tests {
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
|
||||
assert!(index_writer.merge(&segment_ids).wait().is_ok());
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
}
|
||||
index
|
||||
@@ -140,7 +138,7 @@ mod tests {
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
block_on(index_writer.merge(&segment_ids))?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
Ok(index)
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
use futures::executor::block_on;
|
||||
|
||||
use super::IndexWriter;
|
||||
use crate::Opstamp;
|
||||
use crate::{FutureResult, Opstamp};
|
||||
|
||||
/// A prepared commit
|
||||
pub struct PreparedCommit<'a> {
|
||||
@@ -35,9 +33,9 @@ impl<'a> PreparedCommit<'a> {
|
||||
}
|
||||
|
||||
/// Proceeds to commit.
|
||||
/// See `.commit_async()`.
|
||||
/// See `.commit_future()`.
|
||||
pub fn commit(self) -> crate::Result<Opstamp> {
|
||||
block_on(self.commit_async())
|
||||
self.commit_future().wait()
|
||||
}
|
||||
|
||||
/// Proceeds to commit.
|
||||
@@ -45,12 +43,10 @@ impl<'a> PreparedCommit<'a> {
|
||||
/// Unfortunately, contrary to what `PrepareCommit` may suggests,
|
||||
/// this operation is not at all really light.
|
||||
/// At this point deletes have not been flushed yet.
|
||||
pub async fn commit_async(self) -> crate::Result<Opstamp> {
|
||||
pub fn commit_future(self) -> FutureResult<Opstamp> {
|
||||
info!("committing {}", self.opstamp);
|
||||
self.index_writer
|
||||
.segment_updater()
|
||||
.schedule_commit(self.opstamp, self.payload)
|
||||
.await?;
|
||||
Ok(self.opstamp)
|
||||
}
|
||||
}
|
||||
|
||||
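With `commit_async` replaced by `commit_future`, a prepared commit can still be driven either way; `commit()` now simply calls `commit_future().wait()`. A short sketch of the synchronous path, with an illustrative schema and writer budget:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(body => "hello"))?;

    // `prepare_commit` yields a PreparedCommit; committing it blocks on the
    // scheduled commit task through the FutureResult handle.
    let prepared = writer.prepare_commit()?;
    let opstamp = prepared.commit()?;
    println!("committed at opstamp {opstamp}");
    Ok(())
}
```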
@@ -8,9 +8,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use fail::fail_point;
|
||||
use futures::channel::oneshot;
|
||||
use futures::executor::{ThreadPool, ThreadPoolBuilder};
|
||||
use futures::future::{Future, TryFutureExt};
|
||||
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||
|
||||
use super::segment_manager::SegmentManager;
|
||||
use crate::core::{
|
||||
@@ -29,7 +27,7 @@ use crate::indexer::{
|
||||
SegmentSerializer,
|
||||
};
|
||||
use crate::schema::Schema;
|
||||
use crate::{Opstamp, TantivyError};
|
||||
use crate::{FutureResult, Opstamp, TantivyError};
|
||||
|
||||
const NUM_MERGE_THREADS: usize = 4;
|
||||
|
||||
@@ -105,7 +103,7 @@ impl Deref for SegmentUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
async fn garbage_collect_files(
|
||||
fn garbage_collect_files(
|
||||
segment_updater: SegmentUpdater,
|
||||
) -> crate::Result<GarbageCollectionResult> {
|
||||
info!("Running garbage collection");
|
||||
@@ -309,18 +307,18 @@ impl SegmentUpdater {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
let pool = ThreadPoolBuilder::new()
.name_prefix("segment_updater")
.pool_size(1)
.create()
.thread_name(|_| "segment_updater".to_string())
.num_threads(1)
.build()
.map_err(|_| {
crate::TantivyError::SystemError(
"Failed to spawn segment updater thread".to_string(),
)
})?;
let merge_thread_pool = ThreadPoolBuilder::new()
.name_prefix("merge_thread")
.pool_size(NUM_MERGE_THREADS)
.create()
.thread_name(|i| format!("merge_thread_{i}"))
.num_threads(NUM_MERGE_THREADS)
.build()
.map_err(|_| {
crate::TantivyError::SystemError(
"Failed to spawn segment merging thread".to_string(),
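The executor swap above replaces `futures::executor::ThreadPool` with a `rayon` pool; the builder methods map almost one-to-one (`name_prefix` to `thread_name`, `pool_size` to `num_threads`, `create` to `build`). A standalone sketch of the new construction outside of tantivy, shown only to isolate the rayon API:

```rust
// Requires the `rayon` crate.
fn build_updater_pool() -> Result<rayon::ThreadPool, rayon::ThreadPoolBuildError> {
    rayon::ThreadPoolBuilder::new()
        .thread_name(|_| "segment_updater".to_string())
        .num_threads(1)
        .build()
}

fn build_merge_pool(num_threads: usize) -> Result<rayon::ThreadPool, rayon::ThreadPoolBuildError> {
    rayon::ThreadPoolBuilder::new()
        .thread_name(|i| format!("merge_thread_{i}"))
        .num_threads(num_threads)
        .build()
}
```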
@@ -349,39 +347,30 @@ impl SegmentUpdater {
|
||||
*self.merge_policy.write().unwrap() = arc_merge_policy;
|
||||
}
|
||||
|
||||
async fn schedule_task<
|
||||
T: 'static + Send,
|
||||
F: Future<Output = crate::Result<T>> + 'static + Send,
|
||||
>(
|
||||
fn schedule_task<T: 'static + Send, F: FnOnce() -> crate::Result<T> + 'static + Send>(
|
||||
&self,
|
||||
task: F,
|
||||
) -> crate::Result<T> {
|
||||
) -> FutureResult<T> {
|
||||
if !self.is_alive() {
|
||||
return Err(crate::TantivyError::SystemError(
|
||||
"Segment updater killed".to_string(),
|
||||
));
|
||||
return crate::TantivyError::SystemError("Segment updater killed".to_string()).into();
|
||||
}
|
||||
let (sender, receiver) = oneshot::channel();
|
||||
self.pool.spawn_ok(async move {
|
||||
let task_result = task.await;
|
||||
let (scheduled_result, sender) = FutureResult::create(
|
||||
"A segment_updater future did not succeed. This should never happen.",
|
||||
);
|
||||
self.pool.spawn(|| {
|
||||
let task_result = task();
|
||||
let _ = sender.send(task_result);
|
||||
});
|
||||
let task_result = receiver.await;
|
||||
task_result.unwrap_or_else(|_| {
|
||||
let err_msg =
|
||||
"A segment_updater future did not success. This should never happen.".to_string();
|
||||
Err(crate::TantivyError::SystemError(err_msg))
|
||||
})
|
||||
scheduled_result
|
||||
}
|
||||
|
||||
pub async fn schedule_add_segment(&self, segment_entry: SegmentEntry) -> crate::Result<()> {
|
||||
pub fn schedule_add_segment(&self, segment_entry: SegmentEntry) -> FutureResult<()> {
|
||||
let segment_updater = self.clone();
|
||||
self.schedule_task(async move {
|
||||
self.schedule_task(move || {
|
||||
segment_updater.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options().await;
|
||||
segment_updater.consider_merge_options();
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Orders `SegmentManager` to remove all segments
|
||||
@@ -448,9 +437,9 @@ impl SegmentUpdater {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn schedule_garbage_collect(&self) -> crate::Result<GarbageCollectionResult> {
|
||||
let garbage_collect_future = garbage_collect_files(self.clone());
|
||||
self.schedule_task(garbage_collect_future).await
|
||||
pub fn schedule_garbage_collect(&self) -> FutureResult<GarbageCollectionResult> {
|
||||
let self_clone = self.clone();
|
||||
self.schedule_task(move || garbage_collect_files(self_clone))
|
||||
}
|
||||
|
||||
/// List the files that are useful to the index.
|
||||
@@ -468,21 +457,20 @@ impl SegmentUpdater {
|
||||
files
|
||||
}
|
||||
|
||||
pub(crate) async fn schedule_commit(
|
||||
pub(crate) fn schedule_commit(
|
||||
&self,
|
||||
opstamp: Opstamp,
|
||||
payload: Option<String>,
|
||||
) -> crate::Result<()> {
|
||||
) -> FutureResult<Opstamp> {
|
||||
let segment_updater: SegmentUpdater = self.clone();
|
||||
self.schedule_task(async move {
|
||||
self.schedule_task(move || {
|
||||
let segment_entries = segment_updater.purge_deletes(opstamp)?;
|
||||
segment_updater.segment_manager.commit(segment_entries);
|
||||
segment_updater.save_metas(opstamp, payload)?;
|
||||
let _ = garbage_collect_files(segment_updater.clone()).await;
|
||||
segment_updater.consider_merge_options().await;
|
||||
Ok(())
|
||||
let _ = garbage_collect_files(segment_updater.clone());
|
||||
segment_updater.consider_merge_options();
|
||||
Ok(opstamp)
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
fn store_meta(&self, index_meta: &IndexMeta) {
|
||||
@@ -515,26 +503,33 @@ impl SegmentUpdater {
|
||||
// suggested and the moment when it ended up being executed.)
|
||||
//
|
||||
// `segment_ids` is required to be non-empty.
|
||||
pub fn start_merge(
|
||||
&self,
|
||||
merge_operation: MergeOperation,
|
||||
) -> crate::Result<impl Future<Output = crate::Result<SegmentMeta>>> {
|
||||
pub fn start_merge(&self, merge_operation: MergeOperation) -> FutureResult<SegmentMeta> {
|
||||
assert!(
|
||||
!merge_operation.segment_ids().is_empty(),
|
||||
"Segment_ids cannot be empty."
|
||||
);
|
||||
|
||||
let segment_updater = self.clone();
|
||||
let segment_entries: Vec<SegmentEntry> = self
|
||||
let segment_entries: Vec<SegmentEntry> = match self
|
||||
.segment_manager
|
||||
.start_merge(merge_operation.segment_ids())?;
|
||||
.start_merge(merge_operation.segment_ids())
|
||||
{
|
||||
Ok(segment_entries) => segment_entries,
|
||||
Err(err) => {
|
||||
warn!(
|
||||
"Starting the merge failed for the following reason. This is not fatal. {}",
|
||||
err
|
||||
);
|
||||
return err.into();
|
||||
}
|
||||
};
|
||||
|
||||
info!("Starting merge - {:?}", merge_operation.segment_ids());
|
||||
|
||||
let (merging_future_send, merging_future_recv) =
|
||||
oneshot::channel::<crate::Result<SegmentMeta>>();
|
||||
let (scheduled_result, merging_future_send) =
|
||||
FutureResult::create("Merge operation failed.");
|
||||
|
||||
self.merge_thread_pool.spawn_ok(async move {
|
||||
self.merge_thread_pool.spawn(move || {
|
||||
// The fact that `merge_operation` is moved here is important.
|
||||
// Its lifetime is used to track how many merging thread are currently running,
|
||||
// as well as which segment is currently in merge and therefore should not be
|
||||
@@ -545,28 +540,23 @@ impl SegmentUpdater {
|
||||
merge_operation.target_opstamp(),
|
||||
) {
|
||||
Ok(after_merge_segment_entry) => {
|
||||
let segment_meta = segment_updater
|
||||
.end_merge(merge_operation, after_merge_segment_entry)
|
||||
.await;
|
||||
let _send_result = merging_future_send.send(segment_meta);
|
||||
let segment_meta_res =
|
||||
segment_updater.end_merge(merge_operation, after_merge_segment_entry);
|
||||
let _send_result = merging_future_send.send(segment_meta_res);
|
||||
}
|
||||
Err(e) => {
|
||||
Err(merge_error) => {
|
||||
warn!(
|
||||
"Merge of {:?} was cancelled: {:?}",
|
||||
merge_operation.segment_ids().to_vec(),
|
||||
e
|
||||
merge_error
|
||||
);
|
||||
// ... cancel merge
|
||||
let _send_result = merging_future_send.send(Err(merge_error));
|
||||
assert!(!cfg!(test), "Merge failed.");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(merging_future_recv.unwrap_or_else(|e| {
|
||||
Err(crate::TantivyError::SystemError(
|
||||
"Merge failed:".to_string() + &e.to_string(),
|
||||
))
|
||||
}))
|
||||
scheduled_result
|
||||
}
|
||||
|
||||
pub(crate) fn get_mergeable_segments(&self) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||
@@ -575,7 +565,7 @@ impl SegmentUpdater {
|
||||
.get_mergeable_segments(&merge_segment_ids)
|
||||
}
|
||||
|
||||
async fn consider_merge_options(&self) {
|
||||
fn consider_merge_options(&self) {
|
||||
let (committed_segments, uncommitted_segments) = self.get_mergeable_segments();
|
||||
|
||||
// Committed segments cannot be merged with uncommitted_segments.
|
||||
@@ -601,23 +591,21 @@ impl SegmentUpdater {
|
||||
merge_candidates.extend(committed_merge_candidates);
|
||||
|
||||
for merge_operation in merge_candidates {
|
||||
if let Err(err) = self.start_merge(merge_operation) {
|
||||
warn!(
|
||||
"Starting the merge failed for the following reason. This is not fatal. {}",
|
||||
err
|
||||
);
|
||||
}
|
||||
// If a merge cannot be started this is not a fatal error.
|
||||
// We do log a warning in `start_merge`.
|
||||
let _ = self.start_merge(merge_operation);
|
||||
}
|
||||
}
|
||||
|
||||
async fn end_merge(
|
||||
/// Queues a `end_merge` in the segment updater and blocks until it is successfully processed.
|
||||
fn end_merge(
|
||||
&self,
|
||||
merge_operation: MergeOperation,
|
||||
mut after_merge_segment_entry: SegmentEntry,
|
||||
) -> crate::Result<SegmentMeta> {
|
||||
let segment_updater = self.clone();
|
||||
let after_merge_segment_meta = after_merge_segment_entry.meta().clone();
|
||||
self.schedule_task(async move {
|
||||
self.schedule_task(move || {
|
||||
info!("End merge {:?}", after_merge_segment_entry.meta());
|
||||
{
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
@@ -655,13 +643,13 @@ impl SegmentUpdater {
|
||||
.save_metas(previous_metas.opstamp, previous_metas.payload.clone())?;
|
||||
}
|
||||
|
||||
segment_updater.consider_merge_options().await;
|
||||
segment_updater.consider_merge_options();
|
||||
} // we drop all possible handle to a now useless `SegmentMeta`.
|
||||
|
||||
let _ = garbage_collect_files(segment_updater).await;
|
||||
let _ = garbage_collect_files(segment_updater);
|
||||
Ok(())
|
||||
})
|
||||
.await?;
|
||||
.wait()?;
|
||||
Ok(after_merge_segment_meta)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
|
||||
use super::operation::AddOperation;
|
||||
use crate::core::Segment;
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fastfield::{FastFieldsWriter, FastValue as _};
|
||||
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
|
||||
use crate::indexer::json_term_writer::index_json_values;
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::postings::{
|
||||
compute_table_size, serialize_postings, IndexingContext, IndexingPosition,
|
||||
PerFieldPostingsWriter, PostingsWriter,
|
||||
serialize_postings, IndexingContext, IndexingPosition, PerFieldPostingsWriter, PostingsWriter,
|
||||
};
|
||||
use crate::schema::{FieldEntry, FieldType, FieldValue, Schema, Term, Value};
|
||||
use crate::store::{StoreReader, StoreWriter};
|
||||
@@ -16,25 +15,6 @@ use crate::tokenizer::{
|
||||
};
|
||||
use crate::{DocId, Document, Opstamp, SegmentComponent};
|
||||
|
||||
/// Computes the initial size of the hash table.
|
||||
///
|
||||
/// Returns the recommended initial table size as a power of 2.
|
||||
///
|
||||
/// Note this is a very dumb way to compute log2, but it is easier to proofread that way.
|
||||
fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
|
||||
let table_memory_upper_bound = per_thread_memory_budget / 3;
|
||||
(10..20) // We cap it at 2^19 = 512K capacity.
|
||||
.map(|power| 1 << power)
|
||||
.take_while(|capacity| compute_table_size(*capacity) < table_memory_upper_bound)
|
||||
.last()
|
||||
.ok_or_else(|| {
|
||||
crate::TantivyError::InvalidArgument(format!(
|
||||
"per thread memory budget (={per_thread_memory_budget}) is too small. Raise the \
|
||||
memory budget or lower the number of threads."
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
fn remap_doc_opstamps(
|
||||
opstamps: Vec<Opstamp>,
|
||||
doc_id_mapping_opt: Option<&DocIdMapping>,
|
||||
@@ -78,12 +58,11 @@ impl SegmentWriter {
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(
|
||||
memory_budget_in_bytes: usize,
|
||||
_memory_budget_in_bytes: usize,
|
||||
segment: Segment,
|
||||
schema: Schema,
|
||||
) -> crate::Result<SegmentWriter> {
|
||||
let tokenizer_manager = segment.index().tokenizers().clone();
|
||||
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
|
||||
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
|
||||
let per_field_text_analyzers = schema
|
||||
@@ -106,7 +85,7 @@ impl SegmentWriter {
|
||||
.collect();
|
||||
Ok(SegmentWriter {
|
||||
max_doc: 0,
|
||||
ctx: IndexingContext::new(table_size),
|
||||
ctx: IndexingContext::new(),
|
||||
per_field_postings_writers,
|
||||
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
|
||||
segment_serializer,
|
||||
@@ -149,6 +128,7 @@ impl SegmentWriter {
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.ctx.mem_usage()
|
||||
+ self.fieldnorms_writer.mem_usage()
|
||||
+ self.per_field_postings_writers.mem_usage()
|
||||
+ self.fast_field_writers.mem_usage()
|
||||
+ self.segment_serializer.mem_usage()
|
||||
}
|
||||
@@ -188,7 +168,7 @@ impl SegmentWriter {
|
||||
});
|
||||
if let Some(unordered_term_id) = unordered_term_id_opt {
|
||||
self.fast_field_writers
|
||||
.get_multivalue_writer_mut(field)
|
||||
.get_term_id_writer_mut(field)
|
||||
.expect("writer for facet missing")
|
||||
.add_val(unordered_term_id);
|
||||
}
|
||||
@@ -221,18 +201,22 @@ impl SegmentWriter {
|
||||
}
|
||||
|
||||
let mut indexing_position = IndexingPosition::default();
|
||||
|
||||
for mut token_stream in token_streams {
|
||||
assert_eq!(term_buffer.as_slice().len(), 5);
|
||||
// assert_eq!(term_buffer.as_slice().len(), 5);
|
||||
postings_writer.index_text(
|
||||
doc_id,
|
||||
&mut *token_stream,
|
||||
term_buffer,
|
||||
ctx,
|
||||
&mut indexing_position,
|
||||
self.fast_field_writers.get_term_id_writer_mut(field),
|
||||
);
|
||||
}
|
||||
self.fieldnorms_writer
|
||||
.record(doc_id, field, indexing_position.num_tokens);
|
||||
if field_entry.has_fieldnorms() {
|
||||
self.fieldnorms_writer
|
||||
.record(doc_id, field, indexing_position.num_tokens);
|
||||
}
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
for value in values {
|
||||
@@ -244,7 +228,7 @@ impl SegmentWriter {
|
||||
FieldType::Date(_) => {
|
||||
for value in values {
|
||||
let date_val = value.as_date().ok_or_else(make_schema_error)?;
|
||||
term_buffer.set_i64(date_val.timestamp());
|
||||
term_buffer.set_u64(date_val.to_u64());
|
||||
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
|
||||
}
|
||||
}
|
||||
@@ -414,25 +398,15 @@ pub fn prepare_doc_for_store(doc: Document, schema: &Schema) -> Document {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use chrono::Utc;
|
||||
|
||||
use super::compute_initial_table_size;
|
||||
use crate::collector::Count;
|
||||
use crate::indexer::json_term_writer::JsonTermWriter;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::query::PhraseQuery;
|
||||
use crate::schema::{IndexRecordOption, Schema, Type, STORED, STRING, TEXT};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::tokenizer::{PreTokenizedString, Token};
|
||||
use crate::{DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED};
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11);
|
||||
assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14);
|
||||
assert_eq!(compute_initial_table_size(10_000_000).unwrap(), 1 << 17);
|
||||
assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19);
|
||||
assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19);
|
||||
}
|
||||
use crate::{DateTime, DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED};
|
||||
|
||||
#[test]
|
||||
fn test_prepare_for_store() {
|
||||
@@ -523,11 +497,9 @@ mod tests {
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("date");
|
||||
json_term_writer.set_fast_value(
|
||||
chrono::DateTime::parse_from_rfc3339("1985-04-12T23:20:50.52Z")
|
||||
.unwrap()
|
||||
.with_timezone(&Utc),
|
||||
);
|
||||
json_term_writer.set_fast_value(DateTime::from_utc(
|
||||
OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
|
||||
));
|
||||
assert!(term_stream.advance());
|
||||
assert_eq!(term_stream.key(), json_term_writer.term().value_bytes());
|
||||
|
||||
|
||||
100
src/lib.rs
@@ -123,10 +123,95 @@ mod functional_test;
|
||||
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
mod future_result;
|
||||
|
||||
pub use chrono;
|
||||
/// Re-export of the `time` crate
|
||||
///
|
||||
/// Tantivy uses [`time`](https://crates.io/crates/time) for dates.
|
||||
pub use time;
|
||||
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
|
||||
|
||||
/// A date/time value with second precision.
|
||||
///
|
||||
/// This timestamp does not carry any explicit time zone information.
|
||||
/// Users are responsible for applying the provided conversion
|
||||
/// functions consistently. Internally the time zone is assumed
|
||||
/// to be UTC, which is also used implicitly for JSON serialization.
|
||||
///
|
||||
/// All constructors and conversions are provided as explicit
|
||||
/// functions and not by implementing any `From`/`Into` traits
|
||||
/// to prevent unintended usage.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct DateTime {
|
||||
unix_timestamp: i64,
|
||||
}
|
||||
|
||||
impl DateTime {
|
||||
/// Create new from UNIX timestamp
|
||||
pub const fn from_unix_timestamp(unix_timestamp: i64) -> Self {
|
||||
Self { unix_timestamp }
|
||||
}
|
||||
|
||||
/// Create new from `OffsetDateTime`
|
||||
///
|
||||
/// The given date/time is converted to UTC and the actual
|
||||
/// time zone is discarded.
|
||||
pub const fn from_utc(dt: OffsetDateTime) -> Self {
|
||||
Self::from_unix_timestamp(dt.unix_timestamp())
|
||||
}
|
||||
|
||||
/// Create new from `PrimitiveDateTime`
|
||||
///
|
||||
/// Implicitly assumes that the given date/time is in UTC!
|
||||
/// Otherwise the original value must only be reobtained with
|
||||
/// [`Self::into_primitive()`].
|
||||
pub const fn from_primitive(dt: PrimitiveDateTime) -> Self {
|
||||
Self::from_utc(dt.assume_utc())
|
||||
}
|
||||
|
||||
/// Convert to UNIX timestamp
|
||||
pub const fn into_unix_timestamp(self) -> i64 {
|
||||
let Self { unix_timestamp } = self;
|
||||
unix_timestamp
|
||||
}
|
||||
|
||||
/// Convert to UTC `OffsetDateTime`
|
||||
pub fn into_utc(self) -> OffsetDateTime {
|
||||
let Self { unix_timestamp } = self;
|
||||
let utc_datetime =
|
||||
OffsetDateTime::from_unix_timestamp(unix_timestamp).expect("valid UNIX timestamp");
|
||||
debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
|
||||
utc_datetime
|
||||
}
|
||||
|
||||
/// Convert to `OffsetDateTime` with the given time zone
|
||||
pub fn into_offset(self, offset: UtcOffset) -> OffsetDateTime {
|
||||
self.into_utc().to_offset(offset)
|
||||
}
|
||||
|
||||
/// Convert to `PrimitiveDateTime` without any time zone
|
||||
///
|
||||
/// The value should have been constructed with [`Self::from_primitive()`].
|
||||
/// Otherwise the time zone is implicitly assumed to be UTC.
|
||||
pub fn into_primitive(self) -> PrimitiveDateTime {
|
||||
let utc_datetime = self.into_utc();
|
||||
// Discard the UTC time zone offset
|
||||
debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
|
||||
PrimitiveDateTime::new(utc_datetime.date(), utc_datetime.time())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for DateTime {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let utc_rfc3339 = self.into_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
|
||||
f.write_str(&utc_rfc3339)
|
||||
}
|
||||
}
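A minimal usage sketch of the `DateTime` wrapper added above, relying only on the constructors and conversions shown and on the `tantivy::time` re-export introduced in this diff (the concrete date values are illustrative):

```rust
use tantivy::time::{Date, Month, OffsetDateTime, PrimitiveDateTime, Time, UtcOffset};
use tantivy::DateTime;

fn main() {
    // Build from a time-zone-less value; it is interpreted as UTC internally.
    let date = Date::from_calendar_date(1982, Month::September, 17).unwrap();
    let time = Time::from_hms(13, 20, 0).unwrap();
    let dt = DateTime::from_primitive(PrimitiveDateTime::new(date, time));

    // The primitive value round-trips unchanged.
    assert_eq!(dt.into_primitive(), PrimitiveDateTime::new(date, time));

    // From an `OffsetDateTime`, the offset is normalized to UTC and discarded.
    let now = DateTime::from_utc(OffsetDateTime::now_utc());
    let with_offset = now.into_offset(UtcOffset::from_hms(2, 0, 0).unwrap());
    assert_eq!(DateTime::from_utc(with_offset), now);
}
```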
|
||||
|
||||
pub use crate::error::TantivyError;
|
||||
pub use crate::future_result::FutureResult;
|
||||
|
||||
/// Tantivy result.
|
||||
///
|
||||
@@ -138,9 +223,6 @@ pub type Result<T> = std::result::Result<T, TantivyError>;
|
||||
#[cfg(feature = "quickwit")]
|
||||
pub type AsyncIoResult<T> = std::result::Result<T, crate::error::AsyncIoError>;
|
||||
|
||||
/// Tantivy DateTime
|
||||
pub type DateTime = chrono::DateTime<chrono::Utc>;
|
||||
|
||||
mod core;
|
||||
mod indexer;
|
||||
|
||||
@@ -308,6 +390,7 @@ pub mod tests {
|
||||
use crate::core::SegmentReader;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::schema::*;
|
||||
use crate::{DocAddress, Index, Postings, ReloadPolicy};
|
||||
@@ -935,8 +1018,6 @@ pub mod tests {
|
||||
// motivated by #729
|
||||
#[test]
|
||||
fn test_update_via_delete_insert() -> crate::Result<()> {
|
||||
use futures::executor::block_on;
|
||||
|
||||
use crate::collector::Count;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::AllQuery;
|
||||
@@ -990,8 +1071,7 @@ pub mod tests {
|
||||
.iter()
|
||||
.map(|reader| reader.segment_id())
|
||||
.collect();
|
||||
block_on(index_writer.merge(&segment_ids)).unwrap();
|
||||
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_reader.reload()?;
|
||||
let searcher = index_reader.searcher();
|
||||
assert_eq!(searcher.search(&AllQuery, &Count)?, DOC_COUNT as usize);
|
||||
@@ -1006,6 +1086,7 @@ pub mod tests {
|
||||
let schema = builder.build();
|
||||
let index = Index::create_in_dir(&index_path, schema)?;
|
||||
let mut writer = index.writer(50_000_000)?;
|
||||
writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
for _ in 0..5000 {
|
||||
writer.add_document(doc!(body => "foo"))?;
|
||||
writer.add_document(doc!(body => "boo"))?;
|
||||
@@ -1017,8 +1098,7 @@ pub mod tests {
|
||||
writer.delete_term(Term::from_field_text(body, "foo"));
|
||||
writer.commit()?;
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let _ = futures::executor::block_on(writer.merge(&segment_ids));
|
||||
|
||||
writer.merge(&segment_ids).wait()?;
|
||||
assert!(index.validate_checksum()?.is_empty());
|
||||
Ok(())
|
||||
}
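As the updated tests above show, `IndexWriter::merge` now returns a `FutureResult` that can be blocked on with `.wait()` instead of `futures::executor::block_on`. A hedged migration sketch (the helper name and schema are ours):

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{Index, IndexWriter};

/// Merges all searchable segments, blocking until the merge finishes.
fn merge_all_segments(index: &Index, writer: &mut IndexWriter) -> tantivy::Result<()> {
    let segment_ids = index.searchable_segment_ids()?;
    if !segment_ids.is_empty() {
        // `.wait()` replaces the previous `futures::executor::block_on(..)` pattern.
        writer.merge(&segment_ids).wait()?;
    }
    Ok(())
}

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(tantivy::doc!(body => "hello"))?;
    writer.commit()?;
    merge_all_segments(&index, &mut writer)
}
```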
|
||||
|
||||
@@ -1,27 +1,24 @@
|
||||
use crate::postings::stacker::{MemoryArena, TermHashMap};
|
||||
use crate::postings::stacker::MemoryArena;
|
||||
|
||||
/// IndexingContext contains all of the transient memory arenas
|
||||
/// required for building the inverted index.
|
||||
pub(crate) struct IndexingContext {
|
||||
/// The term index is an adhoc hashmap,
|
||||
/// itself backed by a dedicated memory arena.
|
||||
pub term_index: TermHashMap,
|
||||
/// Arena is a memory arena that stores posting lists / term frequencies / positions.
|
||||
pub arena: MemoryArena,
|
||||
pub arena_terms: MemoryArena,
|
||||
}
|
||||
|
||||
impl IndexingContext {
|
||||
/// Create a new IndexingContext given the size of the term hash map.
|
||||
pub(crate) fn new(table_size: usize) -> IndexingContext {
|
||||
let term_index = TermHashMap::new(table_size);
|
||||
pub(crate) fn new() -> IndexingContext {
|
||||
IndexingContext {
|
||||
arena: MemoryArena::new(),
|
||||
term_index,
|
||||
arena_terms: MemoryArena::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the memory usage for the inverted index memory arenas, in bytes.
|
||||
pub(crate) fn mem_usage(&self) -> usize {
|
||||
self.term_index.mem_usage() + self.arena.mem_usage()
|
||||
self.arena.mem_usage() + self.arena_terms.mem_usage()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
use std::io;
|
||||
|
||||
use super::stacker::TermHashMap;
|
||||
use crate::fastfield::MultiValuedFastFieldWriter;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||
use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder};
|
||||
@@ -25,6 +27,14 @@ impl<Rec: Recorder> From<JsonPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
fn mem_usage(&self) -> usize {
|
||||
self.str_posting_writer.mem_usage() + self.non_str_posting_writer.mem_usage()
|
||||
}
|
||||
|
||||
fn term_map(&self) -> &TermHashMap {
|
||||
self.str_posting_writer.term_map()
|
||||
}
|
||||
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: crate::DocId,
|
||||
@@ -42,6 +52,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
term_buffer: &mut Term,
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
_fast_field_writer: Option<&mut MultiValuedFastFieldWriter>,
|
||||
) {
|
||||
self.str_posting_writer.index_text(
|
||||
doc_id,
|
||||
@@ -49,6 +60,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
term_buffer,
|
||||
ctx,
|
||||
indexing_position,
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
@@ -71,6 +83,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
&self.str_posting_writer.term_map,
|
||||
serializer,
|
||||
)?;
|
||||
} else {
|
||||
@@ -80,6 +93,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
&self.str_posting_writer.term_map,
|
||||
serializer,
|
||||
)?;
|
||||
}
|
||||
|
||||
@@ -26,7 +26,6 @@ pub(crate) use self::postings_writer::{serialize_postings, IndexingPosition, Pos
|
||||
pub use self::segment_postings::SegmentPostings;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
pub(crate) use self::skip::{BlockInfo, SkipReader};
|
||||
pub(crate) use self::stacker::compute_table_size;
|
||||
pub use self::term_info::TermInfo;
|
||||
|
||||
pub(crate) type UnorderedTermId = u64;
|
||||
|
||||
@@ -10,9 +10,10 @@ pub(crate) struct PerFieldPostingsWriter {
|
||||
|
||||
impl PerFieldPostingsWriter {
|
||||
pub fn for_schema(schema: &Schema) -> Self {
|
||||
let num_fields = schema.num_fields();
|
||||
let per_field_postings_writers = schema
|
||||
.fields()
|
||||
.map(|(_, field_entry)| posting_writer_from_field_entry(field_entry))
|
||||
.map(|(_, field_entry)| posting_writer_from_field_entry(field_entry, num_fields))
|
||||
.collect();
|
||||
PerFieldPostingsWriter {
|
||||
per_field_postings_writers,
|
||||
@@ -26,9 +27,19 @@ impl PerFieldPostingsWriter {
|
||||
pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut dyn PostingsWriter {
|
||||
self.per_field_postings_writers[field.field_id() as usize].as_mut()
|
||||
}
|
||||
|
||||
pub(crate) fn mem_usage(&self) -> usize {
|
||||
self.per_field_postings_writers
|
||||
.iter()
|
||||
.map(|postings_writer| postings_writer.mem_usage())
|
||||
.sum()
|
||||
}
|
||||
}
|
||||
|
||||
fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter> {
|
||||
fn posting_writer_from_field_entry(
|
||||
field_entry: &FieldEntry,
|
||||
_num_fields: usize,
|
||||
) -> Box<dyn PostingsWriter> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::Range;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
|
||||
use super::stacker::Addr;
|
||||
use super::stacker::{Addr, TermHashMap};
|
||||
use crate::fastfield::MultiValuedFastFieldWriter;
|
||||
use crate::fieldnorm::FieldNormReaders;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::recorder::{BufferLender, Recorder};
|
||||
@@ -20,31 +20,6 @@ use crate::DocId;
|
||||
|
||||
const POSITION_GAP: u32 = 1;
|
||||
|
||||
fn make_field_partition(
|
||||
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
|
||||
) -> Vec<(Field, Range<usize>)> {
|
||||
let term_offsets_it = term_offsets
|
||||
.iter()
|
||||
.map(|(term, _, _)| term.field())
|
||||
.enumerate();
|
||||
let mut prev_field_opt = None;
|
||||
let mut fields = vec![];
|
||||
let mut offsets = vec![];
|
||||
for (offset, field) in term_offsets_it {
|
||||
if Some(field) != prev_field_opt {
|
||||
prev_field_opt = Some(field);
|
||||
fields.push(field);
|
||||
offsets.push(offset);
|
||||
}
|
||||
}
|
||||
offsets.push(term_offsets.len());
|
||||
let mut field_offsets = vec![];
|
||||
for i in 0..fields.len() {
|
||||
field_offsets.push((fields[i], offsets[i]..offsets[i + 1]));
|
||||
}
|
||||
field_offsets
|
||||
}
|
||||
|
||||
/// Serialize the inverted index.
|
||||
/// It pushes all terms, one field at a time, towards the
|
||||
/// postings serializer.
|
||||
@@ -56,23 +31,23 @@ pub(crate) fn serialize_postings(
|
||||
schema: &Schema,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
|
||||
Vec::with_capacity(ctx.term_index.len());
|
||||
term_offsets.extend(ctx.term_index.iter());
|
||||
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
|
||||
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
|
||||
HashMap::new();
|
||||
|
||||
let field_offsets = make_field_partition(&term_offsets);
|
||||
for (field, byte_offsets) in field_offsets {
|
||||
for (field, _) in schema.fields() {
|
||||
let postings_writer = per_field_postings_writers.get_for_field(field);
|
||||
|
||||
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
|
||||
Vec::with_capacity(postings_writer.term_map().len());
|
||||
term_offsets.extend(postings_writer.term_map().iter(&ctx.arena_terms));
|
||||
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
|
||||
|
||||
let field_entry = schema.get_field_entry(field);
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(_) | FieldType::Facet(_) => {
|
||||
// populating the (unordered term ord) -> (ordered term ord) mapping
|
||||
// for the field.
|
||||
let unordered_term_ids = term_offsets[byte_offsets.clone()]
|
||||
.iter()
|
||||
.map(|&(_, _, bucket)| bucket);
|
||||
let unordered_term_ids = term_offsets.iter().map(|&(_, _, bucket)| bucket);
|
||||
let mapping: FnvHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
|
||||
.enumerate()
|
||||
.map(|(term_ord, unord_term_id)| {
|
||||
@@ -86,16 +61,10 @@ pub(crate) fn serialize_postings(
|
||||
FieldType::JsonObject(_) => {}
|
||||
}
|
||||
|
||||
let postings_writer = per_field_postings_writers.get_for_field(field);
|
||||
let fieldnorm_reader = fieldnorm_readers.get_field(field)?;
|
||||
let mut field_serializer =
|
||||
serializer.new_field(field, postings_writer.total_num_tokens(), fieldnorm_reader)?;
|
||||
postings_writer.serialize(
|
||||
&term_offsets[byte_offsets],
|
||||
doc_id_map,
|
||||
&ctx,
|
||||
&mut field_serializer,
|
||||
)?;
|
||||
postings_writer.serialize(&term_offsets, doc_id_map, &ctx, &mut field_serializer)?;
|
||||
field_serializer.close()?;
|
||||
}
|
||||
Ok(unordered_term_mappings)
|
||||
@@ -127,6 +96,10 @@ pub(crate) trait PostingsWriter {
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId;
|
||||
|
||||
fn mem_usage(&self) -> usize;
|
||||
|
||||
fn term_map(&self) -> &TermHashMap;
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(
|
||||
@@ -145,8 +118,9 @@ pub(crate) trait PostingsWriter {
|
||||
term_buffer: &mut Term,
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
|
||||
) {
|
||||
let end_of_path_idx = term_buffer.as_slice().len();
|
||||
let end_of_path_idx = term_buffer.value_bytes().len();
|
||||
let mut num_tokens = 0;
|
||||
let mut end_position = 0;
|
||||
token_stream.process(&mut |token: &Token| {
|
||||
@@ -164,9 +138,14 @@ pub(crate) trait PostingsWriter {
|
||||
term_buffer.append_bytes(token.text.as_bytes());
|
||||
let start_position = indexing_position.end_position + token.position as u32;
|
||||
end_position = start_position + token.position_length as u32;
|
||||
self.subscribe(doc_id, start_position, term_buffer, ctx);
|
||||
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
|
||||
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
|
||||
term_id_fast_field_writer.add_val(unordered_term_id);
|
||||
}
|
||||
|
||||
num_tokens += 1;
|
||||
});
|
||||
|
||||
indexing_position.end_position = end_position + POSITION_GAP;
|
||||
indexing_position.num_tokens += num_tokens;
|
||||
term_buffer.truncate(end_of_path_idx);
|
||||
@@ -181,6 +160,7 @@ pub(crate) trait PostingsWriter {
|
||||
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
|
||||
total_num_tokens: u64,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
pub(crate) term_map: TermHashMap,
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> From<SpecializedPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
|
||||
@@ -199,9 +179,10 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
buffer_lender: &mut BufferLender,
|
||||
ctx: &IndexingContext,
|
||||
term_index: &TermHashMap,
|
||||
serializer: &mut FieldSerializer,
|
||||
) -> io::Result<()> {
|
||||
let recorder: Rec = ctx.term_index.read(addr);
|
||||
let recorder: Rec = term_index.read(addr, &ctx.arena_terms);
|
||||
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
|
||||
serializer.new_term(term.value_bytes(), term_doc_freq)?;
|
||||
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
|
||||
@@ -211,6 +192,14 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
|
||||
}
|
||||
|
||||
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
fn mem_usage(&self) -> usize {
|
||||
self.term_map.mem_usage()
|
||||
}
|
||||
|
||||
fn term_map(&self) -> &TermHashMap {
|
||||
&self.term_map
|
||||
}
|
||||
|
||||
fn subscribe(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
@@ -218,25 +207,30 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
term: &Term,
|
||||
ctx: &mut IndexingContext,
|
||||
) -> UnorderedTermId {
|
||||
debug_assert!(term.as_slice().len() >= 4);
|
||||
//debug_assert!(term.value_bytes().len() >= 1);
|
||||
self.total_num_tokens += 1;
|
||||
let (term_index, arena) = (&mut ctx.term_index, &mut ctx.arena);
|
||||
term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option<Rec>| {
|
||||
if let Some(mut recorder) = opt_recorder {
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
recorder.close_doc(arena);
|
||||
let arena = &mut ctx.arena;
|
||||
let arena_terms = &mut ctx.arena_terms;
|
||||
self.term_map.mutate_or_create(
|
||||
term.value_bytes(),
|
||||
arena_terms,
|
||||
|opt_recorder: Option<Rec>| {
|
||||
if let Some(mut recorder) = opt_recorder {
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
recorder.close_doc(arena);
|
||||
recorder.new_doc(doc, arena);
|
||||
}
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::default();
|
||||
recorder.new_doc(doc, arena);
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
}
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
} else {
|
||||
let mut recorder = Rec::default();
|
||||
recorder.new_doc(doc, arena);
|
||||
recorder.record_position(position, arena);
|
||||
recorder
|
||||
}
|
||||
}) as UnorderedTermId
|
||||
},
|
||||
) as UnorderedTermId
|
||||
}
|
||||
|
||||
fn serialize(
|
||||
@@ -248,7 +242,15 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
|
||||
) -> io::Result<()> {
|
||||
let mut buffer_lender = BufferLender::default();
|
||||
for (term, addr, _) in term_addrs {
|
||||
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
|
||||
Self::serialize_one_term(
|
||||
term,
|
||||
*addr,
|
||||
doc_id_map,
|
||||
&mut buffer_lender,
|
||||
ctx,
|
||||
&self.term_map,
|
||||
serializer,
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -46,6 +46,7 @@ impl Addr {
|
||||
}
|
||||
|
||||
/// Returns the `Addr` object for `addr + offset`
|
||||
#[inline]
|
||||
pub fn offset(self, offset: u32) -> Addr {
|
||||
Addr(self.0.wrapping_add(offset))
|
||||
}
|
||||
@@ -54,20 +55,24 @@ impl Addr {
|
||||
Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn page_id(self) -> usize {
|
||||
(self.0 as usize) >> NUM_BITS_PAGE_ADDR
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn page_local_addr(self) -> usize {
|
||||
(self.0 as usize) & (PAGE_SIZE - 1)
|
||||
}
|
||||
|
||||
/// Returns true if and only if the `Addr` is null.
|
||||
#[inline]
|
||||
pub fn is_null(self) -> bool {
|
||||
self.0 == u32::max_value()
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
|
||||
assert_eq!(dest.len(), std::mem::size_of::<Item>());
|
||||
unsafe {
|
||||
@@ -75,6 +80,7 @@ pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn load<Item: Copy + 'static>(data: &[u8]) -> Item {
|
||||
assert_eq!(data.len(), std::mem::size_of::<Item>());
|
||||
unsafe { ptr::read_unaligned(data.as_ptr() as *const Item) }
|
||||
@@ -110,6 +116,7 @@ impl MemoryArena {
|
||||
self.pages.len() * PAGE_SIZE
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn write_at<Item: Copy + 'static>(&mut self, addr: Addr, val: Item) {
|
||||
let dest = self.slice_mut(addr, std::mem::size_of::<Item>());
|
||||
store(dest, val);
|
||||
@@ -120,6 +127,7 @@ impl MemoryArena {
|
||||
/// # Panics
|
||||
///
|
||||
/// If the address is erroneous
|
||||
#[inline]
|
||||
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
|
||||
load(self.slice(addr, mem::size_of::<Item>()))
|
||||
}
|
||||
@@ -128,6 +136,7 @@ impl MemoryArena {
|
||||
self.pages[addr.page_id()].slice(addr.page_local_addr(), len)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn slice_from(&self, addr: Addr) -> &[u8] {
|
||||
self.pages[addr.page_id()].slice_from(addr.page_local_addr())
|
||||
}
|
||||
|
||||
@@ -4,4 +4,4 @@ mod term_hashmap;
|
||||
|
||||
pub(crate) use self::expull::ExpUnrolledLinkedList;
|
||||
pub(crate) use self::memory_arena::{Addr, MemoryArena};
|
||||
pub(crate) use self::term_hashmap::{compute_table_size, TermHashMap};
|
||||
pub(crate) use self::term_hashmap::TermHashMap;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::convert::TryInto;
|
||||
use std::{iter, mem, slice};
|
||||
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
use murmurhash32::murmurhash2;
|
||||
|
||||
use super::{Addr, MemoryArena};
|
||||
@@ -8,13 +8,6 @@ use crate::postings::stacker::memory_arena::store;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::Term;
|
||||
|
||||
/// Returns the actual memory size in bytes
|
||||
/// required to create a table with a given capacity.
|
||||
/// required to create a table of size
|
||||
pub(crate) fn compute_table_size(capacity: usize) -> usize {
|
||||
capacity * mem::size_of::<KeyValue>()
|
||||
}
|
||||
|
||||
/// `KeyValue` is the item stored in the hash table.
|
||||
/// The key is actually a `BytesRef` object stored in an external memory arena.
|
||||
/// The `value_addr` also points to an address in the memory arena.
|
||||
@@ -36,6 +29,7 @@ impl Default for KeyValue {
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
#[inline]
|
||||
fn is_empty(self) -> bool {
|
||||
self.key_value_addr.is_null()
|
||||
}
|
||||
@@ -51,12 +45,17 @@ impl KeyValue {
|
||||
/// or copying the key as long as there is no insert.
|
||||
pub struct TermHashMap {
|
||||
table: Box<[KeyValue]>,
|
||||
memory_arena: MemoryArena,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl Default for TermHashMap {
|
||||
fn default() -> Self {
|
||||
Self::new(1 << 10)
|
||||
}
|
||||
}
|
||||
|
||||
struct QuadraticProbing {
|
||||
hash: usize,
|
||||
i: usize,
|
||||
@@ -75,18 +74,21 @@ impl QuadraticProbing {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Iter<'a> {
|
||||
pub struct Iter<'a, 'm> {
|
||||
hashmap: &'a TermHashMap,
|
||||
memory_arena: &'m MemoryArena,
|
||||
inner: slice::Iter<'a, usize>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = (Term<&'a [u8]>, Addr, UnorderedTermId);
|
||||
impl<'a, 'm> Iterator for Iter<'a, 'm> {
|
||||
type Item = (Term<&'m [u8]>, Addr, UnorderedTermId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||
let (key, offset): (&'m [u8], Addr) = self
|
||||
.hashmap
|
||||
.get_key_value(kv.key_value_addr, self.memory_arena);
|
||||
(Term::wrap(key), offset, kv.unordered_term_id)
|
||||
})
|
||||
}
|
||||
@@ -106,21 +108,19 @@ impl TermHashMap {
|
||||
pub(crate) fn new(table_size: usize) -> TermHashMap {
|
||||
assert!(table_size > 0);
|
||||
let table_size_power_of_2 = compute_previous_power_of_two(table_size);
|
||||
let memory_arena = MemoryArena::new();
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
|
||||
.take(table_size_power_of_2)
|
||||
.collect();
|
||||
TermHashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
memory_arena,
|
||||
mask: table_size_power_of_2 - 1,
|
||||
occupied: Vec::with_capacity(table_size_power_of_2 / 2),
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
|
||||
self.memory_arena.read(addr)
|
||||
pub fn read<Item: Copy + 'static>(&self, addr: Addr, memory_arena: &MemoryArena) -> Item {
|
||||
memory_arena.read(addr)
|
||||
}
|
||||
|
||||
fn probe(&self, hash: u32) -> QuadraticProbing {
|
||||
@@ -129,6 +129,8 @@ impl TermHashMap {
|
||||
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.table.len() * mem::size_of::<KeyValue>()
|
||||
+ self.occupied.len()
|
||||
* std::mem::size_of_val(&self.occupied.get(0).cloned().unwrap_or_default())
|
||||
}
|
||||
|
||||
fn is_saturated(&self) -> bool {
|
||||
@@ -136,16 +138,22 @@ impl TermHashMap {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
|
||||
let data = self.memory_arena.slice_from(addr);
|
||||
let key_bytes_len = NativeEndian::read_u16(data) as usize;
|
||||
let key_bytes: &[u8] = &data[2..][..key_bytes_len];
|
||||
fn get_key_value<'m>(&self, addr: Addr, memory_arena: &'m MemoryArena) -> (&'m [u8], Addr) {
|
||||
let data = memory_arena.slice_from(addr);
|
||||
let (key_bytes_len_enc, data) = data.split_at(2);
|
||||
let key_bytes_len: u16 = u16::from_ne_bytes(key_bytes_len_enc.try_into().unwrap());
|
||||
let key_bytes: &[u8] = &data[..key_bytes_len as usize];
|
||||
(key_bytes, addr.offset(2u32 + key_bytes_len as u32))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
|
||||
let (stored_key, value_addr) = self.get_key_value(addr);
|
||||
fn get_value_addr_if_key_match(
|
||||
&self,
|
||||
target_key: &[u8],
|
||||
addr: Addr,
|
||||
memory_arena: &mut MemoryArena,
|
||||
) -> Option<Addr> {
|
||||
let (stored_key, value_addr) = self.get_key_value(addr, memory_arena);
|
||||
if stored_key == target_key {
|
||||
Some(value_addr)
|
||||
} else {
|
||||
@@ -169,10 +177,11 @@ impl TermHashMap {
|
||||
self.len
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter<'_> {
|
||||
pub fn iter<'a, 'm>(&'a self, memory_arena: &'m MemoryArena) -> Iter<'a, 'm> {
|
||||
Iter {
|
||||
inner: self.occupied.iter(),
|
||||
hashmap: self,
|
||||
memory_arena,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,6 +218,7 @@ impl TermHashMap {
|
||||
pub fn mutate_or_create<V, TMutator>(
|
||||
&mut self,
|
||||
key: &[u8],
|
||||
memory_arena: &mut MemoryArena,
|
||||
mut updater: TMutator,
|
||||
) -> UnorderedTermId
|
||||
where
|
||||
@@ -219,28 +229,33 @@ impl TermHashMap {
|
||||
self.resize();
|
||||
}
|
||||
let hash = murmurhash2(key);
|
||||
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
|
||||
if kv.is_empty() {
|
||||
// The key does not exist yet.
|
||||
let val = updater(None);
|
||||
let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>();
|
||||
let key_addr = self.memory_arena.allocate_space(num_bytes);
|
||||
let key_addr = memory_arena.allocate_space(num_bytes);
|
||||
{
|
||||
let data = self.memory_arena.slice_mut(key_addr, num_bytes);
|
||||
NativeEndian::write_u16(data, key.len() as u16);
|
||||
let stop = 2 + key.len();
|
||||
data[2..stop].copy_from_slice(key);
|
||||
let data = memory_arena.slice_mut(key_addr, num_bytes);
|
||||
let (key_len, data) = data.split_at_mut(2);
|
||||
key_len.copy_from_slice(&(key.len() as u16).to_le_bytes());
|
||||
let stop = key.len();
|
||||
data[..key.len()].copy_from_slice(key);
|
||||
store(&mut data[stop..], val);
|
||||
}
|
||||
return self.set_bucket(hash, key_addr, bucket);
|
||||
} else if kv.hash == hash {
|
||||
if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) {
|
||||
let v = self.memory_arena.read(val_addr);
|
||||
if let Some(val_addr) =
|
||||
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
|
||||
{
|
||||
let v = memory_arena.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
self.memory_arena.write_at(val_addr, new_v);
|
||||
memory_arena.write_at(val_addr, new_v);
|
||||
return kv.unordered_term_id;
|
||||
}
|
||||
}
|
||||
@@ -254,26 +269,28 @@ mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
use super::{compute_previous_power_of_two, TermHashMap};
|
||||
use crate::postings::stacker::MemoryArena;
|
||||
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let mut arena = MemoryArena::new();
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(1 << 18);
|
||||
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
|
||||
hash_map.mutate_or_create(b"abc", &mut arena, |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
3u32
|
||||
});
|
||||
hash_map.mutate_or_create(b"abcd", |opt_val: Option<u32>| {
|
||||
hash_map.mutate_or_create(b"abcd", &mut arena, |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, None);
|
||||
4u32
|
||||
});
|
||||
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
|
||||
hash_map.mutate_or_create(b"abc", &mut arena, |opt_val: Option<u32>| {
|
||||
assert_eq!(opt_val, Some(3u32));
|
||||
5u32
|
||||
});
|
||||
let mut vanilla_hash_map = HashMap::new();
|
||||
let iter_values = hash_map.iter();
|
||||
let iter_values = hash_map.iter(&arena);
|
||||
for (key, addr, _) in iter_values {
|
||||
let val: u32 = hash_map.memory_arena.read(addr);
|
||||
let val: u32 = arena.read(addr);
|
||||
vanilla_hash_map.insert(key.to_owned(), val);
|
||||
}
|
||||
assert_eq!(vanilla_hash_map.len(), 2);
|
||||
|
||||
@@ -244,12 +244,12 @@ impl MoreLikeThis {
|
||||
FieldType::Date(_) => {
|
||||
for value in values {
|
||||
// TODO: Ask if this is the semantic (timestamp) we want
|
||||
let val = value
|
||||
let unix_timestamp = value
|
||||
.as_date()
|
||||
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
|
||||
.timestamp();
|
||||
if !self.is_noise_word(val.to_string()) {
|
||||
let term = Term::from_field_i64(field, val);
|
||||
.into_unix_timestamp();
|
||||
if !self.is_noise_word(unix_timestamp.to_string()) {
|
||||
let term = Term::from_field_i64(field, unix_timestamp);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -126,9 +126,7 @@ pub mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};
|
||||
let no_positions = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_tokenizer("default")
|
||||
.set_index_option(IndexRecordOption::WithFreqs),
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
);
|
||||
|
||||
let text_field = schema_builder.add_text_field("text", no_positions);
|
||||
|
||||
@@ -15,15 +15,17 @@ use crate::query::{
|
||||
use crate::schema::{
|
||||
Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term, Type,
|
||||
};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::{OffsetDateTime, UtcOffset};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::Score;
|
||||
use crate::{DateTime, Score};
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
#[derive(Debug, PartialEq, Eq, Error)]
|
||||
pub enum QueryParserError {
|
||||
/// Error in the query syntax
|
||||
#[error("Syntax Error")]
|
||||
SyntaxError,
|
||||
#[error("Syntax Error: {0}")]
|
||||
SyntaxError(String),
|
||||
/// This query is unsupported.
|
||||
#[error("Unsupported query: {0}")]
|
||||
UnsupportedQuery(String),
|
||||
@@ -59,15 +61,20 @@ pub enum QueryParserError {
|
||||
FieldDoesNotHavePositionsIndexed(String),
|
||||
/// The tokenizer for the given field is unknown
|
||||
/// The two argument strings are the name of the field, the name of the tokenizer
|
||||
#[error("The tokenizer '{0:?}' for the field '{1:?}' is unknown")]
|
||||
UnknownTokenizer(String, String),
|
||||
#[error("The tokenizer '{tokenizer:?}' for the field '{field:?}' is unknown")]
|
||||
UnknownTokenizer {
|
||||
/// The name of the tokenizer
|
||||
tokenizer: String,
|
||||
/// The field name
|
||||
field: String,
|
||||
},
|
||||
/// The query contains a range query with a phrase as one of the bounds.
|
||||
/// Only terms can be used as bounds.
|
||||
#[error("A range query cannot have a phrase as one of the bounds")]
|
||||
RangeMustNotHavePhrase,
|
||||
/// The format for the date field is not RFC 3339 compliant.
|
||||
#[error("The date field has an invalid format")]
|
||||
DateFormatError(#[from] chrono::ParseError),
|
||||
DateFormatError(#[from] time::error::Parse),
|
||||
/// The format for the facet field is invalid.
|
||||
#[error("The facet field is malformed: {0}")]
|
||||
FacetFormatError(#[from] FacetParseError),
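Since `SyntaxError` now carries the offending query string and `UnknownTokenizer` became a struct variant, call sites that matched on these errors need updating. A hypothetical sketch of such a call site (the helper name and messages are ours):

```rust
use tantivy::query::QueryParserError;

fn describe(err: &QueryParserError) -> String {
    match err {
        QueryParserError::SyntaxError(query) => format!("could not parse: {query}"),
        QueryParserError::UnknownTokenizer { tokenizer, field } => {
            format!("tokenizer '{tokenizer}' is not registered for field '{field}'")
        }
        other => format!("{other:?}"),
    }
}

fn main() {
    let err = QueryParserError::SyntaxError("title:(".to_string());
    println!("{}", describe(&err));
}
```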
|
||||
@@ -266,8 +273,8 @@ impl QueryParser {
|
||||
|
||||
/// Parse the user query into an AST.
|
||||
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAst, QueryParserError> {
|
||||
let user_input_ast =
|
||||
tantivy_query_grammar::parse_query(query).map_err(|_| QueryParserError::SyntaxError)?;
|
||||
let user_input_ast = tantivy_query_grammar::parse_query(query)
|
||||
.map_err(|_| QueryParserError::SyntaxError(query.to_string()))?;
|
||||
self.compute_logical_ast(user_input_ast)
|
||||
}
|
||||
|
||||
@@ -326,11 +333,8 @@ impl QueryParser {
|
||||
Ok(Term::from_field_f64(field, val))
|
||||
}
|
||||
FieldType::Date(_) => {
|
||||
let dt = chrono::DateTime::parse_from_rfc3339(phrase)?;
|
||||
Ok(Term::from_field_date(
|
||||
field,
|
||||
&dt.with_timezone(&chrono::Utc),
|
||||
))
|
||||
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
|
||||
Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
let option = str_options.get_indexing_options().ok_or_else(|| {
|
||||
@@ -340,11 +344,9 @@ impl QueryParser {
|
||||
let text_analyzer =
|
||||
self.tokenizer_manager
|
||||
.get(option.tokenizer())
|
||||
.ok_or_else(|| {
|
||||
QueryParserError::UnknownTokenizer(
|
||||
field_entry.name().to_string(),
|
||||
option.tokenizer().to_string(),
|
||||
)
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_entry.name().to_string(),
|
||||
tokenizer: option.tokenizer().to_string(),
|
||||
})?;
|
||||
let mut terms: Vec<Term> = Vec::new();
|
||||
let mut token_stream = text_analyzer.token_stream(phrase);
|
||||
@@ -405,8 +407,8 @@ impl QueryParser {
|
||||
Ok(vec![LogicalLiteral::Term(f64_term)])
|
||||
}
|
||||
FieldType::Date(_) => {
|
||||
let dt = chrono::DateTime::parse_from_rfc3339(phrase)?;
|
||||
let dt_term = Term::from_field_date(field, &dt.with_timezone(&chrono::Utc));
|
||||
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
|
||||
let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
|
||||
Ok(vec![LogicalLiteral::Term(dt_term)])
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
@@ -417,11 +419,9 @@ impl QueryParser {
|
||||
let text_analyzer =
|
||||
self.tokenizer_manager
|
||||
.get(option.tokenizer())
|
||||
.ok_or_else(|| {
|
||||
QueryParserError::UnknownTokenizer(
|
||||
field_name.to_string(),
|
||||
option.tokenizer().to_string(),
|
||||
)
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_name.to_string(),
|
||||
tokenizer: option.tokenizer().to_string(),
|
||||
})?;
|
||||
let index_record_option = option.index_option();
|
||||
Ok(generate_literals_for_str(
|
||||
@@ -442,11 +442,9 @@ impl QueryParser {
|
||||
let text_analyzer =
|
||||
self.tokenizer_manager
|
||||
.get(option.tokenizer())
|
||||
.ok_or_else(|| {
|
||||
QueryParserError::UnknownTokenizer(
|
||||
field_name.to_string(),
|
||||
option.tokenizer().to_string(),
|
||||
)
|
||||
.ok_or_else(|| QueryParserError::UnknownTokenizer {
|
||||
field: field_name.to_string(),
|
||||
tokenizer: option.tokenizer().to_string(),
|
||||
})?;
|
||||
let index_record_option = option.index_option();
|
||||
generate_literals_for_json_object(
|
||||
@@ -549,7 +547,7 @@ impl QueryParser {
|
||||
.map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str()))
|
||||
.collect();
|
||||
if triplets.is_empty() {
|
||||
return Err(QueryParserError::FieldDoesNotExist(field_name.to_string()));
|
||||
return Err(QueryParserError::FieldDoesNotExist(full_path.to_string()));
|
||||
}
|
||||
Ok(triplets)
|
||||
}
|
||||
@@ -666,12 +664,12 @@ enum NumValue {
|
||||
U64(u64),
|
||||
I64(i64),
|
||||
F64(f64),
|
||||
DateTime(crate::DateTime),
|
||||
DateTime(OffsetDateTime),
|
||||
}
|
||||
|
||||
fn infer_type_num(phrase: &str) -> Option<NumValue> {
|
||||
if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(phrase) {
|
||||
let dt_utc = dt.with_timezone(&chrono::Utc);
|
||||
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
|
||||
let dt_utc = dt.to_offset(UtcOffset::UTC);
|
||||
return Some(NumValue::DateTime(dt_utc));
|
||||
}
|
||||
if let Ok(u64_val) = str::parse::<u64>(phrase) {
|
||||
@@ -713,14 +711,14 @@ fn generate_literals_for_json_object(
|
||||
json_term_writer.set_fast_value(f64_val);
|
||||
}
|
||||
NumValue::DateTime(dt_val) => {
|
||||
json_term_writer.set_fast_value(dt_val);
|
||||
json_term_writer.set_fast_value(DateTime::from_utc(dt_val));
|
||||
}
|
||||
}
|
||||
logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone()));
|
||||
}
|
||||
json_term_writer.close_path_and_set_type(Type::Str);
|
||||
drop(json_term_writer);
|
||||
let term_num_bytes = term.as_slice().len();
|
||||
let term_num_bytes = term.value_bytes().len();
|
||||
let mut token_stream = text_analyzer.token_stream(phrase);
|
||||
let mut terms: Vec<(usize, Term)> = Vec::new();
|
||||
token_stream.process(&mut |token| {
|
||||
@@ -1040,6 +1038,7 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_json_field_possibly_a_date() {
|
||||
// Subseconds are discarded
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
r#"json.date:"2019-10-12T07:20:50.52Z""#,
|
||||
r#"(Term(type=Json, field=14, path=date, vtype=Date, 2019-10-12T07:20:50Z) "[(0, Term(type=Json, field=14, path=date, vtype=Str, "2019")), (1, Term(type=Json, field=14, path=date, vtype=Str, "10")), (2, Term(type=Json, field=14, path=date, vtype=Str, "12t07")), (3, Term(type=Json, field=14, path=date, vtype=Str, "20")), (4, Term(type=Json, field=14, path=date, vtype=Str, "50")), (5, Term(type=Json, field=14, path=date, vtype=Str, "52z"))]")"#,
|
||||
@@ -1221,9 +1220,11 @@ mod test {
|
||||
#[test]
|
||||
pub fn test_query_parser_field_does_not_exist() {
|
||||
let query_parser = make_query_parser();
|
||||
assert_matches!(
|
||||
query_parser.parse_query("boujou:\"18446744073709551615\""),
|
||||
Err(QueryParserError::FieldDoesNotExist(_))
|
||||
assert_eq!(
|
||||
query_parser
|
||||
.parse_query("boujou:\"18446744073709551615\"")
|
||||
.unwrap_err(),
|
||||
QueryParserError::FieldDoesNotExist("boujou".to_string())
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1248,9 +1249,10 @@ mod test {
|
||||
let default_fields = vec![title];
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
|
||||
|
||||
assert_matches!(
|
||||
query_parser.parse_query("title:\"happy tax payer\""),
|
||||
Err(QueryParserError::UnknownTokenizer(_, _))
|
||||
Err(QueryParserError::UnknownTokenizer { .. })
|
||||
);
|
||||
}
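Stepping back from the individual hunks above: date literals in queries are now parsed with `time`'s RFC 3339 parser and wrapped into the new `DateTime` before building a term. A standalone sketch of that path (the field id is illustrative):

```rust
use tantivy::schema::{Field, Term};
use tantivy::time::format_description::well_known::Rfc3339;
use tantivy::time::OffsetDateTime;
use tantivy::DateTime;

fn main() {
    let field = Field::from_field_id(0);
    // Sub-second precision and the offset are dropped when converting to `DateTime`.
    let dt = OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap();
    let _term = Term::from_field_date(field, DateTime::from_utc(dt));
}
```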
|
||||
|
||||
|
||||
@@ -125,7 +125,6 @@ impl Scorer for TermScorer {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::executor::block_on;
|
||||
use proptest::prelude::*;
|
||||
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
@@ -321,9 +320,7 @@ mod tests {
|
||||
.collect();
|
||||
test_block_wand_aux(&term_query, &searcher)?;
|
||||
}
|
||||
{
|
||||
let _ = block_on(writer.merge(&segment_ids[..]));
|
||||
}
|
||||
writer.merge(&segment_ids[..]).wait().unwrap();
|
||||
{
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
@@ -110,7 +110,7 @@ impl Document {
|
||||
self.add_field_value(field, value);
|
||||
}
|
||||
|
||||
/// Add a date field
|
||||
/// Add a date field with unspecified time zone offset
|
||||
pub fn add_date(&mut self, field: Field, value: DateTime) {
|
||||
self.add_field_value(field, value);
|
||||
}
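A small sketch of adding a date value with the updated signature (the field name is illustrative; values are stored with second precision and interpreted as UTC):

```rust
use tantivy::schema::{Schema, INDEXED};
use tantivy::time::OffsetDateTime;
use tantivy::{DateTime, Document};

fn main() {
    let mut schema_builder = Schema::builder();
    let published = schema_builder.add_date_field("published", INDEXED);
    let _schema = schema_builder.build();

    let mut doc = Document::default();
    doc.add_date(published, DateTime::from_utc(OffsetDateTime::now_utc()));
}
```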
|
||||
|
||||
@@ -93,13 +93,7 @@ impl FieldEntry {
|
||||
|
||||
/// Returns true if the field is a fast field
|
||||
pub fn is_fast(&self) -> bool {
|
||||
match self.field_type {
|
||||
FieldType::U64(ref options)
|
||||
| FieldType::I64(ref options)
|
||||
| FieldType::Date(ref options)
|
||||
| FieldType::F64(ref options) => options.is_fast(),
|
||||
_ => false,
|
||||
}
|
||||
self.field_type.is_fast()
|
||||
}
|
||||
|
||||
/// Returns true if the field is stored
|
||||
@@ -144,7 +138,8 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false
|
||||
"stored": false,
|
||||
"fast": false
|
||||
}
|
||||
}"#;
|
||||
let field_value_json = serde_json::to_string_pretty(&field_value).unwrap();
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use chrono::{FixedOffset, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value as JsonValue;
|
||||
use thiserror::Error;
|
||||
@@ -9,7 +8,10 @@ use crate::schema::{
|
||||
Facet, IndexRecordOption, JsonObjectOptions, NumericOptions, TextFieldIndexing, TextOptions,
|
||||
Value,
|
||||
};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::tokenizer::PreTokenizedString;
|
||||
use crate::DateTime;
|
||||
|
||||
/// Possible error that may occur while parsing a field value
|
||||
/// At this point the JSON is known to be valid.
|
||||
@@ -183,7 +185,21 @@ impl FieldType {
|
||||
}
|
||||
}
|
||||
|
||||
/// returns true if the field is normed.
|
||||
/// returns true if the field is fast.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
match *self {
|
||||
FieldType::Bytes(ref bytes_options) => bytes_options.is_fast(),
|
||||
FieldType::Str(ref text_options) => text_options.is_fast(),
|
||||
FieldType::U64(ref int_options)
|
||||
| FieldType::I64(ref int_options)
|
||||
| FieldType::F64(ref int_options)
|
||||
| FieldType::Date(ref int_options) => int_options.get_fastfield_cardinality().is_some(),
|
||||
FieldType::Facet(_) => true,
|
||||
FieldType::JsonObject(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
|
||||
pub fn has_fieldnorms(&self) -> bool {
|
||||
match *self {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
@@ -244,33 +260,33 @@ impl FieldType {
|
||||
/// target field is a `Str`, this method will return an Error.
|
||||
pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> {
|
||||
match json {
|
||||
JsonValue::String(field_text) => match *self {
|
||||
FieldType::Date(_) => {
|
||||
let dt_with_fixed_tz: chrono::DateTime<FixedOffset> =
|
||||
chrono::DateTime::parse_from_rfc3339(&field_text).map_err(|_err| {
|
||||
ValueParsingError::TypeError {
|
||||
JsonValue::String(field_text) => {
|
||||
match *self {
|
||||
FieldType::Date(_) => {
|
||||
let dt_with_fixed_tz = OffsetDateTime::parse(&field_text, &Rfc3339)
|
||||
.map_err(|_err| ValueParsingError::TypeError {
|
||||
expected: "rfc3339 format",
|
||||
json: JsonValue::String(field_text),
|
||||
}
|
||||
})?;
|
||||
Ok(Value::Date(dt_with_fixed_tz.with_timezone(&Utc)))
|
||||
}
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text)),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "an integer",
|
||||
})?;
|
||||
Ok(DateTime::from_utc(dt_with_fixed_tz).into())
|
||||
}
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text)),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
|
||||
Err(ValueParsingError::TypeError {
|
||||
expected: "an integer",
|
||||
json: JsonValue::String(field_text),
|
||||
})
|
||||
}
|
||||
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
|
||||
FieldType::Bytes(_) => base64::decode(&field_text)
|
||||
.map(Value::Bytes)
|
||||
.map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
|
||||
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
|
||||
expected: "a json object",
|
||||
json: JsonValue::String(field_text),
|
||||
})
|
||||
}),
|
||||
}
|
||||
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
|
||||
FieldType::Bytes(_) => base64::decode(&field_text)
|
||||
.map(Value::Bytes)
|
||||
.map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
|
||||
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
|
||||
expected: "a json object",
|
||||
json: JsonValue::String(field_text),
|
||||
}),
|
||||
},
|
||||
}
|
||||
JsonValue::Number(field_val_num) => match self {
|
||||
FieldType::I64(_) | FieldType::Date(_) => {
|
||||
if let Some(field_val_i64) = field_val_num.as_i64() {
|
||||
@@ -342,12 +358,12 @@ impl FieldType {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use chrono::{NaiveDate, NaiveDateTime, NaiveTime, Utc};
|
||||
use serde_json::json;
|
||||
|
||||
use super::FieldType;
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
use crate::schema::{Schema, TextOptions, Type, Value, INDEXED};
|
||||
use crate::time::{Date, Month, PrimitiveDateTime, Time};
|
||||
use crate::tokenizer::{PreTokenizedString, Token};
|
||||
use crate::{DateTime, Document};
|
||||
|
||||
@@ -359,7 +375,8 @@ mod tests {
|
||||
let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#;
|
||||
let doc = schema.parse_document(doc_json).unwrap();
|
||||
let date = doc.get_first(date_field).unwrap();
|
||||
assert_eq!(format!("{:?}", date), "Date(2019-10-12T05:20:50.520Z)");
|
||||
// Time zone is converted to UTC and subseconds are discarded
|
||||
assert_eq!("Date(2019-10-12T05:20:50Z)", format!("{:?}", date));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -368,12 +385,12 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let date_field = schema_builder.add_date_field("date", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let naive_date = NaiveDate::from_ymd(1982, 9, 17);
|
||||
let naive_time = NaiveTime::from_hms(13, 20, 00);
|
||||
let date_time = DateTime::from_utc(NaiveDateTime::new(naive_date, naive_time), Utc);
|
||||
doc.add_date(date_field, date_time);
|
||||
let naive_date = Date::from_calendar_date(1982, Month::September, 17).unwrap();
|
||||
let naive_time = Time::from_hms(13, 20, 0).unwrap();
|
||||
let date_time = PrimitiveDateTime::new(naive_date, naive_time);
|
||||
doc.add_date(date_field, DateTime::from_primitive(date_time));
|
||||
let doc_json = schema.to_json(&doc);
|
||||
assert_eq!(doc_json, r#"{"date":["1982-09-17T13:20:00+00:00"]}"#);
|
||||
assert_eq!(doc_json, r#"{"date":["1982-09-17T13:20:00Z"]}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -29,6 +29,12 @@ pub enum IndexRecordOption {
|
||||
WithFreqsAndPositions,
|
||||
}
|
||||
|
||||
impl Default for IndexRecordOption {
|
||||
fn default() -> Self {
|
||||
IndexRecordOption::Basic
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexRecordOption {
|
||||
/// Returns true if this option includes encoding
|
||||
/// term frequencies.
|
||||
|
||||
@@ -34,6 +34,20 @@ impl JsonObjectOptions {
|
||||
pub fn get_text_indexing_options(&self) -> Option<&TextFieldIndexing> {
|
||||
self.indexing.as_ref()
|
||||
}
|
||||
|
||||
/// Sets the field as stored
|
||||
#[must_use]
|
||||
pub fn set_stored(mut self) -> Self {
|
||||
self.stored = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the field as indexed, with the specific indexing options.
|
||||
#[must_use]
|
||||
pub fn set_indexing_options(mut self, indexing: TextFieldIndexing) -> Self {
|
||||
self.indexing = Some(indexing);
|
||||
self
|
||||
}
|
||||
}
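A sketch of configuring a JSON field with the new setters (this assumes `JsonObjectOptions` implements `Default`; the field name is illustrative):

```rust
use tantivy::schema::{JsonObjectOptions, Schema, TextFieldIndexing};

fn main() {
    let json_options = JsonObjectOptions::default()
        .set_stored()
        .set_indexing_options(TextFieldIndexing::default());

    let mut schema_builder = Schema::builder();
    let _attributes = schema_builder.add_json_field("attributes", json_options);
    let _schema = schema_builder.build();
}
```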
|
||||
|
||||
impl From<StoredFlag> for JsonObjectOptions {
|
||||
|
||||
@@ -417,6 +417,7 @@ mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use matches::{assert_matches, matches};
|
||||
use pretty_assertions::assert_eq;
|
||||
use serde_json;
|
||||
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
@@ -469,7 +470,8 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false
|
||||
"stored": false,
|
||||
"fast": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -481,7 +483,8 @@ mod tests {
|
||||
"fieldnorms": false,
|
||||
"tokenizer": "raw"
|
||||
},
|
||||
"stored": false
|
||||
"stored": false,
|
||||
"fast": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -784,7 +787,8 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false
|
||||
"stored": false,
|
||||
"fast": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -816,7 +820,8 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "raw"
|
||||
},
|
||||
"stored": true
|
||||
"stored": true,
|
||||
"fast": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -838,7 +843,8 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false
|
||||
"stored": false,
|
||||
"fast": false
|
||||
}
|
||||
},
|
||||
{
|
||||
|
||||
@@ -17,7 +17,7 @@ use crate::DateTime;
|
||||
///
|
||||
/// - <value> is, if this is not the json term, a binary representation specific to the type.
|
||||
/// If it is a JSON Term, then it is prepended with the path that leads to this leaf value.
|
||||
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
|
||||
const FAST_VALUE_TERM_LEN: usize = 8;
|
||||
|
||||
/// Separates the different segments of
|
||||
/// the json path.
|
||||
@@ -33,22 +33,33 @@ pub const JSON_END_OF_PATH: u8 = 0u8;
|
||||
///
|
||||
/// It actually wraps a `Vec<u8>`.
|
||||
#[derive(Clone)]
|
||||
pub struct Term<B = Vec<u8>>(B)
|
||||
where B: AsRef<[u8]>;
|
||||
pub struct Term<B = Vec<u8>> {
|
||||
data: B,
|
||||
field: Field,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
impl AsMut<Vec<u8>> for Term {
|
||||
fn as_mut(&mut self) -> &mut Vec<u8> {
|
||||
&mut self.0
|
||||
&mut self.data
|
||||
}
|
||||
}
|
||||
|
||||
impl Term {
|
||||
pub(crate) fn new() -> Term {
|
||||
Term(Vec::with_capacity(100))
|
||||
Self::with_capacity(32)
|
||||
}
|
||||
|
||||
pub(crate) fn with_capacity(cap: usize) -> Term {
|
||||
Term {
|
||||
data: Vec::with_capacity(cap),
|
||||
field: Field::from_field_id(0),
|
||||
field_type: Type::Str,
|
||||
}
|
||||
}
|
||||
|
||||
fn from_fast_value<T: FastValue>(field: Field, val: &T) -> Term {
|
||||
let mut term = Term(vec![0u8; FAST_VALUE_TERM_LEN]);
|
||||
let mut term = Term::with_capacity(FAST_VALUE_TERM_LEN);
|
||||
term.set_field(T::to_type(), field);
|
||||
term.set_u64(val.to_u64());
|
||||
term
|
||||
@@ -70,8 +81,8 @@ impl Term {
|
||||
}
|
||||
|
||||
/// Builds a term given a field, and a DateTime value
|
||||
pub fn from_field_date(field: Field, val: &DateTime) -> Term {
|
||||
Term::from_fast_value(field, val)
|
||||
pub fn from_field_date(field: Field, val: DateTime) -> Term {
|
||||
Term::from_fast_value(field, &val)
|
||||
}
|
||||
|
||||
/// Creates a `Term` given a facet.
|
||||
@@ -86,9 +97,9 @@ impl Term {
|
||||
}
|
||||
|
||||
fn create_bytes_term(typ: Type, field: Field, bytes: &[u8]) -> Term {
|
||||
let mut term = Term(vec![0u8; 5 + bytes.len()]);
|
||||
let mut term = Term::with_capacity(bytes.len());
|
||||
term.set_field(typ, field);
|
||||
term.0.extend_from_slice(bytes);
|
||||
term.data.extend_from_slice(bytes);
|
||||
term
|
||||
}
|
||||
|
||||
@@ -98,10 +109,9 @@ impl Term {
|
||||
}
|
||||
|
||||
pub(crate) fn set_field(&mut self, typ: Type, field: Field) {
|
||||
self.0.clear();
|
||||
self.0
|
||||
.extend_from_slice(field.field_id().to_be_bytes().as_ref());
|
||||
self.0.push(typ.to_code());
|
||||
self.field = field;
|
||||
self.field_type = typ;
|
||||
self.data.clear();
|
||||
}
|
||||
|
||||
/// Sets a u64 value in the term.
|
||||
@@ -112,11 +122,9 @@ impl Term {
|
||||
/// the natural order of the values.
|
||||
pub fn set_u64(&mut self, val: u64) {
|
||||
self.set_fast_value(val);
|
||||
self.set_bytes(val.to_be_bytes().as_ref());
|
||||
}
|
||||
|
||||
fn set_fast_value<T: FastValue>(&mut self, val: T) {
|
||||
self.0.resize(FAST_VALUE_TERM_LEN, 0u8);
|
||||
self.set_bytes(val.to_u64().to_be_bytes().as_ref());
|
||||
}
|
||||
|
||||
@@ -126,7 +134,7 @@ impl Term {
|
||||
}
|
||||
|
||||
/// Sets a `DateTime` value in the term.
|
||||
pub fn set_date(&mut self, date: crate::DateTime) {
|
||||
pub fn set_date(&mut self, date: DateTime) {
|
||||
self.set_fast_value(date);
|
||||
}
|
||||
|
||||
@@ -137,8 +145,8 @@ impl Term {
|
||||
|
||||
/// Sets the value of a `Bytes` field.
|
||||
pub fn set_bytes(&mut self, bytes: &[u8]) {
|
||||
self.0.resize(5, 0u8);
|
||||
self.0.extend(bytes);
|
||||
self.data.clear();
|
||||
self.data.extend(bytes);
|
||||
}
|
||||
|
||||
/// Set the texts only, keeping the field untouched.
|
||||
@@ -148,18 +156,18 @@ impl Term {
|
||||
|
||||
/// Removes the value bytes and sets the type code.
|
||||
pub fn clear_with_type(&mut self, typ: Type) {
|
||||
self.truncate(5);
|
||||
self.0[4] = typ.to_code();
|
||||
self.data.clear();
|
||||
self.field_type = typ;
|
||||
}
|
||||
|
||||
/// Truncates the value bytes of the term to `len` bytes.
|
||||
pub fn truncate(&mut self, len: usize) {
|
||||
self.0.truncate(len);
|
||||
self.data.truncate(len);
|
||||
}
|
||||
|
||||
/// Appends bytes to the value of the term.
|
||||
pub fn append_bytes(&mut self, bytes: &[u8]) {
|
||||
self.0.extend_from_slice(bytes);
|
||||
self.data.extend_from_slice(bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -167,7 +175,7 @@ impl<B> Ord for Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
{
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
self.as_slice().cmp(other.as_slice())
|
||||
self.value_bytes().cmp(other.value_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -183,7 +191,7 @@ impl<B> PartialEq for Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
{
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.as_slice() == other.as_slice()
|
||||
self.value_bytes() == other.value_bytes()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -193,7 +201,7 @@ impl<B> Hash for Term<B>
|
||||
where B: AsRef<[u8]>
|
||||
{
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
self.0.as_ref().hash(state)
|
||||
self.data.as_ref().hash(state)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -202,14 +210,15 @@ where B: AsRef<[u8]>
|
||||
{
|
||||
/// Wraps a object holding bytes
|
||||
pub fn wrap(data: B) -> Term<B> {
|
||||
Term(data)
|
||||
Term {
|
||||
data,
|
||||
field: Field::from_field_id(0),
|
||||
field_type: Type::Str,
|
||||
}
|
||||
}
|
||||
|
||||
fn typ_code(&self) -> u8 {
|
||||
*self
|
||||
.as_slice()
|
||||
.get(4)
|
||||
.expect("the byte representation is too short")
|
||||
self.field_type as u8
|
||||
}
|
||||
|
||||
/// Return the type of the term.
|
||||
@@ -219,55 +228,7 @@ where B: AsRef<[u8]>
|
||||
|
||||
/// Returns the field.
|
||||
pub fn field(&self) -> Field {
|
||||
let mut field_id_bytes = [0u8; 4];
|
||||
field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]);
|
||||
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
|
||||
}
|
||||
|
||||
/// Returns the `u64` value stored in a term.
|
||||
///
|
||||
/// Returns None if the term is not of the u64 type, or if the term byte representation
|
||||
/// is invalid.
|
||||
pub fn as_u64(&self) -> Option<u64> {
|
||||
self.get_fast_type::<u64>()
|
||||
}
|
||||
|
||||
fn get_fast_type<T: FastValue>(&self) -> Option<T> {
|
||||
if self.typ() != T::to_type() {
|
||||
return None;
|
||||
}
|
||||
let mut value_bytes = [0u8; 8];
|
||||
let bytes = self.value_bytes();
|
||||
if bytes.len() != 8 {
|
||||
return None;
|
||||
}
|
||||
value_bytes.copy_from_slice(self.value_bytes());
|
||||
let value_u64 = u64::from_be_bytes(value_bytes);
|
||||
Some(FastValue::from_u64(value_u64))
|
||||
}
|
||||
|
||||
/// Returns the `i64` value stored in a term.
|
||||
///
|
||||
/// Returns None if the term is not of the i64 type, or if the term byte representation
|
||||
/// is invalid.
|
||||
pub fn as_i64(&self) -> Option<i64> {
|
||||
self.get_fast_type::<i64>()
|
||||
}
|
||||
|
||||
/// Returns the `f64` value stored in a term.
|
||||
///
|
||||
/// Returns None if the term is not of the f64 type, or if the term byte representation
|
||||
/// is invalid.
|
||||
pub fn as_f64(&self) -> Option<f64> {
|
||||
self.get_fast_type::<f64>()
|
||||
}
|
||||
|
||||
/// Returns the `Date` value stored in a term.
|
||||
///
|
||||
/// Returns None if the term is not of the Date type, or if the term byte representation
|
||||
/// is invalid.
|
||||
pub fn as_date(&self) -> Option<crate::DateTime> {
|
||||
self.get_fast_type::<crate::DateTime>()
|
||||
self.field
|
||||
}
|
||||
|
||||
/// Returns the text associated with the term.
|
||||
@@ -275,43 +236,12 @@ where B: AsRef<[u8]>
|
||||
/// Returns None if the field is not of string type
|
||||
/// or if the bytes are not valid utf-8.
|
||||
pub fn as_str(&self) -> Option<&str> {
|
||||
if self.as_slice().len() < 5 {
|
||||
return None;
|
||||
}
|
||||
if self.typ() != Type::Str {
|
||||
return None;
|
||||
}
|
||||
str::from_utf8(self.value_bytes()).ok()
|
||||
}
|
||||
|
||||
/// Returns the facet associated with the term.
|
||||
///
|
||||
/// Returns None if the field is not of facet type
|
||||
/// or if the bytes are not valid utf-8.
|
||||
pub fn as_facet(&self) -> Option<Facet> {
|
||||
if self.as_slice().len() < 5 {
|
||||
return None;
|
||||
}
|
||||
if self.typ() != Type::Facet {
|
||||
return None;
|
||||
}
|
||||
let facet_encode_str = str::from_utf8(self.value_bytes()).ok()?;
|
||||
Some(Facet::from_encoded_string(facet_encode_str.to_string()))
|
||||
}
|
||||
|
||||
/// Returns the bytes associated with the term.
|
||||
///
|
||||
/// Returns None if the field is not of bytes type.
|
||||
pub fn as_bytes(&self) -> Option<&[u8]> {
|
||||
if self.as_slice().len() < 5 {
|
||||
return None;
|
||||
}
|
||||
if self.typ() != Type::Bytes {
|
||||
return None;
|
||||
}
|
||||
Some(self.value_bytes())
|
||||
}
|
||||
|
||||
/// Returns the serialized value of the term.
|
||||
/// (this does not include the field.)
|
||||
///
|
||||
@@ -319,15 +249,7 @@ where B: AsRef<[u8]>
|
||||
/// If the term is a u64, its value is encoded in big-endian
/// byte order, so that the natural order of values is preserved.
|
||||
pub fn value_bytes(&self) -> &[u8] {
|
||||
&self.0.as_ref()[5..]
|
||||
}
|
||||
|
||||
/// Returns the underlying `&[u8]`.
|
||||
///
|
||||
/// Do NOT rely on this byte representation in the index.
|
||||
/// This value is likely to change in the future.
|
||||
pub(crate) fn as_slice(&self) -> &[u8] {
|
||||
self.0.as_ref()
|
||||
&self.data.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
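For orientation, the accessors above all lean on the same term layout: a 4-byte big-endian field id, a 1-byte type code, and, for fast values, an 8-byte big-endian payload, which is why `value_bytes()` slices at offset 5 and the length checks reject anything shorter. A minimal standalone sketch of that layout, inferred from the accessors rather than copied from this branch:

```rust
// Standalone illustration of the 5-byte header + big-endian payload layout
// suggested by the accessors above (4-byte field id, 1-byte type code, 8-byte value).
fn pack_term(field_id: u32, type_code: u8, value: u64) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(13);
    bytes.extend_from_slice(&field_id.to_be_bytes()); // 4 bytes, big-endian
    bytes.push(type_code); // 1-byte type tag
    bytes.extend_from_slice(&value.to_be_bytes()); // 8-byte big-endian payload
    bytes
}

fn unpack_value(term: &[u8]) -> Option<u64> {
    // Mirrors `value_bytes()` + `get_fast_type()`: skip the 5-byte header,
    // then require exactly 8 payload bytes.
    let payload = term.get(5..)?;
    let payload: [u8; 8] = payload.try_into().ok()?;
    Some(u64::from_be_bytes(payload))
}

fn main() {
    let term = pack_term(1, 2, 983);
    assert_eq!(unpack_value(&term), Some(983));
}
```
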
@@ -374,7 +296,7 @@ fn debug_value_bytes(typ: Type, bytes: &[u8], f: &mut fmt::Formatter) -> fmt::Re
        }
        // TODO pretty print these types too.
        Type::Date => {
            write_opt(f, get_fast_type::<crate::DateTime>(bytes))?;
            write_opt(f, get_fast_type::<DateTime>(bytes))?;
        }
        Type::Facet => {
            let facet_str = str::from_utf8(bytes)
@@ -434,7 +356,6 @@ mod tests {
        let term = Term::from_field_u64(count_field, 983u64);
        assert_eq!(term.field(), count_field);
        assert_eq!(term.typ(), Type::U64);
        assert_eq!(term.as_slice().len(), super::FAST_VALUE_TERM_LEN);
        assert_eq!(term.as_u64(), Some(983u64))
        assert_eq!(term.value_bytes().len(), super::FAST_VALUE_TERM_LEN);
    }
}

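The test above exercises the new accessor from inside the crate. For context, a rough usage sketch of the same round trip against the public API; the schema-builder calls are assumptions about the released tantivy API and are not part of this diff:

```rust
use tantivy::schema::{Schema, INDEXED};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let count_field = schema_builder.add_u64_field("count", INDEXED);
    let _schema = schema_builder.build();

    // Build a u64 term and read the value back, as in the test above.
    let term = Term::from_field_u64(count_field, 983u64);
    assert_eq!(term.field(), count_field);
    assert_eq!(term.as_u64(), Some(983u64));
}
```
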
@@ -3,14 +3,20 @@ use std::ops::BitOr;

use serde::{Deserialize, Serialize};

use super::flags::FastFlag;
use crate::schema::flags::{SchemaFlagList, StoredFlag};
use crate::schema::IndexRecordOption;

/// Define how a text field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize, Default)]
pub struct TextOptions {
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    indexing: Option<TextFieldIndexing>,
    #[serde(default)]
    stored: bool,
    #[serde(default)]
    fast: bool,
}

impl TextOptions {
@@ -24,6 +30,25 @@ impl TextOptions {
        self.stored
    }

    /// Returns true iff the value is a fast field.
    pub fn is_fast(&self) -> bool {
        self.fast
    }

    /// Set the field as a fast field.
    ///
    /// Fast fields are designed for random access.
    /// Access times are similar to a random lookup in an array.
    /// Text fast fields will have the term ids stored in the fast field.
    /// The fast field will be a multivalued fast field.
    ///
    /// The original text can be retrieved via `ord_to_term` from the dictionary.
    #[must_use]
    pub fn set_fast(mut self) -> TextOptions {
        self.fast = true;
        self
    }

    /// Sets the field as stored
    #[must_use]
    pub fn set_stored(mut self) -> TextOptions {
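A short usage sketch of the new builder methods, assuming `TextOptions` is used as exported from `tantivy::schema` (`is_stored` is the existing accessor whose body is shown truncated above):

```rust
use tantivy::schema::TextOptions;

fn main() {
    // Builder-style configuration: mark a text field as stored and as a fast field.
    let opts = TextOptions::default().set_stored().set_fast();
    assert!(opts.is_stored());
    assert!(opts.is_fast());
}
```
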
@@ -39,26 +64,60 @@ impl TextOptions {
    }
}

#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
struct TokenizerName(Cow<'static, str>);

const DEFAULT_TOKENIZER_NAME: &str = "default";

const NO_TOKENIZER_NAME: &str = "raw";

impl Default for TokenizerName {
    fn default() -> Self {
        TokenizerName::from_static(DEFAULT_TOKENIZER_NAME)
    }
}

impl TokenizerName {
    const fn from_static(name: &'static str) -> Self {
        TokenizerName(Cow::Borrowed(name))
    }
    fn from_name(name: &str) -> Self {
        TokenizerName(Cow::Owned(name.to_string()))
    }
    fn name(&self) -> &str {
        &self.0
    }
}

/// Configuration defining indexing for a text field.
///
/// It defines
/// - the amount of information that should be stored about the presence of a term in a document.
/// - The amount of information that should be stored about the presence of a term in a document.
/// Essentially, should we store the term frequency and/or the positions (See
/// [`IndexRecordOption`](./enum.IndexRecordOption.html)).
/// - the name of the `Tokenizer` that should be used to process the field.
/// - The name of the `Tokenizer` that should be used to process the field.
/// - Flag indicating, if fieldnorms should be stored (See [fieldnorm](crate::fieldnorm)). Defaults
/// to `true`.
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
pub struct TextFieldIndexing {
    #[serde(default)]
    record: IndexRecordOption,
    #[serde(default = "default_fieldnorms")]
    fieldnorms: bool,
    tokenizer: Cow<'static, str>,
    #[serde(default)]
    tokenizer: TokenizerName,
}

pub(crate) fn default_fieldnorms() -> bool {
    true
}

impl Default for TextFieldIndexing {
    fn default() -> TextFieldIndexing {
        TextFieldIndexing {
            tokenizer: Cow::Borrowed("default"),
            record: IndexRecordOption::Basic,
            fieldnorms: true,
            tokenizer: TokenizerName::default(),
            record: IndexRecordOption::default(),
            fieldnorms: default_fieldnorms(),
        }
    }
}

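One design point worth spelling out: because `TokenizerName` is a serde newtype over `Cow<'static, str>`, it (de)serializes transparently as a plain string, so existing schema JSON such as `"tokenizer": "default"` keeps working unchanged. A standalone sketch, assuming `serde` with the `derive` feature and `serde_json` as dependencies:

```rust
use std::borrow::Cow;

use serde::{Deserialize, Serialize};

// Minimal stand-in for the newtype introduced above: a serde newtype struct
// serializes as its inner value, so the JSON stays a bare string.
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
struct TokenizerName(Cow<'static, str>);

fn main() {
    let name = TokenizerName(Cow::Borrowed("default"));
    let json = serde_json::to_string(&name).unwrap();
    assert_eq!(json, r#""default""#);

    let back: TokenizerName = serde_json::from_str(&json).unwrap();
    assert_eq!(back, name);
}
```
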
@@ -67,13 +126,13 @@ impl TextFieldIndexing {
    /// Sets the tokenizer to be used for a given field.
    #[must_use]
    pub fn set_tokenizer(mut self, tokenizer_name: &str) -> TextFieldIndexing {
        self.tokenizer = Cow::Owned(tokenizer_name.to_string());
        self.tokenizer = TokenizerName::from_name(tokenizer_name);
        self
    }

    /// Returns the tokenizer that will be used for this field.
    pub fn tokenizer(&self) -> &str {
        &self.tokenizer
        self.tokenizer.name()
    }

    /// Sets fieldnorms
@@ -83,7 +142,7 @@ impl TextFieldIndexing {
        self
    }

    /// Returns true if and only if fieldnorms are stored.
    /// Returns true if and only if [fieldnorms](crate::fieldnorm) are stored.
    pub fn fieldnorms(&self) -> bool {
        self.fieldnorms
    }
@@ -108,21 +167,23 @@ impl TextFieldIndexing {
/// The field will be untokenized and indexed.
pub const STRING: TextOptions = TextOptions {
    indexing: Some(TextFieldIndexing {
        tokenizer: Cow::Borrowed("raw"),
        tokenizer: TokenizerName::from_static(NO_TOKENIZER_NAME),
        fieldnorms: true,
        record: IndexRecordOption::Basic,
    }),
    stored: false,
    fast: false,
};

/// The field will be tokenized and indexed.
pub const TEXT: TextOptions = TextOptions {
    indexing: Some(TextFieldIndexing {
        tokenizer: Cow::Borrowed("default"),
        tokenizer: TokenizerName::from_static(DEFAULT_TOKENIZER_NAME),
        fieldnorms: true,
        record: IndexRecordOption::WithFreqsAndPositions,
    }),
    stored: false,
    fast: false,
};

impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -133,6 +194,7 @@ impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
        TextOptions {
            indexing: self.indexing.or(other.indexing),
            stored: self.stored | other.stored,
            fast: self.fast | other.fast,
        }
    }
}
@@ -148,6 +210,17 @@ impl From<StoredFlag> for TextOptions {
        TextOptions {
            indexing: None,
            stored: true,
            fast: false,
        }
    }
}

impl From<FastFlag> for TextOptions {
    fn from(_: FastFlag) -> TextOptions {
        TextOptions {
            indexing: None,
            stored: false,
            fast: true,
        }
    }
}
@@ -187,4 +260,24 @@ mod tests {
        assert!(IndexRecordOption::WithFreqsAndPositions > IndexRecordOption::WithFreqs);
        assert!(IndexRecordOption::WithFreqs > IndexRecordOption::Basic);
    }

    #[test]
    fn serde_default_test() {
        let json = r#"
        {
            "indexing": {
                "record": "basic",
                "fieldnorms": true,
                "tokenizer": "default"
            },
            "stored": false
        }
        "#;
        let options: TextOptions = serde_json::from_str(json).unwrap();
        let options2: TextOptions = serde_json::from_str("{\"indexing\": {}}").unwrap();
        assert_eq!(options, options2);
        assert_eq!(options.indexing.unwrap().record, IndexRecordOption::Basic);
        let options3: TextOptions = serde_json::from_str("{}").unwrap();
        assert_eq!(options3.indexing, None);
    }
}

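Together, the `BitOr` impl and the new `From<FastFlag>` conversion mean the flag syntax composes the fast bit as well. A hedged usage sketch; the `TEXT`, `STORED` and `FAST` constants are assumed to be the ones re-exported from `tantivy::schema`, and the `| FAST` step relies on this branch's `From<FastFlag>` impl:

```rust
use tantivy::schema::{Schema, TextOptions, FAST, STORED, TEXT};

fn main() {
    // `|` ORs the stored/fast bits and keeps the first indexing configuration,
    // as in the BitOr impl shown above.
    let opts: TextOptions = TEXT | STORED | FAST;
    assert!(opts.is_stored());
    assert!(opts.is_fast());

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED | FAST);
    let _schema = schema_builder.build();
}
```
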
@@ -22,7 +22,7 @@ pub enum Value {
    I64(i64),
    /// 64-bits Float `f64`
    F64(f64),
    /// Signed 64-bits Date time stamp `date`
    /// Date/time with second precision
    Date(DateTime),
    /// Facet
    Facet(Facet),
@@ -43,7 +43,7 @@ impl Serialize for Value {
            Value::U64(u) => serializer.serialize_u64(u),
            Value::I64(u) => serializer.serialize_i64(u),
            Value::F64(u) => serializer.serialize_f64(u),
            Value::Date(ref date) => serializer.serialize_str(&date.to_rfc3339()),
            Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer),
            Value::Facet(ref facet) => facet.serialize(serializer),
            Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
            Value::JsonObject(ref obj) => obj.serialize(serializer),
@@ -154,9 +154,9 @@ impl Value {
    /// Returns the Date-value, provided the value is of the `Date` type.
    ///
    /// Returns None if the value is not of type `Date`.
    pub fn as_date(&self) -> Option<&DateTime> {
    pub fn as_date(&self) -> Option<DateTime> {
        if let Value::Date(date) = self {
            Some(date)
            Some(*date)
        } else {
            None
        }
@@ -209,9 +209,9 @@ impl From<f64> for Value {
    }
}

impl From<crate::DateTime> for Value {
    fn from(date_time: crate::DateTime) -> Value {
        Value::Date(date_time)
impl From<DateTime> for Value {
    fn from(dt: DateTime) -> Value {
        Value::Date(dt)
    }
}

@@ -265,12 +265,12 @@ impl From<serde_json::Value> for Value {
mod binary_serialize {
    use std::io::{self, Read, Write};

    use chrono::{TimeZone, Utc};
    use common::{f64_to_u64, u64_to_f64, BinarySerializable};

    use super::Value;
    use crate::schema::Facet;
    use crate::tokenizer::PreTokenizedString;
    use crate::DateTime;

    const TEXT_CODE: u8 = 0;
    const U64_CODE: u8 = 1;
@@ -319,7 +319,8 @@ mod binary_serialize {
                }
                Value::Date(ref val) => {
                    DATE_CODE.serialize(writer)?;
                    val.timestamp().serialize(writer)
                    let DateTime { unix_timestamp } = val;
                    unix_timestamp.serialize(writer)
                }
                Value::Facet(ref facet) => {
                    HIERARCHICAL_FACET_CODE.serialize(writer)?;
@@ -357,8 +358,8 @@ mod binary_serialize {
                    Ok(Value::F64(value))
                }
                DATE_CODE => {
                    let timestamp = i64::deserialize(reader)?;
                    Ok(Value::Date(Utc.timestamp(timestamp, 0)))
                    let unix_timestamp = i64::deserialize(reader)?;
                    Ok(Value::Date(DateTime::from_unix_timestamp(unix_timestamp)))
                }
                HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
                BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),
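In other words, a date value survives binary (de)serialization only as whole seconds since the UNIX epoch. A self-contained stand-in for the two branches above; the struct and the byte order here are illustrative only, since the real code defers to `BinarySerializable`:

```rust
// Hypothetical stand-in for the DATE_CODE branches above: a date is persisted
// as its UNIX timestamp (seconds, i64) and rebuilt from it, so sub-second and
// time zone information cannot survive a round trip.
#[derive(Debug, PartialEq)]
struct DateTime {
    unix_timestamp: i64,
}

impl DateTime {
    fn from_unix_timestamp(unix_timestamp: i64) -> Self {
        DateTime { unix_timestamp }
    }
}

fn serialize(dt: &DateTime) -> [u8; 8] {
    let DateTime { unix_timestamp } = dt;
    unix_timestamp.to_be_bytes()
}

fn deserialize(bytes: [u8; 8]) -> DateTime {
    DateTime::from_unix_timestamp(i64::from_be_bytes(bytes))
}

fn main() {
    let original = DateTime::from_unix_timestamp(851_042_397); // 1996-12-20T00:39:57Z
    assert_eq!(deserialize(serialize(&original)), original);
}
```
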
@@ -401,15 +402,24 @@ mod binary_serialize {

#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use super::Value;
    use crate::time::format_description::well_known::Rfc3339;
    use crate::time::OffsetDateTime;
    use crate::DateTime;

    #[test]
    fn test_serialize_date() {
        let value = Value::Date(DateTime::from_str("1996-12-20T00:39:57+00:00").unwrap());
        let value = Value::from(DateTime::from_utc(
            OffsetDateTime::parse("1996-12-20T00:39:57+00:00", &Rfc3339).unwrap(),
        ));
        let serialized_value_json = serde_json::to_string_pretty(&value).unwrap();
        assert_eq!(serialized_value_json, r#""1996-12-20T00:39:57+00:00""#);
        assert_eq!(serialized_value_json, r#""1996-12-20T00:39:57Z""#);
        let value = Value::from(DateTime::from_utc(
            OffsetDateTime::parse("1996-12-20T00:39:57-01:00", &Rfc3339).unwrap(),
        ));
        let serialized_value_json = serde_json::to_string_pretty(&value).unwrap();
        // The time zone information gets lost by conversion into `Value::Date` and
        // implicitly becomes UTC.
        assert_eq!(serialized_value_json, r#""1996-12-20T01:39:57Z""#);
    }
}

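The same normalization can be reproduced with the `time` crate alone, which may help when preparing values for indexing: going through a UNIX timestamp drops the original offset and re-expresses the instant in UTC. A standalone sketch, assuming `time` with the `parsing` and `formatting` features enabled:

```rust
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Parse a value carrying a -01:00 offset...
    let with_offset = OffsetDateTime::parse("1996-12-20T00:39:57-01:00", &Rfc3339)?;
    // ...then rebuild it from its UNIX timestamp: the offset is gone and the
    // same instant comes back expressed in UTC, matching the test above.
    let in_utc = OffsetDateTime::from_unix_timestamp(with_offset.unix_timestamp())?;
    assert_eq!(in_utc.format(&Rfc3339)?, "1996-12-20T01:39:57Z");
    Ok(())
}
```
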
@@ -41,7 +41,6 @@ mod tests {

    use std::io;

    use futures::executor::block_on;
    use proptest::strategy::{BoxedStrategy, Strategy};

    use super::{SkipIndex, SkipIndexBuilder};
@@ -145,7 +144,7 @@ mod tests {
        index_writer.delete_term(Term::from_field_text(text, "testb"));
        index_writer.commit()?;
        let segment_ids = index.searchable_segment_ids()?;
        block_on(index_writer.merge(&segment_ids))?;
        index_writer.merge(&segment_ids).wait().unwrap();
        let reader = index.reader()?;
        let searcher = reader.searcher();
        assert_eq!(searcher.num_docs(), 30);

@@ -55,8 +55,6 @@ pub mod tests {

    use std::path::Path;

    use futures::executor::block_on;

    use super::*;
    use crate::directory::{Directory, RamDirectory, WritePtr};
    use crate::fastfield::AliveBitSet;
@@ -269,7 +267,7 @@ pub mod tests {
            .searchable_segment_ids()
            .expect("Searchable segments failed.");
        let mut index_writer = index.writer_for_tests().unwrap();
        assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
        assert!(index_writer.merge(&segment_ids).wait().is_ok());
        assert!(index_writer.wait_merging_threads().is_ok());
    }

@@ -316,7 +314,7 @@ pub mod tests {
        {
            let segment_ids = index.searchable_segment_ids()?;
            let mut index_writer = index.writer_for_tests()?;
            block_on(index_writer.merge(&segment_ids))?;
            index_writer.merge(&segment_ids).wait()?;
            index_writer.wait_merging_threads()?;
        }

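As the hunks above show, test code no longer drives the merge future with `futures::executor::block_on`; the handle returned by `merge` is awaited with a blocking `wait()`. A hedged end-to-end sketch of the same pattern outside the test suite; the index setup calls are assumptions about the public tantivy API, and `wait()` is the method introduced by these hunks:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    let _ = index_writer.add_document(doc!(text => "hello"));
    index_writer.commit()?;

    // Drive the merge to completion with the blocking `wait()` handle,
    // then let the merging threads finish.
    let segment_ids = index.searchable_segment_ids()?;
    if !segment_ids.is_empty() {
        index_writer.merge(&segment_ids).wait()?;
    }
    index_writer.wait_merging_threads()?;
    Ok(())
}
```
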
@@ -29,7 +29,7 @@ impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {
        }
        if !self.token_mut().text.is_ascii() {
            // ignore its already ascii
            to_ascii(&mut self.tail.token_mut().text, &mut self.buffer);
            to_ascii(&self.tail.token().text, &mut self.buffer);
            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
        }
        true
@@ -1625,9 +1625,9 @@ mod tests {

    #[test]
    fn test_to_ascii() {
        let mut input = "Rámon".to_string();
        let input = "Rámon".to_string();
        let mut buffer = String::new();
        to_ascii(&mut input, &mut buffer);
        to_ascii(&input, &mut buffer);
        assert_eq!("Ramon", buffer);
    }

@@ -40,7 +40,7 @@ impl<'a> TokenStream for LowerCaserTokenStream<'a> {
            // fast track for ascii.
            self.token_mut().text.make_ascii_lowercase();
        } else {
            to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer);
            to_lowercase_unicode(&self.tail.token().text, &mut self.buffer);
            mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
        }
        true

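Both token filters now follow the same borrow pattern: read the token text through an immutable borrow into a scratch buffer, then `mem::swap` the buffer in, instead of handing `to_ascii`/`to_lowercase_unicode` a mutable reference to the text they are also reading. A standalone sketch of that pattern; the folding logic is a placeholder, not the real character tables:

```rust
use std::mem;

struct Token {
    text: String,
}

// Placeholder "folding": copy the text, replacing any non-ASCII char with '?'.
// The real filters map characters such as 'á' to "a" via lookup tables.
fn to_ascii(text: &str, output: &mut String) {
    output.clear();
    output.extend(text.chars().map(|c| if c.is_ascii() { c } else { '?' }));
}

fn main() {
    let mut token = Token {
        text: "Rámon".to_string(),
    };
    let mut buffer = String::new();
    if !token.text.is_ascii() {
        // Immutable read into the scratch buffer...
        to_ascii(&token.text, &mut buffer);
        // ...then swap the folded text into place without overlapping borrows.
        mem::swap(&mut token.text, &mut buffer);
    }
    assert_eq!(token.text, "R?mon");
}
```
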