Compare commits
485 Commits
remove_ran
...
column-tra
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4a072e3c18 | ||
|
|
84e0c75598 | ||
|
|
08c4412d73 | ||
|
|
70e58adff9 | ||
|
|
0d1cd119e9 | ||
|
|
d3dd620048 | ||
|
|
e89c220b56 | ||
|
|
a451f6d60d | ||
|
|
f740ddeee3 | ||
|
|
7a26cc9022 | ||
|
|
54972caa7c | ||
|
|
5d436759b0 | ||
|
|
6f563b1606 | ||
|
|
095fb68fda | ||
|
|
6316eaefc6 | ||
|
|
5331be800b | ||
|
|
c73b425bc1 | ||
|
|
54cfd0d154 | ||
|
|
0dd62169c8 | ||
|
|
3a9727aa91 | ||
|
|
17093e8ffe | ||
|
|
03e4630cd8 | ||
|
|
4ae0317d68 | ||
|
|
107b19855f | ||
|
|
d8f66ba07e | ||
|
|
f908549245 | ||
|
|
3673a5df9b | ||
|
|
3984cafccc | ||
|
|
298b5dd726 | ||
|
|
8bbb22e9bf | ||
|
|
513f68209d | ||
|
|
91f2f7e722 | ||
|
|
c476b530cf | ||
|
|
77dd202e19 | ||
|
|
00ebff3c16 | ||
|
|
9a6d37c42c | ||
|
|
bb01e99e05 | ||
|
|
535f1a5d83 | ||
|
|
625f9174a7 | ||
|
|
11a4d97cf5 | ||
|
|
1c3d39677a | ||
|
|
6f65995cfd | ||
|
|
e2e4190571 | ||
|
|
82209c58aa | ||
|
|
21519788ea | ||
|
|
4c6c6e4a9c | ||
|
|
df0ac9e901 | ||
|
|
71ab482720 | ||
|
|
2ae383e452 | ||
|
|
8b3a6f6231 | ||
|
|
11edd6bd59 | ||
|
|
193a3c21f4 | ||
|
|
998b1263f6 | ||
|
|
72272bdf81 | ||
|
|
c39c2d79da | ||
|
|
67d94f5bd2 | ||
|
|
abbd934ac9 | ||
|
|
7f9ba0ee50 | ||
|
|
8edcd6f958 | ||
|
|
f50700835d | ||
|
|
494e92ca59 | ||
|
|
4a3169011d | ||
|
|
050fc5dde9 | ||
|
|
ce45889add | ||
|
|
4875174d16 | ||
|
|
0c634c5bc6 | ||
|
|
e25ab5d537 | ||
|
|
27400c9ad3 | ||
|
|
19074e1d5e | ||
|
|
014b1adc3e | ||
|
|
84295d5b35 | ||
|
|
625bcb4877 | ||
|
|
f01cb7d3aa | ||
|
|
8e773ade77 | ||
|
|
fad3faefe2 | ||
|
|
9811d15657 | ||
|
|
31ba5a3c16 | ||
|
|
f4d7621370 | ||
|
|
d4b2b7de8b | ||
|
|
d5ee4edf25 | ||
|
|
fcc7bd7024 | ||
|
|
ce8d6b259a | ||
|
|
099e626156 | ||
|
|
71041b2314 | ||
|
|
09aae134e6 | ||
|
|
6a9d09cf7a | ||
|
|
704d0a8d8b | ||
|
|
195309a557 | ||
|
|
da0f78e06c | ||
|
|
9b6b60cc2b | ||
|
|
6444516a82 | ||
|
|
a9b0d1a0ab | ||
|
|
2b333ca635 | ||
|
|
80a1418284 | ||
|
|
5ab5f070ed | ||
|
|
d122f2c74e | ||
|
|
5b564916f0 | ||
|
|
06fd8684b7 | ||
|
|
931bab8010 | ||
|
|
8dac30e6d1 | ||
|
|
2e0a7d072f | ||
|
|
af84e74284 | ||
|
|
fff1a03842 | ||
|
|
90e296f2d0 | ||
|
|
5f966d747b | ||
|
|
d24f31f965 | ||
|
|
f26b686a1c | ||
|
|
775e936f7d | ||
|
|
7e032a9efd | ||
|
|
23fe73a6c0 | ||
|
|
a4be239d38 | ||
|
|
2406d9278b | ||
|
|
6c2d9737f1 | ||
|
|
a5688572a5 | ||
|
|
431b5a091e | ||
|
|
2c17271cd9 | ||
|
|
5750224d4c | ||
|
|
02691f2445 | ||
|
|
e31e78f39f | ||
|
|
9db2f0e82b | ||
|
|
2ed5cc873d | ||
|
|
d278417300 | ||
|
|
d89a8dd118 | ||
|
|
1bd44a5f61 | ||
|
|
d750ced813 | ||
|
|
fbc469e5df | ||
|
|
c1273670e4 | ||
|
|
7eb267341e | ||
|
|
db1836691e | ||
|
|
437cd350a2 | ||
|
|
8024ecf013 | ||
|
|
9baefbe2ab | ||
|
|
ad76d11008 | ||
|
|
c3220bece0 | ||
|
|
2b713f0977 | ||
|
|
0bc6b4a117 | ||
|
|
79e42d4a6d | ||
|
|
0135fbc4c8 | ||
|
|
449594f67a | ||
|
|
8b6647e908 | ||
|
|
efabcbcdf5 | ||
|
|
7bf5962554 | ||
|
|
4c7dedef29 | ||
|
|
93f356a7a7 | ||
|
|
6ca5f77466 | ||
|
|
2e2822f89d | ||
|
|
de178a1901 | ||
|
|
11e4225f23 | ||
|
|
f21b73d1f6 | ||
|
|
1440f3243b | ||
|
|
83d0c13fb0 | ||
|
|
88054aa333 | ||
|
|
635c39ba48 | ||
|
|
eab2257637 | ||
|
|
328bd96c24 | ||
|
|
fc24842a43 | ||
|
|
2d6f1d43ff | ||
|
|
ca0973ec78 | ||
|
|
38ee60d792 | ||
|
|
f68be28284 | ||
|
|
fc43ab9280 | ||
|
|
38c2ea6a5d | ||
|
|
26a0fd1fbe | ||
|
|
811b91ecb3 | ||
|
|
25c00ce856 | ||
|
|
e5debb97a7 | ||
|
|
bc4cd9ffaa | ||
|
|
9a13d8709b | ||
|
|
e6eadf1a2f | ||
|
|
7cca7e6a47 | ||
|
|
ef2492dba6 | ||
|
|
2981e6c1df | ||
|
|
b33b4c0092 | ||
|
|
4d9d2b6db0 | ||
|
|
ed868f93a3 | ||
|
|
5e599d96d7 | ||
|
|
314ae43a45 | ||
|
|
fce91b2f3a | ||
|
|
9bcd2b8104 | ||
|
|
0c9c257150 | ||
|
|
1af85a2956 | ||
|
|
bc4c3d0c6b | ||
|
|
6937c75f05 | ||
|
|
e54429e827 | ||
|
|
ca836b6414 | ||
|
|
f0a2b1cc44 | ||
|
|
fcfdc44c61 | ||
|
|
3171f0b9ba | ||
|
|
89e19f14b5 | ||
|
|
1a6a1396cd | ||
|
|
e766375700 | ||
|
|
496b4a4fdb | ||
|
|
93cc8498b3 | ||
|
|
0aa3d63a9f | ||
|
|
4e2a053b69 | ||
|
|
71c4393ec4 | ||
|
|
b2e97e266a | ||
|
|
9ee4772140 | ||
|
|
c95013b11e | ||
|
|
71f75071d2 | ||
|
|
b114e553cd | ||
|
|
17dcc99e43 | ||
|
|
c5c2e59b2b | ||
|
|
fc045e6bf9 | ||
|
|
6837a4d468 | ||
|
|
0759bf9448 | ||
|
|
152e8238d7 | ||
|
|
d4e5b48437 | ||
|
|
03040ed81d | ||
|
|
aaa22ad225 | ||
|
|
44ea7313ca | ||
|
|
3223bdf254 | ||
|
|
11ac451250 | ||
|
|
6a4632211a | ||
|
|
a99e5459e3 | ||
|
|
3f88718f38 | ||
|
|
cbd06ab189 | ||
|
|
749395bbb8 | ||
|
|
617ba1f0c0 | ||
|
|
2f1cd7e7f0 | ||
|
|
58c0cb5fc4 | ||
|
|
7f45a6ac96 | ||
|
|
0ade871126 | ||
|
|
aab65490c9 | ||
|
|
d77e8de36a | ||
|
|
d11a8cce26 | ||
|
|
bc607a921b | ||
|
|
1273f33338 | ||
|
|
e30449743c | ||
|
|
ed26552296 | ||
|
|
65d129afbd | ||
|
|
386ffab76c | ||
|
|
57a8d0359c | ||
|
|
14cb66ee00 | ||
|
|
9e38343352 | ||
|
|
944302ae2f | ||
|
|
be70804d17 | ||
|
|
a1afc80600 | ||
|
|
02e24fda52 | ||
|
|
7e3c0c5392 | ||
|
|
fdb2524f9e | ||
|
|
4db655ae82 | ||
|
|
bb44cc84c4 | ||
|
|
8c1e1cf1ad | ||
|
|
b5b16948b0 | ||
|
|
c305d3a2a2 | ||
|
|
038d234ff1 | ||
|
|
c45eb9a9fa | ||
|
|
824d6f96fe | ||
|
|
7cf821bac0 | ||
|
|
ae83fc8298 | ||
|
|
a7bc361145 | ||
|
|
2805291400 | ||
|
|
6614a2cba0 | ||
|
|
6f4d203d1b | ||
|
|
1be6c6111c | ||
|
|
c7c3eab256 | ||
|
|
ec69875d15 | ||
|
|
d832cfcfd8 | ||
|
|
ab6b532cc4 | ||
|
|
4b6047f7d7 | ||
|
|
5ca04beb94 | ||
|
|
902d05ebec | ||
|
|
f1b298642a | ||
|
|
dd13dedaeb | ||
|
|
46724b4a05 | ||
|
|
24432bf523 | ||
|
|
31d3bcfff2 | ||
|
|
706fbd6886 | ||
|
|
8a8a048015 | ||
|
|
c72549cb9a | ||
|
|
d6f803212c | ||
|
|
dac73537d2 | ||
|
|
bb5254de12 | ||
|
|
be5218c2f6 | ||
|
|
ec9478830a | ||
|
|
8807bfd13d | ||
|
|
447811c111 | ||
|
|
f29acf5d8c | ||
|
|
125707dbe0 | ||
|
|
46d5de920d | ||
|
|
d2a7bcf217 | ||
|
|
141b9aa245 | ||
|
|
c5a6282fa8 | ||
|
|
c0f524e1a3 | ||
|
|
958b2bee08 | ||
|
|
f619658e2c | ||
|
|
aa391bf843 | ||
|
|
47dcbdbeae | ||
|
|
691245bf20 | ||
|
|
90798d4b39 | ||
|
|
0b6d9f90cf | ||
|
|
8a5a12d961 | ||
|
|
e73542e2e8 | ||
|
|
0262e44bbd | ||
|
|
613aad7a8a | ||
|
|
1aa88b0c51 | ||
|
|
564fa38085 | ||
|
|
59ec21479f | ||
|
|
42283f9e91 | ||
|
|
b105bf72e1 | ||
|
|
226f577803 | ||
|
|
2e255c4bef | ||
|
|
387592809f | ||
|
|
cedced5bb0 | ||
|
|
d31f045872 | ||
|
|
6656a70d1b | ||
|
|
d36e0a9549 | ||
|
|
8771b2673f | ||
|
|
a41d3d51a4 | ||
|
|
cae34ffe47 | ||
|
|
4b62f7907d | ||
|
|
7fa6a0b665 | ||
|
|
458ed29a31 | ||
|
|
e37775fe21 | ||
|
|
1cd2434a32 | ||
|
|
de2cba6d1e | ||
|
|
c0b1a58d27 | ||
|
|
848b795b9f | ||
|
|
091b668624 | ||
|
|
5004290daa | ||
|
|
5d2c2b804c | ||
|
|
1a92b588e0 | ||
|
|
010e92c118 | ||
|
|
2ead010c83 | ||
|
|
c4f66eb185 | ||
|
|
d7b46d2137 | ||
|
|
d042ce74c7 | ||
|
|
7ba9e662b8 | ||
|
|
fdd5ef85e5 | ||
|
|
704498a1ac | ||
|
|
1232af7928 | ||
|
|
d37633e034 | ||
|
|
9815067171 | ||
|
|
972cb6c26d | ||
|
|
4dc80cfa25 | ||
|
|
cef145790c | ||
|
|
e05e2a0c51 | ||
|
|
e028515caf | ||
|
|
850b9eaea4 | ||
|
|
505e6a440c | ||
|
|
fcd651f6a9 | ||
|
|
e6653228a9 | ||
|
|
bdedefe07d | ||
|
|
13a4473faa | ||
|
|
2069e3e52b | ||
|
|
0d8263cba1 | ||
|
|
65b365b81c | ||
|
|
4c1366da87 | ||
|
|
eca6628b3c | ||
|
|
9679c5f306 | ||
|
|
5a2497b6fd | ||
|
|
99d4b1a177 | ||
|
|
732f6847c0 | ||
|
|
1c6d9bdc6a | ||
|
|
3ea6800ac5 | ||
|
|
395303b644 | ||
|
|
2c200b46cb | ||
|
|
17e00df112 | ||
|
|
3129d86743 | ||
|
|
e5e252cbc0 | ||
|
|
b2da82f151 | ||
|
|
c81b3030fa | ||
|
|
9e66c75fc6 | ||
|
|
ebdbb6bd2e | ||
|
|
c980b19dd9 | ||
|
|
098eea843a | ||
|
|
466dc8233c | ||
|
|
03c2f6ece2 | ||
|
|
1d4e9a29db | ||
|
|
f378d9a57b | ||
|
|
dde49ac8e2 | ||
|
|
c3cc93406d | ||
|
|
bd0f9211da | ||
|
|
c503c6e4fa | ||
|
|
02174d26af | ||
|
|
cf92be3bd6 | ||
|
|
72cef12db1 | ||
|
|
bbc0a2e233 | ||
|
|
4fd1a6c84b | ||
|
|
c83d99c414 | ||
|
|
eacf510175 | ||
|
|
8802d125f8 | ||
|
|
33301a3eb4 | ||
|
|
7234bef0eb | ||
|
|
fcff91559b | ||
|
|
b75d4e59d1 | ||
|
|
c6b5ab1dbe | ||
|
|
c12e07f0ce | ||
|
|
8b877a4c26 | ||
|
|
7dc0dc1c9b | ||
|
|
0462754673 | ||
|
|
5916ceda73 | ||
|
|
70283dc6c8 | ||
|
|
dbaf4f3623 | ||
|
|
4808648322 | ||
|
|
54afb9b34a | ||
|
|
d336c8b938 | ||
|
|
980d1b2796 | ||
|
|
6317982876 | ||
|
|
e2fbbc08ca | ||
|
|
99cd25beae | ||
|
|
737ecc7015 | ||
|
|
09668459c8 | ||
|
|
e5fd30f438 | ||
|
|
c412a46105 | ||
|
|
3a78402496 | ||
|
|
d18ac136c0 | ||
|
|
b5b1244857 | ||
|
|
27acfa4dea | ||
|
|
02cffa4dea | ||
|
|
b52abbc771 | ||
|
|
894c61867f | ||
|
|
352e0cc58d | ||
|
|
ffe4446d90 | ||
|
|
4d05b26e7a | ||
|
|
0855649986 | ||
|
|
d828e58903 | ||
|
|
aa0396fe27 | ||
|
|
8d8315f8d0 | ||
|
|
078c0a2e2e | ||
|
|
f21e8dd875 | ||
|
|
74e36c7e97 | ||
|
|
f27ae04282 | ||
|
|
0ce49c9dd4 | ||
|
|
fe8e58e078 | ||
|
|
efc0d8341b | ||
|
|
22bcc83d10 | ||
|
|
5ee5037934 | ||
|
|
c217bfed1e | ||
|
|
c27ccd3e24 | ||
|
|
367f5da782 | ||
|
|
b256df6599 | ||
|
|
d7a6a409a1 | ||
|
|
a1f5cead96 | ||
|
|
37c5fe3c86 | ||
|
|
4583fa270b | ||
|
|
beb3a5bd73 | ||
|
|
93cbd52bf0 | ||
|
|
c22177a005 | ||
|
|
4da71273e1 | ||
|
|
2c78b31aab | ||
|
|
4ae1d87632 | ||
|
|
46b86a7976 | ||
|
|
3bc177e69d | ||
|
|
319609e9c1 | ||
|
|
9d87b89718 | ||
|
|
dd81e38e53 | ||
|
|
9f32b22602 | ||
|
|
096ce7488e | ||
|
|
a1782dd172 | ||
|
|
000d76b11a | ||
|
|
abd29f6646 | ||
|
|
b4ecf0ab2f | ||
|
|
798f7dbf67 | ||
|
|
06a2e47c8d | ||
|
|
e0b83eb291 | ||
|
|
13401f46ea | ||
|
|
1a45b030dc | ||
|
|
62052bcc2d | ||
|
|
3265f7bec3 | ||
|
|
ee0881712a | ||
|
|
483e0336b6 | ||
|
|
3e8f267e33 | ||
|
|
3b247fd968 | ||
|
|
750f6e6479 | ||
|
|
5b475e6603 | ||
|
|
0ca7f73dc5 | ||
|
|
47ed18845e | ||
|
|
dc141cdb29 | ||
|
|
f6cf6e889b | ||
|
|
f379a80233 | ||
|
|
4a320fd1ff | ||
|
|
85d23e8e3b | ||
|
|
022ab9d298 | ||
|
|
605e8603dc | ||
|
|
70f160b329 | ||
|
|
6d265e6bed | ||
|
|
fdc512391b | ||
|
|
108714c934 | ||
|
|
44e8cf98a5 | ||
|
|
f0ee69d9e9 | ||
|
|
b8a10c8406 | ||
|
|
ff4813529e | ||
|
|
470bc18e9b |
7
.github/dependabot.yml
vendored
@@ -6,3 +6,10 @@ updates:
|
||||
interval: daily
|
||||
time: "20:00"
|
||||
open-pull-requests-limit: 10
|
||||
|
||||
- package-ecosystem: "github-actions"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: daily
|
||||
time: "20:00"
|
||||
open-pull-requests-limit: 10
|
||||
|
||||
35
.github/workflows/coverage.yml
vendored
@@ -1,27 +1,26 @@
|
||||
name: coverage
|
||||
name: Coverage
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: coverage
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: xd009642/tarpaulin:develop-nightly
|
||||
options: --security-opt seccomp=unconfined
|
||||
coverage:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Generate code coverage
|
||||
run: |
|
||||
cargo +nightly tarpaulin --verbose --all-features --workspace --timeout 120 --out Xml
|
||||
|
||||
- name: Upload to codecov.io
|
||||
uses: codecov/codecov-action@v1
|
||||
- uses: actions/checkout@v3
|
||||
- name: Install Rust
|
||||
run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
- uses: taiki-e/install-action@cargo-llvm-cov
|
||||
- name: Generate code coverage
|
||||
run: cargo +nightly llvm-cov --all-features --workspace --lcov --output-path lcov.info
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v3
|
||||
continue-on-error: true
|
||||
with:
|
||||
# token: ${{secrets.CODECOV_TOKEN}} # not required for public repos
|
||||
fail_ci_if_error: true
|
||||
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
|
||||
files: lcov.info
|
||||
fail_ci_if_error: true
|
||||
|
||||
28
.github/workflows/long_running.yml
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
name: Long running tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000
|
||||
|
||||
jobs:
|
||||
test:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
- name: Run indexing_unsorted
|
||||
run: cargo test indexing_unsorted -- --ignored
|
||||
- name: Run indexing_sorted
|
||||
run: cargo test indexing_sorted -- --ignored
|
||||
64
.github/workflows/test.yml
vendored
@@ -1,4 +1,4 @@
|
||||
name: Rust
|
||||
name: Unit tests
|
||||
|
||||
on:
|
||||
push:
|
||||
@@ -10,21 +10,65 @@ env:
|
||||
CARGO_TERM_COLOR: always
|
||||
|
||||
jobs:
|
||||
build:
|
||||
check:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Build
|
||||
run: cargo build --verbose --workspace
|
||||
- name: Install latest nightly to test also against unstable feature flag
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Install nightly
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
profile: minimal
|
||||
components: rustfmt
|
||||
- name: Run tests
|
||||
run: cargo test --all-features --verbose --workspace
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
profile: minimal
|
||||
components: clippy
|
||||
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
|
||||
- name: Check Formatting
|
||||
run: cargo fmt --all -- --check
|
||||
run: cargo +nightly fmt --all -- --check
|
||||
|
||||
- uses: actions-rs/clippy-check@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
args: --tests
|
||||
|
||||
test:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
features: [
|
||||
{ label: "all", flags: "mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
|
||||
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
|
||||
]
|
||||
|
||||
name: test-${{ matrix.features.label}}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
profile: minimal
|
||||
override: true
|
||||
|
||||
- uses: taiki-e/install-action@nextest
|
||||
- uses: Swatinem/rust-cache@v2
|
||||
|
||||
- name: Run tests
|
||||
run: cargo +stable nextest run --features ${{ matrix.features.flags }} --verbose --workspace
|
||||
|
||||
- name: Run doctests
|
||||
run: cargo +stable test --doc --features ${{ matrix.features.flags }} --verbose --workspace
|
||||
|
||||
1
.gitignore
vendored
@@ -1,4 +1,5 @@
|
||||
tantivy.iml
|
||||
.cargo
|
||||
proptest-regressions
|
||||
*.swp
|
||||
target
|
||||
|
||||
92
.travis.yml
@@ -1,92 +0,0 @@
|
||||
# Based on the "trust" template v0.1.2
|
||||
# https://github.com/japaric/trust/tree/v0.1.2
|
||||
|
||||
dist: trusty
|
||||
language: rust
|
||||
services: docker
|
||||
sudo: required
|
||||
|
||||
env:
|
||||
global:
|
||||
- CRATE_NAME=tantivy
|
||||
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
|
||||
# - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
- kalakris-cmake
|
||||
packages:
|
||||
- gcc-4.8
|
||||
- g++-4.8
|
||||
- libcurl4-openssl-dev
|
||||
- libelf-dev
|
||||
- libdw-dev
|
||||
- binutils-dev
|
||||
- cmake
|
||||
|
||||
matrix:
|
||||
include:
|
||||
# Android
|
||||
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
|
||||
#- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
|
||||
#- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
|
||||
#- env: TARGET=i686-linux-android DISABLE_TESTS=1
|
||||
#- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
|
||||
|
||||
# Linux
|
||||
#- env: TARGET=aarch64-unknown-linux-gnu
|
||||
#- env: TARGET=i686-unknown-linux-gnu
|
||||
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 #UPLOAD_DOCS=1
|
||||
# - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
|
||||
# OSX
|
||||
#- env: TARGET=x86_64-apple-darwin
|
||||
# os: osx
|
||||
|
||||
before_install:
|
||||
- set -e
|
||||
- rustup self update
|
||||
- rustup component add rustfmt
|
||||
|
||||
install:
|
||||
- sh ci/install.sh
|
||||
- source ~/.cargo/env || true
|
||||
- env | grep "TRAVIS"
|
||||
|
||||
before_script:
|
||||
- export PATH=$HOME/.cargo/bin:$PATH
|
||||
- cargo install cargo-update || echo "cargo-update already installed"
|
||||
- cargo install cargo-travis || echo "cargo-travis already installed"
|
||||
|
||||
script:
|
||||
- bash ci/script.sh
|
||||
- cargo fmt --all -- --check
|
||||
|
||||
before_deploy:
|
||||
- sh ci/before_deploy.sh
|
||||
|
||||
after_success:
|
||||
# Needs GH_TOKEN env var to be set in travis settings
|
||||
- if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
|
||||
- if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
|
||||
|
||||
#cache: cargo
|
||||
#before_cache:
|
||||
# # Travis can't cache files that are not readable by "others"
|
||||
# - chmod -R a+r $HOME/.cargo
|
||||
# - find ./target/debug -type f -maxdepth 1 -delete
|
||||
# - rm -f ./target/.rustc_info.json
|
||||
# - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
|
||||
# - rm -r target/debug/examples/
|
||||
# - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
|
||||
|
||||
#branches:
|
||||
# only:
|
||||
# # release tags
|
||||
# - /^v\d+\.\d+\.\d+.*$/
|
||||
# - master
|
||||
|
||||
notifications:
|
||||
email:
|
||||
on_success: never
|
||||
@@ -10,6 +10,7 @@ Tantivy's bread and butter is to address the problem of full-text search :
|
||||
Given a large set of textual documents, and a text query, return the K-most relevant documents in a very efficient way. To execute these queries rapidly, the tantivy needs to build an index beforehand. The relevance score implemented in the tantivy is not configurable. Tantivy uses the same score as the default similarity used in Lucene / Elasticsearch, called [BM25](https://en.wikipedia.org/wiki/Okapi_BM25).
|
||||
|
||||
But tantivy's scope does not stop there. Numerous features are required to power rich-search applications. For instance, one may want to:
|
||||
|
||||
- compute the count of documents matching a query in the different section of an e-commerce website,
|
||||
- display an average price per meter square for a real estate search engine,
|
||||
- take into account historical user data to rank documents in a specific way,
|
||||
@@ -22,27 +23,28 @@ rapidly select all documents matching a given predicate (also known as a query)
|
||||
collect some information about them ([See collector](#collector-define-what-to-do-with-matched-documents)).
|
||||
|
||||
Roughly speaking the design is following these guiding principles:
|
||||
|
||||
- Search should be O(1) in memory.
|
||||
- Indexing should be O(1) in memory. (In practice it is just sublinear)
|
||||
- Search should be as fast as possible
|
||||
|
||||
This comes at the cost of the dynamicity of the index: while it is possible to add, and delete documents from our corpus, the tantivy is designed to handle these updates in large batches.
|
||||
|
||||
## [core/](src/core): Index, segments, searchers.
|
||||
## [core/](src/core): Index, segments, searchers
|
||||
|
||||
Core contains all of the high-level code to make it possible to create an index, add documents, delete documents and commit.
|
||||
|
||||
This is both the most high-level part of tantivy, the least performance-sensitive one, the seemingly most mundane code... And paradoxically the most complicated part.
|
||||
|
||||
### Index and Segments...
|
||||
### Index and Segments
|
||||
|
||||
A tantivy index is a collection of smaller independent immutable segments.
|
||||
A tantivy index is a collection of smaller independent immutable segments.
|
||||
Each segment contains its own independent set of data structures.
|
||||
|
||||
A segment is identified by a segment id that is in fact a UUID.
|
||||
The file of a segment has the format
|
||||
|
||||
```segment-id . ext ```
|
||||
```segment-id . ext```
|
||||
|
||||
The extension signals which data structure (or [`SegmentComponent`](src/core/segment_component.rs)) is stored in the file.
|
||||
|
||||
@@ -52,17 +54,15 @@ On commit, one segment per indexing thread is written to disk, and the `meta.jso
|
||||
|
||||
For a better idea of how indexing works, you may read the [following blog post](https://fulmicoton.com/posts/behold-tantivy-part2/).
|
||||
|
||||
|
||||
### Deletes
|
||||
|
||||
Deletes happen by deleting a "term". Tantivy does not offer any notion of primary id, so it is up to the user to use a field in their schema as if it was a primary id, and delete the associated term if they want to delete only one specific document.
|
||||
|
||||
On commit, tantivy will find all of the segments with documents matching this existing term and create a [tombstone file](src/fastfield/delete.rs) that represents the bitset of the document that are deleted.
|
||||
Like all segment files, this file is immutable. Because it is possible to have more than one tombstone file at a given instant, the tombstone filename has the format ``` segment_id . commit_opstamp . del```.
|
||||
On commit, tantivy will find all of the segments with documents matching this existing term and remove from [alive bitset file](src/fastfield/alive_bitset.rs) that represents the bitset of the alive document ids.
|
||||
Like all segment files, this file is immutable. Because it is possible to have more than one alive bitset file at a given instant, the alive bitset filename has the format ```segment_id . commit_opstamp . del```.
|
||||
|
||||
An opstamp is simply an incremental id that identifies any operation applied to the index. For instance, performing a commit or adding a document.
|
||||
|
||||
|
||||
### DocId
|
||||
|
||||
Within a segment, all documents are identified by a DocId that ranges within `[0, max_doc)`.
|
||||
@@ -74,6 +74,7 @@ The DocIds are simply allocated in the order documents are added to the index.
|
||||
|
||||
In separate threads, tantivy's index writer search for opportunities to merge segments.
|
||||
The point of segment merge is to:
|
||||
|
||||
- eventually get rid of tombstoned documents
|
||||
- reduce the otherwise ever-growing number of segments.
|
||||
|
||||
@@ -104,6 +105,7 @@ Tantivy's document follows a very strict schema, decided before building any ind
|
||||
The schema defines all of the fields that the indexes [`Document`](src/schema/document.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
|
||||
|
||||
Depending on the type of the field, you can decide to
|
||||
|
||||
- put it in the docstore
|
||||
- store it as a fast field
|
||||
- index it
|
||||
@@ -117,9 +119,10 @@ As of today, tantivy's schema imposes a 1:1 relationship between a field that is
|
||||
|
||||
This is not something tantivy supports, and it is up to the user to duplicate field / concatenate fields before feeding them to tantivy.
|
||||
|
||||
## General information about these data structures.
|
||||
## General information about these data structures
|
||||
|
||||
All data structures in tantivy, have:
|
||||
|
||||
- a writer
|
||||
- a serializer
|
||||
- a reader
|
||||
@@ -132,7 +135,7 @@ This conversion is done by the serializer.
|
||||
Finally, the reader is in charge of offering an API to read on this on-disk read-only representation.
|
||||
In tantivy, readers are designed to require very little anonymous memory. The data is read straight from an mmapped file, and loading an index is as fast as mmapping its files.
|
||||
|
||||
## [store/](src/store): Here is my DocId, Gimme my document!
|
||||
## [store/](src/store): Here is my DocId, Gimme my document
|
||||
|
||||
The docstore is a row-oriented storage that, for each document, stores a subset of the fields
|
||||
that are marked as stored in the schema. The docstore is compressed using a general-purpose algorithm
|
||||
@@ -146,6 +149,7 @@ Once the top 10 documents have been identified, we fetch them from the store, an
|
||||
**Not useful for**
|
||||
|
||||
Fetching a document from the store is typically a "slow" operation. It usually consists in
|
||||
|
||||
- searching into a compact tree-like data structure to find the position of the right block.
|
||||
- decompressing a small block
|
||||
- returning the document from this block.
|
||||
@@ -154,8 +158,7 @@ It is NOT meant to be called for every document matching a query.
|
||||
|
||||
As a rule of thumb, if you hit the docstore more than 100 times per search query, you are probably misusing tantivy.
|
||||
|
||||
|
||||
## [fastfield/](src/fastfield): Here is my DocId, Gimme my value!
|
||||
## [fastfield/](src/fastfield): Here is my DocId, Gimme my value
|
||||
|
||||
Fast fields are stored in a column-oriented storage that allows for random access.
|
||||
The only compression applied is bitpacking. The column comes with two meta data.
|
||||
@@ -163,7 +166,7 @@ The minimum value in the column and the number of bits per doc.
|
||||
|
||||
Fetching a value for a `DocId` is then as simple as computing
|
||||
|
||||
```
|
||||
```rust
|
||||
min_value + fetch_bits(num_bits * doc_id..num_bits * (doc_id+1))
|
||||
```
|
||||
|
||||
@@ -190,7 +193,7 @@ For advanced search engine, it is possible to store all of the features required
|
||||
|
||||
Finally facets are a specific kind of fast field, and the associated source code is in [`fastfield/facet_reader.rs`](src/fastfield/facet_reader.rs).
|
||||
|
||||
# The inverted search index.
|
||||
# The inverted search index
|
||||
|
||||
The inverted index is the core part of full-text search.
|
||||
When presented a new document with the text field "Hello, happy tax payer!", tantivy breaks it into a list of so-called tokens. In addition to just splitting these strings into tokens, it might also do different kinds of operations like dropping the punctuation, converting the character to lowercase, apply stemming, etc. Tantivy makes it possible to configure the operations to be applied in the schema (tokenizer/ is the place where these operations are implemented).
|
||||
@@ -215,19 +218,18 @@ The inverted index actually consists of two data structures chained together.
|
||||
|
||||
Where [TermInfo](src/postings/term_info.rs) is an object containing some meta data about a term.
|
||||
|
||||
|
||||
## [termdict/](src/termdict): Here is a term, give me the [TermInfo](src/postings/term_info.rs)!
|
||||
## [termdict/](src/termdict): Here is a term, give me the [TermInfo](src/postings/term_info.rs)
|
||||
|
||||
Tantivy's term dictionary is mainly in charge of supplying the function
|
||||
|
||||
[Term](src/schema/term.rs) ⟶ [TermInfo](src/postings/term_info.rs)
|
||||
|
||||
It is itself broken into two parts.
|
||||
|
||||
- [Term](src/schema/term.rs) ⟶ [TermOrdinal](src/termdict/mod.rs) is addressed by a finite state transducer, implemented by the fst crate.
|
||||
- [TermOrdinal](src/termdict/mod.rs) ⟶ [TermInfo](src/postings/term_info.rs) is addressed by the term info store.
|
||||
|
||||
|
||||
## [postings/](src/postings): Iterate over documents... very fast!
|
||||
## [postings/](src/postings): Iterate over documents... very fast
|
||||
|
||||
A posting list makes it possible to store a sorted list of doc ids and for each doc store
|
||||
a term frequency as well.
|
||||
@@ -249,7 +251,7 @@ For instance, when the phrase query "the art of war" does not match "the war of
|
||||
To make it possible, it is possible to specify in the schema that a field should store positions in addition to being indexed.
|
||||
|
||||
The token positions of all of the terms are then stored in a separate file with the extension `.pos`.
|
||||
The [TermInfo](src/postings/term_info.rs) gives an offset (expressed in position this time) in this file. As we iterate throught the docset,
|
||||
The [TermInfo](src/postings/term_info.rs) gives an offset (expressed in position this time) in this file. As we iterate through the docset,
|
||||
we advance the position reader by the number of term frequencies of the current document.
|
||||
|
||||
## [fieldnorms/](src/fieldnorms): Here is my doc, how many tokens in this field?
|
||||
@@ -257,7 +259,6 @@ we advance the position reader by the number of term frequencies of the current
|
||||
The [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) formula also requires to know the number of tokens stored in a specific field for a given document. We store this information on one byte per document in the fieldnorm.
|
||||
The fieldnorm is therefore compressed. Values up to 40 are encoded unchanged.
|
||||
|
||||
|
||||
## [tokenizer/](src/tokenizer): How should we process text?
|
||||
|
||||
Text processing is key to a good search experience.
|
||||
@@ -268,7 +269,6 @@ Text processing can be configured by selecting an off-the-shelf [`Tokenizer`](./
|
||||
|
||||
Tantivy's comes with few tokenizers, but external crates are offering advanced tokenizers, such as [Lindera](https://crates.io/crates/lindera) for Japanese.
|
||||
|
||||
|
||||
## [query/](src/query): Define and compose queries
|
||||
|
||||
The [Query](src/query/query.rs) trait defines what a query is.
|
||||
|
||||
161
CHANGELOG.md
@@ -1,18 +1,77 @@
|
||||
Tantivy 0.19
|
||||
================================
|
||||
|
||||
- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396)
|
||||
The `DateTime` type has been updated to hold timestamps with microseconds precision.
|
||||
`DateOptions` and `DatePrecision` have been added to configure Date fields. The precision is used to hint on fast values compression. Otherwise, seconds precision is used everywhere else (i.e terms, indexing).
|
||||
- Remove Searcher pool and make `Searcher` cloneable.
|
||||
|
||||
Tantivy 0.18
|
||||
================================
|
||||
|
||||
- For date values `chrono` has been replaced with `time` (@uklotzde) #1304 :
|
||||
- The `time` crate is re-exported as `tantivy::time` instead of `tantivy::chrono`.
|
||||
- The type alias `tantivy::DateTime` has been removed.
|
||||
- `Value::Date` wraps `time::PrimitiveDateTime` without time zone information.
|
||||
- Internally date/time values are stored as seconds since UNIX epoch in UTC.
|
||||
- Converting a `time::OffsetDateTime` to `Value::Date` implicitly converts the value into UTC.
|
||||
If this is not desired do the time zone conversion yourself and use `time::PrimitiveDateTime`
|
||||
directly instead.
|
||||
- Add [histogram](https://github.com/quickwit-oss/tantivy/pull/1306) aggregation (@PSeitz)
|
||||
- Add support for fastfield on text fields (@PSeitz)
|
||||
- Add terms aggregation (@PSeitz)
|
||||
- Add support for zstd compression (@kryesh)
|
||||
|
||||
Tantivy 0.17
|
||||
================================
|
||||
|
||||
- LogMergePolicy now triggers merges if the ratio of deleted documents reaches a threshold (@shikhar @fulmicoton) [#115](https://github.com/quickwit-oss/tantivy/issues/115)
|
||||
- Adds a searcher Warmer API (@shikhar @fulmicoton)
|
||||
- Change to non-strict schema. Ignore fields in data which are not defined in schema. Previously this returned an error. #1211
|
||||
- Facets are necessarily indexed. Existing index with indexed facets should work out of the box. Index without facets that are marked with index: false should be broken (but they were already broken in a sense). (@fulmicoton) #1195 .
|
||||
- Bugfix that could in theory impact durability in theory on some filesystems [#1224](https://github.com/quickwit-oss/tantivy/issues/1224)
|
||||
- Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-oss/tantivy/issues/922)
|
||||
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-oss/tantivy/issues/1225)
|
||||
- Fix opening bytes index with dynamic codec (@PSeitz) [#1278](https://github.com/quickwit-oss/tantivy/issues/1278)
|
||||
- Added an aggregation collector for range, average and stats compatible with Elasticsearch. (@PSeitz)
|
||||
- Added a JSON schema type @fulmicoton [#1251](https://github.com/quickwit-oss/tantivy/issues/1251)
|
||||
- Added support for slop in phrase queries @halvorboe [#1068](https://github.com/quickwit-oss/tantivy/issues/1068)
|
||||
|
||||
Tantivy 0.16.2
|
||||
================================
|
||||
|
||||
- Bugfix in FuzzyTermQuery. (transposition_cost_one was not doing anything)
|
||||
|
||||
Tantivy 0.16.1
|
||||
========================
|
||||
|
||||
- Major Bugfix on multivalued fastfield. #1151
|
||||
- Demux operation (@PSeitz)
|
||||
|
||||
Tantivy 0.16.0
|
||||
=========================
|
||||
|
||||
- Bugfix in the filesum check. (@evanxg852000) #1127
|
||||
- Bugfix in positions when the index is sorted by a field. (@appaquet) #1125
|
||||
|
||||
Tantivy 0.15.3
|
||||
=========================
|
||||
- Major bugfix. Deleting documents was broken when the index was sorted by a field. (@appaquet, @fulmicoton) #1101
|
||||
|
||||
- Major bugfix. Deleting documents was broken when the index was sorted by a field. (@appaquet, @fulmicoton) #1101
|
||||
|
||||
Tantivy 0.15.2
|
||||
========================
|
||||
|
||||
- Major bugfix. DocStore still panics when a deleted doc is at the beginning of a block. (@appaquet) #1088
|
||||
|
||||
Tantivy 0.15.1
|
||||
=========================
|
||||
|
||||
- Major bugfix. DocStore panics when first block is deleted. (@appaquet) #1077
|
||||
|
||||
Tantivy 0.15.0
|
||||
=========================
|
||||
|
||||
- API Changes. Using Range instead of (start, end) in the API and internals (`FileSlice`, `OwnedBytes`, `Snippets`, ...)
|
||||
This change is breaking but migration is trivial.
|
||||
- Added an Histogram collector. (@fulmicoton) #994
|
||||
@@ -34,9 +93,9 @@ Tantivy 0.15.0
|
||||
- Updated TermMerger implementation to rely on the union feature of the FST (@scampi) #469
|
||||
- Add boolean marking whether position is required in the query_terms API call (@fulmicoton). #1070
|
||||
|
||||
|
||||
Tantivy 0.14.0
|
||||
=========================
|
||||
|
||||
- Remove dependency to atomicwrites #833 .Implemented by @fulmicoton upon suggestion and research from @asafigan).
|
||||
- Migrated tantivy error from the now deprecated `failure` crate to `thiserror` #760. (@hirevo)
|
||||
- API Change. Accessing the typed value off a `Schema::Value` now returns an Option instead of panicking if the type does not match.
|
||||
@@ -55,16 +114,19 @@ This version breaks compatibility and requires users to reindex everything.
|
||||
|
||||
Tantivy 0.13.2
|
||||
===================
|
||||
|
||||
Bugfix. Acquiring a facet reader on a segment that does not contain any
|
||||
doc with this facet returns `None`. (#896)
|
||||
|
||||
Tantivy 0.13.1
|
||||
===================
|
||||
|
||||
Made `Query` and `Collector` `Send + Sync`.
|
||||
Updated misc dependency versions.
|
||||
|
||||
Tantivy 0.13.0
|
||||
======================
|
||||
|
||||
Tantivy 0.13 introduce a change in the index format that will require
|
||||
you to reindex your index (BlockWAND information are added in the skiplist).
|
||||
The index size increase is minor as this information is only added for
|
||||
@@ -79,6 +141,7 @@ so that we can discuss possible solutions.
|
||||
A freshly created DocSet point directly to their first doc. A sentinel value called TERMINATED marks the end of a DocSet.
|
||||
`.advance()` returns the new DocId. `Scorer::skip(target)` has been replaced by `Scorer::seek(target)` and returns the resulting DocId.
|
||||
As a result, iterating through DocSet now looks as follows
|
||||
|
||||
```rust
|
||||
let mut doc = docset.doc();
|
||||
while doc != TERMINATED {
|
||||
@@ -86,7 +149,9 @@ while doc != TERMINATED {
|
||||
doc = docset.advance();
|
||||
}
|
||||
```
|
||||
|
||||
The change made it possible to greatly simplify a lot of the docset's code.
|
||||
|
||||
- Misc internal optimization and introduction of the `Scorer::for_each_pruning` function. (@fulmicoton)
|
||||
- Added an offset option to the Top(.*)Collectors. (@robyoung)
|
||||
- Added Block WAND. Performance on TOP-K on term-unions should be greatly increased. (@fulmicoton, and special thanks
|
||||
@@ -94,6 +159,7 @@ to the PISA team for answering all my questions!)
|
||||
|
||||
Tantivy 0.12.0
|
||||
======================
|
||||
|
||||
- Removing static dispatch in tokenizers for simplicity. (#762)
|
||||
- Added backward iteration for `TermDictionary` stream. (@halvorboe)
|
||||
- Fixed a performance issue when searching for the posting lists of a missing term (@audunhalland)
|
||||
@@ -104,30 +170,32 @@ Tantivy 0.12.0
|
||||
## How to update?
|
||||
|
||||
Crates relying on custom tokenizer, or registering tokenizer in the manager will require some
|
||||
minor changes. Check https://github.com/tantivy-search/tantivy/blob/main/examples/custom_tokenizer.rs
|
||||
minor changes. Check <https://github.com/quickwit-oss/tantivy/blob/main/examples/custom_tokenizer.rs>
|
||||
to check for some code sample.
|
||||
|
||||
Tantivy 0.11.3
|
||||
=======================
|
||||
|
||||
- Fixed DateTime as a fast field (#735)
|
||||
|
||||
Tantivy 0.11.2
|
||||
=======================
|
||||
|
||||
- The future returned by `IndexWriter::merge` does not borrow `self` mutably anymore (#732)
|
||||
- Exposing a constructor for `WatchHandle` (#731)
|
||||
|
||||
Tantivy 0.11.1
|
||||
=====================
|
||||
- Bug fix #729
|
||||
|
||||
- Bug fix #729
|
||||
|
||||
Tantivy 0.11.0
|
||||
=====================
|
||||
|
||||
- Added f64 field. Internally reuse u64 code the same way i64 does (@fdb-hiroshima)
|
||||
- Various bugfixes in the query parser.
|
||||
- Better handling of hyphens in query parser. (#609)
|
||||
- Better handling of whitespaces.
|
||||
- Better handling of hyphens in query parser. (#609)
|
||||
- Better handling of whitespaces.
|
||||
- Closes #498 - add support for Elastic-style unbounded range queries for alphanumeric types eg. "title:>hello", "weight:>=70.5", "height:<200" (@petr-tik)
|
||||
- API change around `Box<BoxableTokenizer>`. See detail in #629
|
||||
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
|
||||
@@ -158,7 +226,6 @@ Tantivy 0.10.1
|
||||
Avoid watching the mmap directory until someone effectively creates a reader that uses
|
||||
this functionality.
|
||||
|
||||
|
||||
Tantivy 0.10.0
|
||||
=====================
|
||||
|
||||
@@ -174,6 +241,7 @@ Tantivy 0.10.0
|
||||
|
||||
Minor
|
||||
---------
|
||||
|
||||
- Switched to Rust 2018 (@uvd)
|
||||
- Small simplification of the code.
|
||||
Calling .freq() or .doc() when .advance() has never been called
|
||||
@@ -181,8 +249,7 @@ on segment postings should panic from now on.
|
||||
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
|
||||
- Fast fields are now preloaded when the `SegmentReader` is created.
|
||||
- `IndexMeta` is now public. (@hntd187)
|
||||
- `IndexWriter` `add_document`, `delete_term`. `IndexWriter` is `Sync`, making it possible to use it with a `
|
||||
Arc<RwLock<IndexWriter>>`. `add_document` and `delete_term` can
|
||||
- `IndexWriter` `add_document`, `delete_term`. `IndexWriter` is `Sync`, making it possible to use it with a `Arc<RwLock<IndexWriter>>`. `add_document` and `delete_term` can
|
||||
only require a read lock. (@fulmicoton)
|
||||
- Introducing `Opstamp` as an expressive type alias for `u64`. (@petr-tik)
|
||||
- Stamper now relies on `AtomicU64` on all platforms (@petr-tik)
|
||||
@@ -198,16 +265,17 @@ Your program should be usable as is.
|
||||
Fast fields used to be accessed directly from the `SegmentReader`.
|
||||
The API changed, you are now required to acquire your fast field reader via the
|
||||
`segment_reader.fast_fields()`, and use one of the typed method:
|
||||
|
||||
- `.u64()`, `.i64()` if your field is single-valued ;
|
||||
- `.u64s()`, `.i64s()` if your field is multi-valued ;
|
||||
- `.bytes()` if your field is bytes fast field.
|
||||
|
||||
|
||||
|
||||
Tantivy 0.9.0
|
||||
=====================
|
||||
|
||||
*0.9.0 index format is not compatible with the
|
||||
previous index format.*
|
||||
|
||||
- MAJOR BUGFIX :
|
||||
Some `Mmap` objects were being leaked, and would never get released. (@fulmicoton)
|
||||
- Removed most unsafe (@fulmicoton)
|
||||
@@ -251,37 +319,40 @@ To update from tantivy 0.8, you will need to go through the following steps.
|
||||
|
||||
```
|
||||
|
||||
|
||||
Tantivy 0.8.2
|
||||
=====================
|
||||
|
||||
Fixing build for x86_64 platforms. (#496)
|
||||
No need to update from 0.8.1 if tantivy
|
||||
is building on your platform.
|
||||
|
||||
|
||||
Tantivy 0.8.1
|
||||
=====================
|
||||
|
||||
Hotfix of #476.
|
||||
|
||||
Merge was reflecting deletes before commit was passed.
|
||||
Thanks @barrotsteindev for reporting the bug.
|
||||
|
||||
|
||||
Tantivy 0.8.0
|
||||
=====================
|
||||
|
||||
*No change in the index format*
|
||||
|
||||
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)
|
||||
- Multithreaded search (@jwolfe, @fulmicoton)
|
||||
|
||||
|
||||
Tantivy 0.7.1
|
||||
=====================
|
||||
|
||||
*No change in the index format*
|
||||
|
||||
- Bugfix: NGramTokenizer panics on non ascii chars
|
||||
- Added a space usage API
|
||||
|
||||
Tantivy 0.7
|
||||
=====================
|
||||
|
||||
- Skip data for doc ids and positions (@fulmicoton),
|
||||
greatly improving performance
|
||||
- Tantivy error now rely on the failure crate (@drusellers)
|
||||
@@ -291,15 +362,15 @@ Tantivy 0.7
|
||||
|
||||
Tantivy 0.6.1
|
||||
=========================
|
||||
|
||||
- Bugfix #324. GC removing was removing file that were still in useful
|
||||
- Added support for parsing AllQuery and RangeQuery via QueryParser
|
||||
- AllQuery: `*`
|
||||
- RangeQuery:
|
||||
- Inclusive `field:[startIncl to endIncl]`
|
||||
- Exclusive `field:{startExcl to endExcl}`
|
||||
- Mixed `field:[startIncl to endExcl}` and vice versa
|
||||
- Unbounded `field:[start to *]`, `field:[* to end]`
|
||||
|
||||
- AllQuery: `*`
|
||||
- RangeQuery:
|
||||
- Inclusive `field:[startIncl to endIncl]`
|
||||
- Exclusive `field:{startExcl to endExcl}`
|
||||
- Mixed `field:[startIncl to endExcl}` and vice versa
|
||||
- Unbounded `field:[start to *]`, `field:[* to end]`
|
||||
|
||||
Tantivy 0.6
|
||||
==========================
|
||||
@@ -312,58 +383,53 @@ to this release!
|
||||
- Approximate field norms encoded over 1 byte. (@fulmicoton)
|
||||
- Compiles on stable rust (@fulmicoton)
|
||||
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
|
||||
- Completely uncompressed
|
||||
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
|
||||
- Completely uncompressed
|
||||
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
|
||||
- Add NGram token support (@drusellers)
|
||||
- Add Stopword Filter support (@drusellers)
|
||||
- Add a FuzzyTermQuery (@drusellers)
|
||||
- Add a RegexQuery (@drusellers)
|
||||
- Various performance improvements (@fulmicoton)_
|
||||
|
||||
|
||||
Tantivy 0.5.2
|
||||
===========================
|
||||
|
||||
- bugfix #274
|
||||
- bugfix #280
|
||||
- bugfix #289
|
||||
|
||||
|
||||
Tantivy 0.5.1
|
||||
==========================
|
||||
- bugfix #254 : tantivy failed if no documents in a segment contained a specific field.
|
||||
|
||||
- bugfix #254 : tantivy failed if no documents in a segment contained a specific field.
|
||||
|
||||
Tantivy 0.5
|
||||
==========================
|
||||
|
||||
- Faceting
|
||||
- RangeQuery
|
||||
- Configurable tokenization pipeline
|
||||
- Bugfix in PhraseQuery
|
||||
- Various query optimisation
|
||||
- Allowing very large indexes
|
||||
- 64 bits file address
|
||||
- Smarter encoding of the `TermInfo` objects
|
||||
|
||||
|
||||
- 64 bits file address
|
||||
- Smarter encoding of the `TermInfo` objects
|
||||
|
||||
Tantivy 0.4.3
|
||||
==========================
|
||||
|
||||
- Bugfix race condition when deleting files. (#198)
|
||||
|
||||
|
||||
Tantivy 0.4.2
|
||||
==========================
|
||||
|
||||
- Prevent usage of AVX2 instructions (#201)
|
||||
|
||||
|
||||
Tantivy 0.4.1
|
||||
==========================
|
||||
|
||||
- Bugfix for non-indexed fields. (#199)
|
||||
|
||||
|
||||
Tantivy 0.4.0
|
||||
==========================
|
||||
|
||||
@@ -378,37 +444,31 @@ Tantivy 0.4.0
|
||||
- Searching for a non-indexed field returns an explicit Error
|
||||
- Phrase query for non-tokenized field are not tokenized by the query parser.
|
||||
- Faster/Better indexing (@fulmicoton)
|
||||
- using murmurhash2
|
||||
- faster merging
|
||||
- more memory efficient fast field writer (@lnicola )
|
||||
- better handling of collisions
|
||||
- lesser memory usage
|
||||
- using murmurhash2
|
||||
- faster merging
|
||||
- more memory efficient fast field writer (@lnicola )
|
||||
- better handling of collisions
|
||||
- lesser memory usage
|
||||
- Added API, most notably to iterate over ranges of terms (@fulmicoton)
|
||||
- Bugfix that was preventing to unmap segment files, on index drop (@fulmicoton)
|
||||
- Made the doc! macro public (@fulmicoton)
|
||||
- Added an alternative implementation of the streaming dictionary (@fulmicoton)
|
||||
|
||||
|
||||
|
||||
Tantivy 0.3.1
|
||||
==========================
|
||||
|
||||
- Expose a method to trigger files garbage collection
|
||||
|
||||
|
||||
|
||||
Tantivy 0.3
|
||||
==========================
|
||||
|
||||
|
||||
Special thanks to @Kodraus @lnicola @Ameobea @manuel-woelker @celaus
|
||||
for their contribution to this release.
|
||||
|
||||
Thanks also to everyone in tantivy gitter chat
|
||||
for their advise and company :)
|
||||
|
||||
https://gitter.im/tantivy-search/tantivy
|
||||
|
||||
<https://gitter.im/tantivy-search/tantivy>
|
||||
|
||||
Warning:
|
||||
|
||||
@@ -417,19 +477,16 @@ code and index format.
|
||||
You should not expect backward compatibility before
|
||||
tantivy 1.0.
|
||||
|
||||
|
||||
|
||||
New Features
|
||||
------------
|
||||
|
||||
- Delete. You can now delete documents from an index.
|
||||
- Support for windows (Thanks to @lnicola)
|
||||
|
||||
|
||||
Various Bugfixes & small improvements
|
||||
----------------------------------------
|
||||
|
||||
- Added CI for Windows (https://ci.appveyor.com/project/fulmicoton/tantivy)
|
||||
- Added CI for Windows (<https://ci.appveyor.com/project/fulmicoton/tantivy>)
|
||||
Thanks to @KodrAus ! (#108)
|
||||
- Various dependy version update (Thanks to @Ameobea) #76
|
||||
- Fixed several race conditions in `Index.wait_merge_threads`
|
||||
@@ -441,7 +498,3 @@ Thanks to @KodrAus ! (#108)
|
||||
- Building binary targets for tantivy-cli (Thanks to @KodrAus)
|
||||
- Misc invisible bug fixes, and code cleanup.
|
||||
- Use
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
113
Cargo.toml
@@ -1,73 +1,82 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.16.0-dev"
|
||||
version = "0.18.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
description = """Search engine library"""
|
||||
documentation = "https://docs.rs/tantivy/"
|
||||
homepage = "https://github.com/tantivy-search/tantivy"
|
||||
repository = "https://github.com/tantivy-search/tantivy"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2018"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.13"
|
||||
oneshot = "0.1.3"
|
||||
base64 = "0.13.0"
|
||||
byteorder = "1.4.3"
|
||||
crc32fast = "1.2.1"
|
||||
once_cell = "1.7.2"
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
|
||||
tantivy-fst = "0.3"
|
||||
memmap = {version = "0.7", optional=true}
|
||||
lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
|
||||
brotli = { version = "3.3", optional = true }
|
||||
crc32fast = "1.3.2"
|
||||
once_cell = "1.10.0"
|
||||
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
|
||||
tantivy-fst = "0.3.0"
|
||||
memmap2 = { version = "0.5.3", optional = true }
|
||||
lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
|
||||
brotli = { version = "3.3.4", optional = true }
|
||||
zstd = { version = "0.11", optional = true }
|
||||
snap = { version = "1.0.5", optional = true }
|
||||
tempfile = { version = "3.2", optional = true }
|
||||
log = "0.4.14"
|
||||
serde = { version = "1.0.126", features = ["derive"] }
|
||||
serde_json = "1.0.64"
|
||||
num_cpus = "1.13"
|
||||
tempfile = { version = "3.3.0", optional = true }
|
||||
log = "0.4.16"
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_json = "1.0.79"
|
||||
num_cpus = "1.13.1"
|
||||
fs2={ version = "0.4.3", optional = true }
|
||||
levenshtein_automata = "0.2"
|
||||
uuid = { version = "0.8.2", features = ["v4", "serde"] }
|
||||
crossbeam = "0.8"
|
||||
futures = { version = "0.3.15", features = ["thread-pool"] }
|
||||
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
|
||||
common = { version="0.1", path="./common" }
|
||||
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
|
||||
ownedbytes = { version="0.1", path="./ownedbytes" }
|
||||
stable_deref_trait = "1.2"
|
||||
rust-stemmers = "1.2"
|
||||
downcast-rs = "1.2"
|
||||
levenshtein_automata = "0.2.1"
|
||||
uuid = { version = "1.0.0", features = ["v4", "serde"] }
|
||||
crossbeam-channel = "0.5.4"
|
||||
tantivy-query-grammar = { version="0.18.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.2", path="./bitpacker" }
|
||||
common = { version = "0.3", path = "./common/", package = "tantivy-common" }
|
||||
fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false }
|
||||
ownedbytes = { version="0.3", path="./ownedbytes" }
|
||||
stable_deref_trait = "1.2.0"
|
||||
rust-stemmers = "1.2.0"
|
||||
downcast-rs = "1.2.0"
|
||||
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
|
||||
census = "0.4"
|
||||
census = "0.4.0"
|
||||
fnv = "1.0.7"
|
||||
thiserror = "1.0.24"
|
||||
thiserror = "1.0.30"
|
||||
htmlescape = "0.3.1"
|
||||
fail = "0.4"
|
||||
murmurhash32 = "0.2"
|
||||
chrono = "0.4.19"
|
||||
smallvec = "1.6.1"
|
||||
rayon = "1.5"
|
||||
lru = "0.6.5"
|
||||
fastdivide = "0.3"
|
||||
itertools = "0.10.0"
|
||||
measure_time = "0.7.0"
|
||||
fail = "0.5.0"
|
||||
murmurhash32 = "0.2.0"
|
||||
time = { version = "0.3.10", features = ["serde-well-known"] }
|
||||
smallvec = "1.8.0"
|
||||
rayon = "1.5.2"
|
||||
lru = "0.7.5"
|
||||
fastdivide = "0.4.0"
|
||||
itertools = "0.10.3"
|
||||
measure_time = "0.8.2"
|
||||
pretty_assertions = "1.2.1"
|
||||
serde_cbor = { version = "0.11.2", optional = true }
|
||||
async-trait = "0.1.53"
|
||||
arc-swap = "1.5.0"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.3.9"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8.3"
|
||||
rand = "0.8.5"
|
||||
maplit = "1.0.2"
|
||||
matches = "0.1.8"
|
||||
proptest = "1.0"
|
||||
criterion = "0.3.4"
|
||||
matches = "0.1.9"
|
||||
proptest = "1.0.0"
|
||||
criterion = "0.3.5"
|
||||
test-log = "0.2.10"
|
||||
env_logger = "0.9.0"
|
||||
pprof = { version = "0.10.0", features = ["flamegraph", "criterion"] }
|
||||
futures = "0.3.21"
|
||||
|
||||
[dev-dependencies.fail]
|
||||
version = "0.4"
|
||||
version = "0.5.0"
|
||||
features = ["failpoints"]
|
||||
|
||||
[profile.release]
|
||||
@@ -81,22 +90,21 @@ overflow-checks = true
|
||||
|
||||
[features]
|
||||
default = ["mmap", "lz4-compression" ]
|
||||
mmap = ["fs2", "tempfile", "memmap"]
|
||||
mmap = ["fs2", "tempfile", "memmap2"]
|
||||
|
||||
brotli-compression = ["brotli"]
|
||||
lz4-compression = ["lz4_flex"]
|
||||
snappy-compression = ["snap"]
|
||||
zstd-compression = ["zstd"]
|
||||
|
||||
failpoints = ["fail/failpoints"]
|
||||
unstable = [] # useful for benches.
|
||||
wasm-bindgen = ["uuid/wasm-bindgen"]
|
||||
|
||||
quickwit = ["serde_cbor"]
|
||||
|
||||
[workspace]
|
||||
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
# Following the "fail" crate best practises, we isolate
|
||||
# tests that define specific behavior in fail check points
|
||||
# in a different binary.
|
||||
@@ -112,3 +120,8 @@ required-features = ["fail/failpoints"]
|
||||
[[bench]]
|
||||
name = "analyzer"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "index-bench"
|
||||
harness = false
|
||||
|
||||
|
||||
3
Makefile
@@ -1,3 +1,6 @@
|
||||
test:
|
||||
echo "Run test only... No examples."
|
||||
cargo test --tests --lib
|
||||
|
||||
fmt:
|
||||
cargo +nightly fmt --all
|
||||
|
||||
102
README.md
@@ -1,26 +1,13 @@
|
||||
|
||||
[](https://travis-ci.org/tantivy-search/tantivy)
|
||||
[](https://codecov.io/gh/tantivy-search/tantivy)
|
||||
[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
[](https://docs.rs/crate/tantivy/)
|
||||
[](https://github.com/quickwit-oss/tantivy/actions/workflows/test.yml)
|
||||
[](https://codecov.io/gh/quickwit-oss/tantivy)
|
||||
[](https://discord.gg/MT27AG5EVE)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/main)
|
||||
[](https://crates.io/crates/tantivy)
|
||||
|
||||

|
||||
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/0)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/1)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/2)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/3)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/4)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/5)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
|
||||
|
||||
[](https://www.patreon.com/fulmicoton)
|
||||
|
||||
|
||||
**Tantivy** is a **full text search engine library** written in Rust.
|
||||
**Tantivy** is a **full-text search engine library** written in Rust.
|
||||
|
||||
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
|
||||
an off-the-shelf search engine server, but rather a crate that can be used
|
||||
@@ -28,19 +15,23 @@ to build such a search engine.
|
||||
|
||||
Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
|
||||
If you are looking for an alternative to Elasticsearch or Apache Solr, check out [Quickwit](https://github.com/quickwit-oss/quickwit), our search engine built on top of Tantivy.
|
||||
|
||||
# Benchmark
|
||||
|
||||
The following [benchmark](https://tantivy-search.github.io/bench/) break downs
|
||||
performance for different type of queries / collection.
|
||||
The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
|
||||
performance for different types of queries/collections.
|
||||
|
||||
Your mileage WILL vary depending on the nature of queries and their load.
|
||||
|
||||
<img src="doc/assets/images/searchbenchmark.png">
|
||||
|
||||
# Features
|
||||
|
||||
- Full-text search
|
||||
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy) and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
|
||||
- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
|
||||
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
|
||||
- Tiny startup time (<10ms), perfect for command line tools
|
||||
- Tiny startup time (<10ms), perfect for command-line tools
|
||||
- BM25 scoring (the same as Lucene)
|
||||
- Natural query language (e.g. `(michael AND jackson) OR "king of pop"`)
|
||||
- Phrase queries search (e.g. `"michael jackson"`)
|
||||
@@ -55,36 +46,36 @@ Your mileage WILL vary depending on the nature of queries and their load.
|
||||
- Range queries
|
||||
- Faceted search
|
||||
- Configurable indexing (optional term frequency and position indexing)
|
||||
- JSON Field
|
||||
- Aggregation Collector: range buckets, average, and stats metrics
|
||||
- LogMergePolicy with deletes
|
||||
- Searcher Warmer API
|
||||
- Cheesy logo with a horse
|
||||
|
||||
## Non-features
|
||||
|
||||
- Distributed search is out of the scope of Tantivy. That being said, Tantivy is a
|
||||
library upon which one could build a distributed search. Serializable/mergeable collector state for instance,
|
||||
are within the scope of Tantivy.
|
||||
|
||||
Distributed search is out of the scope of Tantivy, but if you are looking for this feature, check out [Quickwit](https://github.com/quickwit-oss/quickwit/).
|
||||
|
||||
# Getting started
|
||||
|
||||
Tantivy works on stable Rust (>= 1.27) and supports Linux, MacOS, and Windows.
|
||||
Tantivy works on stable Rust (>= 1.27) and supports Linux, macOS, and Windows.
|
||||
|
||||
- [Tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
|
||||
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli) - `tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
|
||||
- [tantivy-cli and its tutorial](https://github.com/quickwit-oss/tantivy-cli) - `tantivy-cli` is an actual command-line interface that makes it easy for you to create a search engine,
|
||||
index documents, and search via the CLI or a small server with a REST API.
|
||||
It walks you through getting a wikipedia search engine up and running in a few minutes.
|
||||
It walks you through getting a Wikipedia search engine up and running in a few minutes.
|
||||
- [Reference doc for the last released version](https://docs.rs/tantivy/)
|
||||
|
||||
# How can I support this project?
|
||||
|
||||
There are many ways to support this project.
|
||||
There are many ways to support this project.
|
||||
|
||||
- Use Tantivy and tell us about your experience on [Gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
|
||||
- Use Tantivy and tell us about your experience on [Discord](https://discord.gg/MT27AG5EVE) or by email (paul.masurel@gmail.com)
|
||||
- Report bugs
|
||||
- Write a blog post
|
||||
- Help with documentation by asking questions or submitting PRs
|
||||
- Contribute code (you can join [our Gitter](https://gitter.im/tantivy-search/tantivy))
|
||||
- Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE))
|
||||
- Talk about Tantivy around you
|
||||
- [](https://www.patreon.com/fulmicoton)
|
||||
|
||||
# Contributing code
|
||||
|
||||
@@ -96,7 +87,7 @@ Tantivy compiles on stable Rust but requires `Rust >= 1.27`.
|
||||
To check out and run tests, you can simply run:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/tantivy-search/tantivy.git
|
||||
git clone https://github.com/quickwit-oss/tantivy.git
|
||||
cd tantivy
|
||||
cargo build
|
||||
```
|
||||
@@ -132,3 +123,46 @@ By default, `rustc` compiles everything in the `examples/` directory in debug mo
|
||||
rust-gdb target/debug/examples/$EXAMPLE_NAME
|
||||
$ gdb run
|
||||
```
|
||||
|
||||
# Companies Using Tantivy
|
||||
|
||||
<p align="left">
|
||||
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
|
||||
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/nuclia-dark-theme.png#gh-dark-mode-only" alt="Nuclia" height="35" width="auto" />
|
||||
<img align="center" src="doc/assets/images/humanfirst.ai-dark-theme.png#gh-dark-mode-only" alt="Humanfirst.ai" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/element-dark-theme.png#gh-dark-mode-only" alt="Element.io" height="25" width="auto" />
|
||||
</p>
|
||||
|
||||
# FAQ
|
||||
|
||||
### Can I use Tantivy in other languages?
|
||||
|
||||
- Python → [tantivy-py](https://github.com/quickwit-oss/tantivy-py)
|
||||
- Ruby → [tantiny](https://github.com/baygeldin/tantiny)
|
||||
|
||||
You can also find other bindings on [GitHub](https://github.com/search?q=tantivy) but they may be less maintained.
|
||||
|
||||
### What are some examples of Tantivy use?
|
||||
|
||||
- [seshat](https://github.com/matrix-org/seshat/): A matrix message database/indexer
|
||||
- [tantiny](https://github.com/baygeldin/tantiny): Tiny full-text search for Ruby
|
||||
- [lnx](https://github.com/lnx-search/lnx): adaptable, typo tolerant search engine with a REST API
|
||||
- and [more](https://github.com/search?q=tantivy)!
|
||||
|
||||
### On average, how much faster is Tantivy compared to Lucene?
|
||||
|
||||
- According to our [search latency benchmark](https://tantivy-search.github.io/bench/), Tantivy is approximately 2x faster than Lucene.
|
||||
|
||||
### Does tantivy support incremental indexing?
|
||||
|
||||
- Yes.
|
||||
|
||||
### How can I edit documents?
|
||||
|
||||
- Data in tantivy is immutable. To edit a document, the document needs to be deleted and reindexed.
|
||||
|
||||
### When will my documents be searchable during indexing?
|
||||
|
||||
- Documents will be searchable after a `commit` is called on an `IndexWriter`. Existing `IndexReader`s will also need to be reloaded in order to reflect the changes. Finally, changes are only visible to newly acquired `Searcher`.
|
||||
|
||||
100000
benches/hdfs.json
Normal file
121
benches/index-bench.rs
Normal file
@@ -0,0 +1,121 @@
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use pprof::criterion::{Output, PProfProfiler};
|
||||
use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
|
||||
use tantivy::Index;
|
||||
|
||||
const HDFS_LOGS: &str = include_str!("hdfs.json");
|
||||
const NUM_REPEATS: usize = 2;
|
||||
|
||||
pub fn hdfs_index_benchmark(c: &mut Criterion) {
|
||||
let schema = {
|
||||
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
|
||||
schema_builder.add_u64_field("timestamp", INDEXED);
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
schema_builder.add_text_field("severity", STRING);
|
||||
schema_builder.build()
|
||||
};
|
||||
let schema_with_store = {
|
||||
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
|
||||
schema_builder.add_u64_field("timestamp", INDEXED | STORED);
|
||||
schema_builder.add_text_field("body", TEXT | STORED);
|
||||
schema_builder.add_text_field("severity", STRING | STORED);
|
||||
schema_builder.build()
|
||||
};
|
||||
let dynamic_schema = {
|
||||
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
|
||||
schema_builder.add_json_field("json", TEXT);
|
||||
schema_builder.build()
|
||||
};
|
||||
|
||||
let mut group = c.benchmark_group("index-hdfs");
|
||||
group.sample_size(20);
|
||||
group.bench_function("index-hdfs-no-commit", |b| {
|
||||
b.iter(|| {
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
|
||||
for _ in 0..NUM_REPEATS {
|
||||
for doc_json in HDFS_LOGS.trim().split("\n") {
|
||||
let doc = schema.parse_document(doc_json).unwrap();
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
}
|
||||
})
|
||||
});
|
||||
group.bench_function("index-hdfs-with-commit", |b| {
|
||||
b.iter(|| {
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
|
||||
for _ in 0..NUM_REPEATS {
|
||||
for doc_json in HDFS_LOGS.trim().split("\n") {
|
||||
let doc = schema.parse_document(doc_json).unwrap();
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
})
|
||||
});
|
||||
group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
|
||||
b.iter(|| {
|
||||
let index = Index::create_in_ram(schema_with_store.clone());
|
||||
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
|
||||
for _ in 0..NUM_REPEATS {
|
||||
for doc_json in HDFS_LOGS.trim().split("\n") {
|
||||
let doc = schema.parse_document(doc_json).unwrap();
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
}
|
||||
})
|
||||
});
|
||||
group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
|
||||
b.iter(|| {
|
||||
let index = Index::create_in_ram(schema_with_store.clone());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
|
||||
for _ in 0..NUM_REPEATS {
|
||||
for doc_json in HDFS_LOGS.trim().split("\n") {
|
||||
let doc = schema.parse_document(doc_json).unwrap();
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
})
|
||||
});
|
||||
group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
|
||||
b.iter(|| {
|
||||
let index = Index::create_in_ram(dynamic_schema.clone());
|
||||
let json_field = dynamic_schema.get_field("json").unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
|
||||
for _ in 0..NUM_REPEATS {
|
||||
for doc_json in HDFS_LOGS.trim().split("\n") {
|
||||
let json_val: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_str(doc_json).unwrap();
|
||||
let doc = tantivy::doc!(json_field=>json_val);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
})
|
||||
});
|
||||
group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
|
||||
b.iter(|| {
|
||||
let index = Index::create_in_ram(dynamic_schema.clone());
|
||||
let json_field = dynamic_schema.get_field("json").unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
|
||||
for _ in 0..NUM_REPEATS {
|
||||
for doc_json in HDFS_LOGS.trim().split("\n") {
|
||||
let json_val: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_str(doc_json).unwrap();
|
||||
let doc = tantivy::doc!(json_field=>json_val);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group! {
|
||||
name = benches;
|
||||
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
|
||||
targets = hdfs_index_benchmark
|
||||
}
|
||||
criterion_main!(benches);
|
||||
@@ -1,12 +1,12 @@
|
||||
[package]
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
version = "0.2.0"
|
||||
edition = "2021"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = []
|
||||
description = """Tantivy-sub crate: bitpacking"""
|
||||
repository = "https://github.com/tantivy-search/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
keywords = []
|
||||
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ extern crate test;
|
||||
mod tests {
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_blockedbitp_read(b: &mut Bencher) {
|
||||
let mut blocked_bitpacker = BlockedBitpacker::new();
|
||||
@@ -20,6 +21,7 @@ mod tests {
|
||||
out
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_blockedbitp_create(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use std::{convert::TryInto, io};
|
||||
use std::convert::TryInto;
|
||||
use std::io;
|
||||
|
||||
pub struct BitPacker {
|
||||
mini_buffer: u64,
|
||||
@@ -81,14 +82,16 @@ impl BitUnpacker {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn bit_width(&self) -> u8 {
|
||||
self.num_bits as u8
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0u64;
|
||||
}
|
||||
let num_bits = self.num_bits;
|
||||
let mask = self.mask;
|
||||
let addr_in_bits = idx * num_bits;
|
||||
let addr_in_bits = idx * self.num_bits;
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
debug_assert!(
|
||||
@@ -100,7 +103,7 @@ impl BitUnpacker {
|
||||
.unwrap();
|
||||
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
val_shifted & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
use super::bitpacker::BitPacker;
|
||||
use super::compute_num_bits;
|
||||
use crate::{minmax, BitUnpacker};
|
||||
|
||||
use super::{bitpacker::BitPacker, compute_num_bits};
|
||||
|
||||
const BLOCK_SIZE: usize = 128;
|
||||
|
||||
/// `BlockedBitpacker` compresses data in blocks of
|
||||
/// 128 elements, while keeping an index on it
|
||||
///
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct BlockedBitpacker {
|
||||
// bitpacked blocks
|
||||
@@ -59,6 +58,10 @@ fn metadata_test() {
|
||||
assert_eq!(meta.num_bits(), 6);
|
||||
}
|
||||
|
||||
fn mem_usage<T>(items: &Vec<T>) -> usize {
|
||||
items.capacity() * std::mem::size_of::<T>()
|
||||
}
|
||||
|
||||
impl BlockedBitpacker {
|
||||
pub fn new() -> Self {
|
||||
let mut compressed_blocks = vec![];
|
||||
@@ -74,10 +77,8 @@ impl BlockedBitpacker {
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
std::mem::size_of::<BlockedBitpacker>()
|
||||
+ self.compressed_blocks.capacity()
|
||||
+ self.offset_and_bits.capacity()
|
||||
* std::mem::size_of_val(&self.offset_and_bits.get(0).cloned().unwrap_or_default())
|
||||
+ self.buffer.capacity()
|
||||
* std::mem::size_of_val(&self.buffer.get(0).cloned().unwrap_or_default())
|
||||
+ mem_usage(&self.offset_and_bits)
|
||||
+ mem_usage(&self.buffer)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
mod bitpacker;
|
||||
mod blocked_bitpacker;
|
||||
|
||||
pub use crate::bitpacker::BitPacker;
|
||||
pub use crate::bitpacker::BitUnpacker;
|
||||
pub use crate::bitpacker::{BitPacker, BitUnpacker};
|
||||
pub use crate::blocked_bitpacker::BlockedBitpacker;
|
||||
|
||||
/// Computes the number of bits that will be used for bitpacking.
|
||||
@@ -50,3 +49,32 @@ where
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compute_num_bits() {
|
||||
assert_eq!(compute_num_bits(1), 1u8);
|
||||
assert_eq!(compute_num_bits(0), 0u8);
|
||||
assert_eq!(compute_num_bits(2), 2u8);
|
||||
assert_eq!(compute_num_bits(3), 2u8);
|
||||
assert_eq!(compute_num_bits(4), 3u8);
|
||||
assert_eq!(compute_num_bits(255), 8u8);
|
||||
assert_eq!(compute_num_bits(256), 9u8);
|
||||
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minmax_empty() {
|
||||
let vals: Vec<u32> = vec![];
|
||||
assert_eq!(minmax(vals.into_iter()), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minmax_one() {
|
||||
assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minmax_two() {
|
||||
assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
|
||||
assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
|
||||
}
|
||||
|
||||
@@ -1,12 +1,17 @@
|
||||
[package]
|
||||
name = "common"
|
||||
version = "0.1.0"
|
||||
name = "tantivy-common"
|
||||
version = "0.3.0"
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2018"
|
||||
edition = "2021"
|
||||
description = "common traits and utility functions used by multiple tantivy subcrates"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.4.3"
|
||||
ownedbytes = { version="0.3", path="../ownedbytes" }
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1.0.0"
|
||||
rand = "0.8.4"
|
||||
|
||||
745
common/src/bitset.rs
Normal file
@@ -0,0 +1,745 @@
|
||||
use std::convert::TryInto;
|
||||
use std::io::Write;
|
||||
use std::{fmt, io, u64};
|
||||
|
||||
use ownedbytes::OwnedBytes;
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
pub struct TinySet(u64);
|
||||
|
||||
impl fmt::Debug for TinySet {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
self.into_iter().collect::<Vec<u32>>().fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TinySetIterator(TinySet);
|
||||
impl Iterator for TinySetIterator {
|
||||
type Item = u32;
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.0.pop_lowest()
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoIterator for TinySet {
|
||||
type Item = u32;
|
||||
type IntoIter = TinySetIterator;
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
TinySetIterator(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl TinySet {
|
||||
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
|
||||
writer.write_all(self.0.to_le_bytes().as_ref())
|
||||
}
|
||||
|
||||
pub fn into_bytes(self) -> [u8; 8] {
|
||||
self.0.to_le_bytes()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn deserialize(data: [u8; 8]) -> Self {
|
||||
let val: u64 = u64::from_le_bytes(data);
|
||||
TinySet(val)
|
||||
}
|
||||
|
||||
/// Returns an empty `TinySet`.
|
||||
#[inline]
|
||||
pub fn empty() -> TinySet {
|
||||
TinySet(0u64)
|
||||
}
|
||||
|
||||
/// Returns a full `TinySet`.
|
||||
#[inline]
|
||||
pub fn full() -> TinySet {
|
||||
TinySet::empty().complement()
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
/// Returns the complement of the set in `[0, 64[`.
|
||||
///
|
||||
/// Careful on making this function public, as it will break the padding handling in the last
|
||||
/// bucket.
|
||||
#[inline]
|
||||
fn complement(self) -> TinySet {
|
||||
TinySet(!self.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` contains the element `el`.
|
||||
#[inline]
|
||||
pub fn contains(self, el: u32) -> bool {
|
||||
!self.intersect(TinySet::singleton(el)).is_empty()
|
||||
}
|
||||
|
||||
/// Returns the number of elements in the TinySet.
|
||||
#[inline]
|
||||
pub fn len(self) -> u32 {
|
||||
self.0.count_ones()
|
||||
}
|
||||
|
||||
/// Returns the intersection of `self` and `other`
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn intersect(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 & other.0)
|
||||
}
|
||||
|
||||
/// Creates a new `TinySet` containing only one element
|
||||
/// within `[0; 64[`
|
||||
#[inline]
|
||||
pub fn singleton(el: u32) -> TinySet {
|
||||
TinySet(1u64 << u64::from(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64)
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn insert(self, el: u32) -> TinySet {
|
||||
self.union(TinySet::singleton(el))
|
||||
}
|
||||
|
||||
/// Removes an element within [0..64)
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn remove(self, el: u32) -> TinySet {
|
||||
self.intersect(TinySet::singleton(el).complement())
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64)
|
||||
///
|
||||
/// returns true if the set changed
|
||||
#[inline]
|
||||
pub fn insert_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
*self = old.insert(el);
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Remove a element within [0..64)
|
||||
///
|
||||
/// returns true if the set changed
|
||||
#[inline]
|
||||
pub fn remove_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
*self = old.remove(el);
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Returns the union of two tinysets
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn union(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 | other.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` is empty.
|
||||
#[inline]
|
||||
pub fn is_empty(self) -> bool {
|
||||
self.0 == 0u64
|
||||
}
|
||||
|
||||
/// Returns the lowest element in the `TinySet`
|
||||
/// and removes it.
|
||||
#[inline]
|
||||
pub fn pop_lowest(&mut self) -> Option<u32> {
|
||||
if self.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let lowest = self.0.trailing_zeros() as u32;
|
||||
self.0 ^= TinySet::singleton(lowest).0;
|
||||
Some(lowest)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a `TinySet` than contains all values up
|
||||
/// to limit excluded.
|
||||
///
|
||||
/// The limit is assumed to be strictly lower than 64.
|
||||
pub fn range_lower(upper_bound: u32) -> TinySet {
|
||||
TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
|
||||
}
|
||||
|
||||
/// Returns a `TinySet` that contains all values greater
|
||||
/// or equal to the given limit, included. (and up to 63)
|
||||
///
|
||||
/// The limit is assumed to be strictly lower than 64.
|
||||
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
|
||||
TinySet::range_lower(from_included).complement()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BitSet {
|
||||
tinysets: Box<[TinySet]>,
|
||||
len: u64,
|
||||
max_value: u32,
|
||||
}
|
||||
|
||||
fn num_buckets(max_val: u32) -> u32 {
|
||||
(max_val + 63u32) / 64u32
|
||||
}
|
||||
|
||||
impl BitSet {
|
||||
/// serialize a `BitSet`.
|
||||
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
|
||||
writer.write_all(self.max_value.to_le_bytes().as_ref())?;
|
||||
for tinyset in self.tinysets.iter().cloned() {
|
||||
writer.write_all(&tinyset.into_bytes())?;
|
||||
}
|
||||
writer.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a new `BitSet` that may contain elements
|
||||
/// within `[0, max_val)`.
|
||||
pub fn with_max_value(max_value: u32) -> BitSet {
|
||||
let num_buckets = num_buckets(max_value);
|
||||
let tinybitsets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
|
||||
BitSet {
|
||||
tinysets: tinybitsets,
|
||||
len: 0,
|
||||
max_value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `BitSet` that may contain elements. Initially all values will be set.
|
||||
/// within `[0, max_val)`.
|
||||
pub fn with_max_value_and_full(max_value: u32) -> BitSet {
|
||||
let num_buckets = num_buckets(max_value);
|
||||
let mut tinybitsets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
|
||||
|
||||
// Fix padding
|
||||
let lower = max_value % 64u32;
|
||||
if lower != 0 {
|
||||
tinybitsets[tinybitsets.len() - 1] = TinySet::range_lower(lower);
|
||||
}
|
||||
BitSet {
|
||||
tinysets: tinybitsets,
|
||||
len: max_value as u64,
|
||||
max_value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes all elements from the `BitSet`.
|
||||
pub fn clear(&mut self) {
|
||||
for tinyset in self.tinysets.iter_mut() {
|
||||
*tinyset = TinySet::empty();
|
||||
}
|
||||
}
|
||||
|
||||
/// Intersect with serialized bitset
|
||||
pub fn intersect_update(&mut self, other: &ReadOnlyBitSet) {
|
||||
self.intersect_update_with_iter(other.iter_tinysets());
|
||||
}
|
||||
|
||||
/// Intersect with tinysets
|
||||
fn intersect_update_with_iter(&mut self, other: impl Iterator<Item = TinySet>) {
|
||||
self.len = 0;
|
||||
for (left, right) in self.tinysets.iter_mut().zip(other) {
|
||||
*left = left.intersect(right);
|
||||
self.len += left.len() as u64;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of elements in the `BitSet`.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
self.len as usize
|
||||
}
|
||||
|
||||
/// Inserts an element in the `BitSet`
|
||||
#[inline]
|
||||
pub fn insert(&mut self, el: u32) {
|
||||
// we do not check saturated els.
|
||||
let higher = el / 64u32;
|
||||
let lower = el % 64u32;
|
||||
self.len += if self.tinysets[higher as usize].insert_mut(lower) {
|
||||
1
|
||||
} else {
|
||||
0
|
||||
};
|
||||
}
|
||||
|
||||
/// Inserts an element in the `BitSet`
|
||||
#[inline]
|
||||
pub fn remove(&mut self, el: u32) {
|
||||
// we do not check saturated els.
|
||||
let higher = el / 64u32;
|
||||
let lower = el % 64u32;
|
||||
self.len -= if self.tinysets[higher as usize].remove_mut(lower) {
|
||||
1
|
||||
} else {
|
||||
0
|
||||
};
|
||||
}
|
||||
|
||||
/// Returns true iff the elements is in the `BitSet`.
|
||||
#[inline]
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
self.tinyset(el / 64u32).contains(el % 64)
|
||||
}
|
||||
|
||||
/// Returns the first non-empty `TinySet` associated to a bucket lower
|
||||
/// or greater than bucket.
|
||||
///
|
||||
/// Reminder: the tiny set with the bucket `bucket`, represents the
|
||||
/// elements from `bucket * 64` to `(bucket+1) * 64`.
|
||||
pub fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
|
||||
self.tinysets[bucket as usize..]
|
||||
.iter()
|
||||
.cloned()
|
||||
.position(|tinyset| !tinyset.is_empty())
|
||||
.map(|delta_bucket| bucket + delta_bucket as u32)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn max_value(&self) -> u32 {
|
||||
self.max_value
|
||||
}
|
||||
|
||||
/// Returns the tiny bitset representing the
|
||||
/// the set restricted to the number range from
|
||||
/// `bucket * 64` to `(bucket + 1) * 64`.
|
||||
pub fn tinyset(&self, bucket: u32) -> TinySet {
|
||||
self.tinysets[bucket as usize]
|
||||
}
|
||||
}
|
||||
|
||||
/// Serialized BitSet.
|
||||
#[derive(Clone)]
|
||||
pub struct ReadOnlyBitSet {
|
||||
data: OwnedBytes,
|
||||
max_value: u32,
|
||||
}
|
||||
|
||||
pub fn intersect_bitsets(left: &ReadOnlyBitSet, other: &ReadOnlyBitSet) -> ReadOnlyBitSet {
|
||||
assert_eq!(left.max_value(), other.max_value());
|
||||
assert_eq!(left.data.len(), other.data.len());
|
||||
let union_tinyset_it = left
|
||||
.iter_tinysets()
|
||||
.zip(other.iter_tinysets())
|
||||
.map(|(left_tinyset, right_tinyset)| left_tinyset.intersect(right_tinyset));
|
||||
let mut output_dataset: Vec<u8> = Vec::with_capacity(left.data.len());
|
||||
for tinyset in union_tinyset_it {
|
||||
output_dataset.extend_from_slice(&tinyset.into_bytes());
|
||||
}
|
||||
ReadOnlyBitSet {
|
||||
data: OwnedBytes::new(output_dataset),
|
||||
max_value: left.max_value(),
|
||||
}
|
||||
}
|
||||
|
||||
impl ReadOnlyBitSet {
|
||||
pub fn open(data: OwnedBytes) -> Self {
|
||||
let (max_value_data, data) = data.split(4);
|
||||
assert_eq!(data.len() % 8, 0);
|
||||
let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap());
|
||||
ReadOnlyBitSet { data, max_value }
|
||||
}
|
||||
|
||||
/// Number of elements in the bitset.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
self.iter_tinysets()
|
||||
.map(|tinyset| tinyset.len() as usize)
|
||||
.sum()
|
||||
}
|
||||
|
||||
/// Iterate the tinyset on the fly from serialized data.
|
||||
#[inline]
|
||||
fn iter_tinysets(&self) -> impl Iterator<Item = TinySet> + '_ {
|
||||
self.data.chunks_exact(8).map(move |chunk| {
|
||||
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap());
|
||||
tinyset
|
||||
})
|
||||
}
|
||||
|
||||
/// Iterate over the positions of the elements.
|
||||
#[inline]
|
||||
pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
|
||||
self.iter_tinysets()
|
||||
.enumerate()
|
||||
.flat_map(move |(chunk_num, tinyset)| {
|
||||
let chunk_base_val = chunk_num as u32 * 64;
|
||||
tinyset
|
||||
.into_iter()
|
||||
.map(move |val| val + chunk_base_val)
|
||||
.take_while(move |doc| *doc < self.max_value)
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns true iff the elements is in the `BitSet`.
|
||||
#[inline]
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
let byte_offset = el / 8u32;
|
||||
let b: u8 = self.data[byte_offset as usize];
|
||||
let shift = (el % 8) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
|
||||
/// Maximum value the bitset may contain.
|
||||
/// (Note this is not the maximum value contained in the set.)
|
||||
///
|
||||
/// A bitset has an intrinsic capacity.
|
||||
/// It only stores elements within [0..max_value).
|
||||
#[inline]
|
||||
pub fn max_value(&self) -> u32 {
|
||||
self.max_value
|
||||
}
|
||||
|
||||
/// Number of bytes used in the bitset representation.
|
||||
pub fn num_bytes(&self) -> usize {
|
||||
self.data.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a BitSet> for ReadOnlyBitSet {
|
||||
fn from(bitset: &'a BitSet) -> ReadOnlyBitSet {
|
||||
let mut buffer = Vec::with_capacity(bitset.tinysets.len() * 8 + 4);
|
||||
bitset
|
||||
.serialize(&mut buffer)
|
||||
.expect("serializing into a buffer should never fail");
|
||||
ReadOnlyBitSet::open(OwnedBytes::new(buffer))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
use ownedbytes::OwnedBytes;
|
||||
use rand::distributions::Bernoulli;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
|
||||
use super::{BitSet, ReadOnlyBitSet, TinySet};
|
||||
|
||||
#[test]
|
||||
fn test_read_serialized_bitset_full_multi() {
|
||||
for i in 0..1000 {
|
||||
let bitset = BitSet::with_max_value_and_full(i);
|
||||
let mut out = vec![];
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.len() as usize, i as usize);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read_serialized_bitset_full_block() {
|
||||
let bitset = BitSet::with_max_value_and_full(64);
|
||||
let mut out = vec![];
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.len() as usize, 64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read_serialized_bitset_full() {
|
||||
let mut bitset = BitSet::with_max_value_and_full(5);
|
||||
bitset.remove(3);
|
||||
let mut out = vec![];
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.len(), 4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_intersect() {
|
||||
let bitset_serialized = {
|
||||
let mut bitset = BitSet::with_max_value_and_full(5);
|
||||
bitset.remove(1);
|
||||
bitset.remove(3);
|
||||
let mut out = vec![];
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
ReadOnlyBitSet::open(OwnedBytes::new(out))
|
||||
};
|
||||
|
||||
let mut bitset = BitSet::with_max_value_and_full(5);
|
||||
bitset.remove(1);
|
||||
bitset.intersect_update(&bitset_serialized);
|
||||
|
||||
assert!(bitset.contains(0));
|
||||
assert!(!bitset.contains(1));
|
||||
assert!(bitset.contains(2));
|
||||
assert!(!bitset.contains(3));
|
||||
assert!(bitset.contains(4));
|
||||
|
||||
bitset.intersect_update_with_iter(vec![TinySet::singleton(0)].into_iter());
|
||||
|
||||
assert!(bitset.contains(0));
|
||||
assert!(!bitset.contains(1));
|
||||
assert!(!bitset.contains(2));
|
||||
assert!(!bitset.contains(3));
|
||||
assert!(!bitset.contains(4));
|
||||
assert_eq!(bitset.len(), 1);
|
||||
|
||||
bitset.intersect_update_with_iter(vec![TinySet::singleton(1)].into_iter());
|
||||
assert!(!bitset.contains(0));
|
||||
assert!(!bitset.contains(1));
|
||||
assert!(!bitset.contains(2));
|
||||
assert!(!bitset.contains(3));
|
||||
assert!(!bitset.contains(4));
|
||||
assert_eq!(bitset.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read_serialized_bitset_empty() {
|
||||
let mut bitset = BitSet::with_max_value(5);
|
||||
bitset.insert(3);
|
||||
let mut out = vec![];
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
|
||||
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.len(), 1);
|
||||
|
||||
{
|
||||
let bitset = BitSet::with_max_value(5);
|
||||
let mut out = vec![];
|
||||
bitset.serialize(&mut out).unwrap();
|
||||
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
|
||||
assert_eq!(bitset.len(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tiny_set_remove() {
|
||||
{
|
||||
let mut u = TinySet::empty().insert(63u32).insert(5).remove(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(5u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty()
|
||||
.insert(63u32)
|
||||
.insert(1)
|
||||
.insert(5)
|
||||
.remove(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert_eq!(u.pop_lowest(), Some(5u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1).remove(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1).remove(1u32);
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_tiny_set() {
|
||||
assert!(TinySet::empty().is_empty());
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none())
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1u32).insert(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none())
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(2u32);
|
||||
assert_eq!(u.pop_lowest(), Some(2u32));
|
||||
u.insert_mut(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(63u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(63u32).insert(5);
|
||||
assert_eq!(u.pop_lowest(), Some(5u32));
|
||||
assert_eq!(u.pop_lowest(), Some(63u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let original = TinySet::empty().insert(63u32).insert(5);
|
||||
let after_serialize_deserialize = TinySet::deserialize(original.into_bytes());
|
||||
assert_eq!(original, after_serialize_deserialize);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset() {
|
||||
let test_against_hashset = |els: &[u32], max_value: u32| {
|
||||
let mut hashset: HashSet<u32> = HashSet::new();
|
||||
let mut bitset = BitSet::with_max_value(max_value);
|
||||
for &el in els {
|
||||
assert!(el < max_value);
|
||||
hashset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for el in 0..max_value {
|
||||
assert_eq!(hashset.contains(&el), bitset.contains(el));
|
||||
}
|
||||
assert_eq!(bitset.max_value(), max_value);
|
||||
|
||||
// test deser
|
||||
let mut data = vec![];
|
||||
bitset.serialize(&mut data).unwrap();
|
||||
let ro_bitset = ReadOnlyBitSet::open(OwnedBytes::new(data));
|
||||
for el in 0..max_value {
|
||||
assert_eq!(hashset.contains(&el), ro_bitset.contains(el));
|
||||
}
|
||||
assert_eq!(ro_bitset.max_value(), max_value);
|
||||
assert_eq!(ro_bitset.len(), els.len());
|
||||
};
|
||||
|
||||
test_against_hashset(&[], 0);
|
||||
test_against_hashset(&[], 1);
|
||||
test_against_hashset(&[0u32], 1);
|
||||
test_against_hashset(&[0u32], 100);
|
||||
test_against_hashset(&[1u32, 2u32], 4);
|
||||
test_against_hashset(&[99u32], 100);
|
||||
test_against_hashset(&[63u32], 64);
|
||||
test_against_hashset(&[62u32, 63u32], 64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_num_buckets() {
|
||||
use super::num_buckets;
|
||||
assert_eq!(num_buckets(0u32), 0);
|
||||
assert_eq!(num_buckets(1u32), 1);
|
||||
assert_eq!(num_buckets(64u32), 1);
|
||||
assert_eq!(num_buckets(65u32), 2);
|
||||
assert_eq!(num_buckets(128u32), 2);
|
||||
assert_eq!(num_buckets(129u32), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tinyset_range() {
|
||||
assert_eq!(
|
||||
TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
|
||||
[0, 1, 2]
|
||||
);
|
||||
assert!(TinySet::range_lower(0).is_empty());
|
||||
assert_eq!(
|
||||
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
|
||||
(0u32..63u32).collect::<Vec<_>>()
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
|
||||
[0]
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
|
||||
[0, 1]
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_greater_or_equal(3)
|
||||
.into_iter()
|
||||
.collect::<Vec<u32>>(),
|
||||
(3u32..64u32).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_len() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(104u32);
|
||||
assert_eq!(bitset.len(), 3);
|
||||
bitset.remove(105u32);
|
||||
assert_eq!(bitset.len(), 3);
|
||||
bitset.remove(104u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.remove(3u32);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
bitset.remove(103u32);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
}
|
||||
|
||||
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
|
||||
StdRng::from_seed([seed_val; 32])
|
||||
.sample_iter(&Bernoulli::new(ratio).unwrap())
|
||||
.take(n as usize)
|
||||
.enumerate()
|
||||
.filter_map(|(val, keep)| if keep { Some(val as u32) } else { None })
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
|
||||
sample_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_clear() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
let els = sample(1_000, 0.01f64);
|
||||
for &el in &els {
|
||||
bitset.insert(el);
|
||||
}
|
||||
assert!(els.iter().all(|el| bitset.contains(*el)));
|
||||
bitset.clear();
|
||||
for el in 0u32..1000u32 {
|
||||
assert!(!bitset.contains(el));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use test;
|
||||
|
||||
use super::{BitSet, TinySet};
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
b.iter(|| {
|
||||
let mut tinyset = TinySet::singleton(test::black_box(31u32));
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_sum(b: &mut test::Bencher) {
|
||||
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
||||
b.iter(|| {
|
||||
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyarr_sum(b: &mut test::Bencher) {
|
||||
let v = [10u32, 14u32, 21u32];
|
||||
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,168 @@
|
||||
#![allow(clippy::len_without_is_empty)]
|
||||
|
||||
use std::ops::Deref;
|
||||
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
mod bitset;
|
||||
mod serialize;
|
||||
mod vint;
|
||||
mod writer;
|
||||
|
||||
pub use bitset::*;
|
||||
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
|
||||
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
|
||||
|
||||
/// Has length trait
|
||||
pub trait HasLen {
|
||||
/// Return length
|
||||
fn len(&self) -> usize;
|
||||
|
||||
/// Returns true iff empty.
|
||||
fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Deref<Target = [u8]>> HasLen for T {
|
||||
fn len(&self) -> usize {
|
||||
self.deref().len()
|
||||
}
|
||||
}
|
||||
|
||||
const HIGHEST_BIT: u64 = 1 << 63;
|
||||
|
||||
/// Maps a `i64` to `u64`
|
||||
///
|
||||
/// For simplicity, tantivy internally handles `i64` as `u64`.
|
||||
/// The mapping is defined by this function.
|
||||
///
|
||||
/// Maps `i64` to `u64` so that
|
||||
/// `-2^63 .. 2^63-1` is mapped
|
||||
/// to
|
||||
/// `0 .. 2^64-1`
|
||||
/// in that order.
|
||||
///
|
||||
/// This is more suited than simply casting (`val as u64`)
|
||||
/// because of bitpacking.
|
||||
///
|
||||
/// Imagine a list of `i64` ranging from -10 to 10.
|
||||
/// When casting negative values, the negative values are projected
|
||||
/// to values over 2^63, and all values end up requiring 64 bits.
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
|
||||
#[inline]
|
||||
pub fn i64_to_u64(val: i64) -> u64 {
|
||||
(val as u64) ^ HIGHEST_BIT
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
||||
#[inline]
|
||||
pub fn u64_to_i64(val: u64) -> i64 {
|
||||
(val ^ HIGHEST_BIT) as i64
|
||||
}
|
||||
|
||||
/// Maps a `f64` to `u64`
|
||||
///
|
||||
/// For simplicity, tantivy internally handles `f64` as `u64`.
|
||||
/// The mapping is defined by this function.
|
||||
///
|
||||
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
|
||||
///
|
||||
/// This is more suited than simply casting (`val as u64`)
|
||||
/// which would truncate the result
|
||||
///
|
||||
/// # Reference
|
||||
///
|
||||
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
|
||||
/// explains the mapping in a clear manner.
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
|
||||
#[inline]
|
||||
pub fn f64_to_u64(val: f64) -> u64 {
|
||||
let bits = val.to_bits();
|
||||
if val.is_sign_positive() {
|
||||
bits ^ HIGHEST_BIT
|
||||
} else {
|
||||
!bits
|
||||
}
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
||||
#[inline]
|
||||
pub fn u64_to_f64(val: u64) -> f64 {
|
||||
f64::from_bits(if val & HIGHEST_BIT != 0 {
|
||||
val ^ HIGHEST_BIT
|
||||
} else {
|
||||
!val
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize};
|
||||
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
}
|
||||
|
||||
fn test_f64_converter_helper(val: f64) {
|
||||
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
|
||||
}
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
O::default().serialize(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
|
||||
let left_u64 = f64_to_u64(left);
|
||||
let right_u64 = f64_to_u64(right);
|
||||
assert_eq!(left_u64 < right_u64, left < right);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_i64_converter() {
|
||||
assert_eq!(i64_to_u64(i64::MIN), u64::MIN);
|
||||
assert_eq!(i64_to_u64(i64::MAX), u64::MAX);
|
||||
test_i64_converter_helper(0i64);
|
||||
test_i64_converter_helper(i64::MIN);
|
||||
test_i64_converter_helper(i64::MAX);
|
||||
for i in -1000i64..1000i64 {
|
||||
test_i64_converter_helper(i);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_f64_converter() {
|
||||
test_f64_converter_helper(f64::INFINITY);
|
||||
test_f64_converter_helper(f64::NEG_INFINITY);
|
||||
test_f64_converter_helper(0.0);
|
||||
test_f64_converter_helper(-0.0);
|
||||
test_f64_converter_helper(1.0);
|
||||
test_f64_converter_helper(-1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_f64_order() {
|
||||
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
|
||||
.contains(&f64_to_u64(f64::NAN))); // nan is not a number
|
||||
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); // same exponent, different mantissa
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); // same mantissa, different exponent
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); // different exponent and mantissa
|
||||
assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
|
||||
assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
|
||||
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
|
||||
assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
use crate::Endianness;
|
||||
use crate::VInt;
|
||||
use std::io::{Read, Write};
|
||||
use std::{fmt, io};
|
||||
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
|
||||
use crate::{Endianness, VInt};
|
||||
|
||||
/// Trait for a simple binary serialization.
|
||||
pub trait BinarySerializable: fmt::Debug + Sized {
|
||||
@@ -20,7 +19,7 @@ pub trait DeserializeFrom<T: BinarySerializable> {
|
||||
|
||||
/// Implement deserialize from &[u8] for all types which implement BinarySerializable.
|
||||
///
|
||||
/// TryFrom would actually be preferrable, but not possible because of the orphan
|
||||
/// TryFrom would actually be preferable, but not possible because of the orphan
|
||||
/// rules (not completely sure if this could be resolved)
|
||||
impl<T: BinarySerializable> DeserializeFrom<T> for &[u8] {
|
||||
fn deserialize(&mut self) -> io::Result<T> {
|
||||
@@ -202,8 +201,7 @@ impl BinarySerializable for String {
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
|
||||
use super::VInt;
|
||||
use super::*;
|
||||
use super::{VInt, *};
|
||||
use crate::serialize::BinarySerializable;
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
@@ -231,7 +229,7 @@ pub mod test {
|
||||
fixed_size_test::<u32>();
|
||||
assert_eq!(4, serialize_test(3u32));
|
||||
assert_eq!(4, serialize_test(5u32));
|
||||
assert_eq!(4, serialize_test(u32::max_value()));
|
||||
assert_eq!(4, serialize_test(u32::MAX));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -249,6 +247,11 @@ pub mod test {
|
||||
fixed_size_test::<u64>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_bool() {
|
||||
fixed_size_test::<bool>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_string() {
|
||||
assert_eq!(serialize_test(String::from("")), 1);
|
||||
@@ -274,6 +277,6 @@ pub mod test {
|
||||
assert_eq!(serialize_test(VInt(1234u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(16_383u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(16_384u64)), 3);
|
||||
assert_eq!(serialize_test(VInt(u64::max_value())), 10);
|
||||
assert_eq!(serialize_test(VInt(u64::MAX)), 10);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use super::BinarySerializable;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
use std::io::Write;
|
||||
use std::io::{Read, Write};
|
||||
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
|
||||
use super::BinarySerializable;
|
||||
|
||||
/// Wrapper over a `u64` that serializes as a variable int.
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
@@ -174,9 +175,7 @@ impl BinarySerializable for VInt {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::serialize_vint_u32;
|
||||
use super::BinarySerializable;
|
||||
use super::VInt;
|
||||
use super::{serialize_vint_u32, BinarySerializable, VInt};
|
||||
|
||||
fn aux_test_vint(val: u64) {
|
||||
let mut v = [14u8; 10];
|
||||
@@ -200,7 +199,7 @@ mod tests {
|
||||
aux_test_vint(0);
|
||||
aux_test_vint(1);
|
||||
aux_test_vint(5);
|
||||
aux_test_vint(u64::max_value());
|
||||
aux_test_vint(u64::MAX);
|
||||
for i in 1..9 {
|
||||
let power_of_128 = 1u64 << (7 * i);
|
||||
aux_test_vint(power_of_128 - 1u64);
|
||||
@@ -229,6 +228,6 @@ mod tests {
|
||||
aux_test_serialize_vint_u32(power_of_128);
|
||||
aux_test_serialize_vint_u32(power_of_128 + 1u32);
|
||||
}
|
||||
aux_test_serialize_vint_u32(u32::max_value());
|
||||
aux_test_serialize_vint_u32(u32::MAX);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,19 +54,18 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
|
||||
/// Struct used to prevent from calling
|
||||
/// [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly
|
||||
///
|
||||
/// The point is that while the type is public, it cannot be built by anyone
|
||||
/// outside of this module.
|
||||
pub struct AntiCallToken(());
|
||||
|
||||
/// Trait used to indicate when no more write need to be done on a writer
|
||||
pub trait TerminatingWrite: Write {
|
||||
pub trait TerminatingWrite: Write + Send + Sync {
|
||||
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
|
||||
fn terminate(mut self) -> io::Result<()>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
where Self: Sized {
|
||||
self.terminate_ref(AntiCallToken(()))
|
||||
}
|
||||
|
||||
@@ -97,9 +96,10 @@ impl<'a> TerminatingWrite for &'a mut Vec<u8> {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::CountingWriter;
|
||||
use std::io::Write;
|
||||
|
||||
use super::CountingWriter;
|
||||
|
||||
#[test]
|
||||
fn test_counting_writer() {
|
||||
let buffer: Vec<u8> = vec![];
|
||||
|
||||
BIN
doc/assets/images/Nuclia.png
Normal file
|
After Width: | Height: | Size: 3.1 KiB |
BIN
doc/assets/images/element-dark-theme.png
Normal file
|
After Width: | Height: | Size: 56 KiB |
8
doc/assets/images/element.io.svg
Normal file
@@ -0,0 +1,8 @@
|
||||
<svg width="518" height="112" viewBox="0 0 518 112" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<path fill-rule="evenodd" clip-rule="evenodd" d="M56 112C86.9279 112 112 86.9279 112 56C112 25.0721 86.9279 0 56 0C25.0721 0 0 25.0721 0 56C0 86.9279 25.0721 112 56 112Z" fill="#0DBD8B"/>
|
||||
<path fill-rule="evenodd" clip-rule="evenodd" d="M45.7615 26.093C45.7615 23.8325 47.5977 22.0001 49.8629 22.0001C65.2154 22.0001 77.6611 34.4199 77.6611 49.7406C77.6611 52.001 75.8248 53.8335 73.5597 53.8335C71.2945 53.8335 69.4583 52.001 69.4583 49.7406C69.4583 38.9408 60.6851 30.1859 49.8629 30.1859C47.5977 30.1859 45.7615 28.3534 45.7615 26.093Z" fill="white"/>
|
||||
<path fill-rule="evenodd" clip-rule="evenodd" d="M85.8986 45.6477C88.1637 45.6477 89.9999 47.4801 89.9999 49.7406C89.9999 65.0612 77.5543 77.4811 62.2017 77.4811C59.9366 77.4811 58.1003 75.6486 58.1003 73.3882C58.1003 71.1277 59.9366 69.2953 62.2017 69.2953C73.024 69.2953 81.7972 60.5403 81.7972 49.7406C81.7972 47.4801 83.6334 45.6477 85.8986 45.6477Z" fill="white"/>
|
||||
<path fill-rule="evenodd" clip-rule="evenodd" d="M66.3031 85.907C66.3031 88.1675 64.4668 89.9999 62.2017 89.9999C46.8492 89.9999 34.4035 77.58 34.4035 62.2594C34.4035 59.9989 36.2398 58.1665 38.5049 58.1665C40.77 58.1665 42.6063 59.9989 42.6063 62.2594C42.6063 73.0592 51.3795 81.8141 62.2017 81.8141C64.4668 81.8141 66.3031 83.6466 66.3031 85.907Z" fill="white"/>
|
||||
<path fill-rule="evenodd" clip-rule="evenodd" d="M26.1014 66.3523C23.8363 66.3523 22.0001 64.5199 22.0001 62.2594C22 46.9388 34.4457 34.5189 49.7983 34.5189C52.0634 34.5189 53.8997 36.3514 53.8997 38.6118C53.8997 40.8723 52.0634 42.7047 49.7983 42.7047C38.976 42.7047 30.2028 51.4597 30.2028 62.2594C30.2028 64.5199 28.3666 66.3523 26.1014 66.3523Z" fill="white"/>
|
||||
<path d="M197 63.5H157.5C157.967 67.6333 159.467 70.9333 162 73.4C164.533 75.8 167.867 77 172 77C174.733 77 177.2 76.3333 179.4 75C181.6 73.6667 183.167 71.8667 184.1 69.6H196.1C194.5 74.8667 191.5 79.1333 187.1 82.4C182.767 85.6 177.633 87.2 171.7 87.2C163.967 87.2 157.7 84.6333 152.9 79.5C148.167 74.3667 145.8 67.8667 145.8 60C145.8 52.3333 148.2 45.9 153 40.7C157.8 35.5 164 32.9 171.6 32.9C179.2 32.9 185.333 35.4667 190 40.6C194.733 45.6667 197.1 52.0667 197.1 59.8L197 63.5ZM171.6 42.6C167.867 42.6 164.767 43.7 162.3 45.9C159.833 48.1 158.3 51.0333 157.7 54.7H185.3C184.767 51.0333 183.3 48.1 180.9 45.9C178.5 43.7 175.4 42.6 171.6 42.6ZM205.289 70.5V11H217.189V70.7C217.189 73.3667 218.656 74.7 221.589 74.7L223.689 74.6V85.9C222.556 86.1 221.356 86.2 220.089 86.2C214.956 86.2 211.189 84.9 208.789 82.3C206.456 79.7 205.289 75.7667 205.289 70.5ZM279.109 63.5H239.609C240.076 67.6333 241.576 70.9333 244.109 73.4C246.643 75.8 249.976 77 254.109 77C256.843 77 259.309 76.3333 261.509 75C263.709 73.6667 265.276 71.8667 266.209 69.6H278.209C276.609 74.8667 273.609 79.1333 269.209 82.4C264.876 85.6 259.743 87.2 253.809 87.2C246.076 87.2 239.809 84.6333 235.009 79.5C230.276 74.3667 227.909 67.8667 227.909 60C227.909 52.3333 230.309 45.9 235.109 40.7C239.909 35.5 246.109 32.9 253.709 32.9C261.309 32.9 267.443 35.4667 272.109 40.6C276.843 45.6667 279.209 52.0667 279.209 59.8L279.109 63.5ZM253.709 42.6C249.976 42.6 246.876 43.7 244.409 45.9C241.943 48.1 240.409 51.0333 239.809 54.7H267.409C266.876 51.0333 265.409 48.1 263.009 45.9C260.609 43.7 257.509 42.6 253.709 42.6ZM332.798 56.2V86H320.898V54.9C320.898 47.0333 317.632 43.1 311.098 43.1C307.565 43.1 304.732 44.2333 302.598 46.5C300.532 48.7667 299.498 51.8667 299.498 55.8V86H287.598V34.1H298.598V41C299.865 38.6667 301.798 36.7333 304.398 35.2C306.998 33.6667 310.232 32.9 314.098 32.9C321.298 32.9 326.498 35.6333 329.698 41.1C334.098 35.6333 339.965 32.9 347.298 32.9C353.365 32.9 358.032 34.8 361.298 38.6C364.565 42.3333 366.198 47.2667 366.198 53.4V86H354.298V54.9C354.298 47.0333 351.032 43.1 344.498 43.1C340.898 43.1 338.032 44.2667 335.898 46.6C333.832 48.8667 332.798 52.0667 332.798 56.2ZM425.379 63.5H385.879C386.346 67.6333 387.846 70.9333 390.379 73.4C392.912 75.8 396.246 77 400.379 77C403.112 77 405.579 76.3333 407.779 75C409.979 73.6667 411.546 71.8667 412.479 69.6H424.479C422.879 74.8667 419.879 79.1333 415.479 82.4C411.146 85.6 406.012 87.2 400.079 87.2C392.346 87.2 386.079 84.6333 381.279 79.5C376.546 74.3667 374.179 67.8667 374.179 60C374.179 52.3333 376.579 45.9 381.379 40.7C386.179 35.5 392.379 32.9 399.979 32.9C407.579 32.9 413.712 35.4667 418.379 40.6C423.112 45.6667 425.479 52.0667 425.479 59.8L425.379 63.5ZM399.979 42.6C396.246 42.6 393.146 43.7 390.679 45.9C388.212 48.1 386.679 51.0333 386.079 54.7H413.679C413.146 51.0333 411.679 48.1 409.279 45.9C406.879 43.7 403.779 42.6 399.979 42.6ZM444.868 34.1V41C446.068 38.7333 448.035 36.8333 450.768 35.3C453.568 33.7 456.935 32.9 460.868 32.9C467.001 32.9 471.735 34.7667 475.068 38.5C478.468 42.2333 480.168 47.2 480.168 53.4V86H468.268V54.9C468.268 51.2333 467.401 48.3667 465.668 46.3C464.001 44.1667 461.435 43.1 457.968 43.1C454.168 43.1 451.168 44.2333 448.968 46.5C446.835 48.7667 445.768 51.9 445.768 55.9V86H433.868V34.1H444.868ZM514.922 75.4V85.7C513.455 86.1 511.389 86.3 508.722 86.3C498.589 86.3 493.522 81.2 493.522 71V43.6H485.622V34.1H493.522V20.6H505.422V34.1H515.122V43.6H505.422V69.8C505.422 73.8667 507.355 75.9 511.222 75.9L514.922 75.4Z" fill="black"/>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 5.2 KiB |
BIN
doc/assets/images/humanfirst.ai-dark-theme.png
Normal file
|
After Width: | Height: | Size: 23 KiB |
BIN
doc/assets/images/humanfirst.png
Normal file
|
After Width: | Height: | Size: 102 KiB |
BIN
doc/assets/images/nuclia-dark-theme.png
Normal file
|
After Width: | Height: | Size: 7.8 KiB |
BIN
doc/assets/images/searchbenchmark.png
Normal file
|
After Width: | Height: | Size: 653 KiB |
@@ -1,12 +1,11 @@
|
||||
# Summary
|
||||
|
||||
|
||||
|
||||
[Avant Propos](./avant-propos.md)
|
||||
|
||||
- [Segments](./basis.md)
|
||||
- [Defining your schema](./schema.md)
|
||||
- [Facetting](./facetting.md)
|
||||
- [Index Sorting](./index_sorting.md)
|
||||
- [Innerworkings](./innerworkings.md)
|
||||
- [Inverted index](./inverted_index.md)
|
||||
- [Best practise](./inverted_index.md)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
> Tantivy is a **search** engine **library** for Rust.
|
||||
|
||||
If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and
|
||||
they both have the same scope and targetted use cases.
|
||||
they both have the same scope and targeted use cases.
|
||||
|
||||
If you are not familiar with Lucene, let's break down our little tagline.
|
||||
|
||||
@@ -31,4 +31,4 @@ relevancy, collapsing, highlighting, spatial search.
|
||||
index from a different format.
|
||||
|
||||
Tantivy exposes a lot of low level API to do all of these things.
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ directory shipped with tantivy is the `MmapDirectory`.
|
||||
While this design has some downsides, this greatly simplifies the source code of
|
||||
tantivy. Caching is also entirely delegated to the OS.
|
||||
|
||||
`tantivy` works entirely (or almost) by directly reading the datastructures as they are layed on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
|
||||
`tantivy` works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
|
||||
|
||||
This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.
|
||||
|
||||
@@ -22,7 +22,6 @@ Of course this is crucial to reduce IO, and ensure that as much of our index can
|
||||
Also, whenever possible its data is accessed sequentially. Of course, this is an amazing property when tantivy needs to access the data from your spinning hard disk, but this is also
|
||||
critical for performance, if your data is read from and an `SSD` or even already in your pagecache.
|
||||
|
||||
|
||||
## Segments, and the log method
|
||||
|
||||
That kind of compact layout comes at one cost: it prevents our datastructures from being dynamic.
|
||||
@@ -53,11 +52,7 @@ to get tantivy to fit your use case:
|
||||
|
||||
*Example 2* You could also disable your merge policy and enforce daily segments. Removing data after one week can then be done very efficiently by just editing the `meta.json` and deleting the files associated to segment `D-7`.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Merging
|
||||
## Merging
|
||||
|
||||
As you index more and more data, your index will accumulate more and more segments.
|
||||
Having a lot of small segments is not really optimal. There is a bit of redundancy in having
|
||||
@@ -66,11 +61,7 @@ all these term dictionary. Also when searching, we will need to do term lookups
|
||||
That's where merging or compacting comes into place. Tantivy will continuously consider merge
|
||||
opportunities and start merging segments in the background.
|
||||
|
||||
|
||||
# Indexing throughput, number of indexing threads
|
||||
|
||||
|
||||
|
||||
## Indexing throughput, number of indexing threads
|
||||
|
||||
[^1]: This may eventually change.
|
||||
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
# Examples
|
||||
|
||||
- [Basic search](/examples/basic_search.html)
|
||||
- [Basic search](/examples/basic_search.html)
|
||||
|
||||
62
doc/src/index_sorting.md
Normal file
@@ -0,0 +1,62 @@
|
||||
|
||||
- [Index Sorting](#index-sorting)
|
||||
- [Why Sorting](#why-sorting)
|
||||
- [Compression](#compression)
|
||||
- [Top-N Optimization](#top-n-optimization)
|
||||
- [Pruning](#pruning)
|
||||
- [Other](#other)
|
||||
- [Usage](#usage)
|
||||
|
||||
# Index Sorting
|
||||
|
||||
Tantivy allows you to sort the index according to a property.
|
||||
|
||||
## Why Sorting
|
||||
|
||||
Presorting an index has several advantages:
|
||||
|
||||
### Compression
|
||||
|
||||
When data is sorted it is easier to compress the data. E.g. the numbers sequence [5, 2, 3, 1, 4] would be sorted to [1, 2, 3, 4, 5].
|
||||
If we apply delta encoding this list would be unsorted [5, -3, 1, -2, 3] vs. [1, 1, 1, 1, 1].
|
||||
Compression ratio is mainly affected on the fast field of the sorted property, every thing else is likely unaffected.
|
||||
|
||||
### Top-N Optimization
|
||||
|
||||
When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
|
||||
E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids.
|
||||
|
||||
Note: Tantivy 0.16 does not do this optimization yet.
|
||||
|
||||
### Pruning
|
||||
|
||||
Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter.
|
||||
|
||||
Note: Tantivy 0.16 does not do this optimization yet.
|
||||
|
||||
### Other?
|
||||
|
||||
In principle there are many algorithms possible that exploit the monotonically increasing nature. (aggregations maybe?)
|
||||
|
||||
## Usage
|
||||
|
||||
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of Tantivy 0.16 only fast fields are allowed to be used.
|
||||
|
||||
```rust
|
||||
let settings = IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
};
|
||||
let mut index_builder = Index::builder().schema(schema);
|
||||
index_builder = index_builder.settings(settings);
|
||||
let index = index_builder.create_in_ram().unwrap();
|
||||
```
|
||||
|
||||
## Implementation details
|
||||
|
||||
Sorting an index is applied in the serialization step. In general there are two serialization steps: [Finishing a single segment](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).
|
||||
|
||||
In both cases we generate a docid mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fastfields, posting list, normfield, facets).
|
||||
130
doc/src/json.md
Normal file
@@ -0,0 +1,130 @@
|
||||
# Json
|
||||
|
||||
As of tantivy 0.17, tantivy supports a json object type.
|
||||
This type can be used to allow for a schema-less search index.
|
||||
|
||||
When indexing a json object, we "flatten" the JSON. This operation emits terms that represent a triplet `(json_path, value_type, value)`
|
||||
|
||||
For instance, if user is a json field, the following document:
|
||||
|
||||
```json
|
||||
{
|
||||
"user": {
|
||||
"name": "Paul Masurel",
|
||||
"address": {
|
||||
"city": "Tokyo",
|
||||
"country": "Japan"
|
||||
},
|
||||
"created_at": "2018-11-12T23:20:50.52Z"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
emits the following tokens:
|
||||
|
||||
- ("name", Text, "Paul")
|
||||
- ("name", Text, "Masurel")
|
||||
- ("address.city", Text, "Tokyo")
|
||||
- ("address.country", Text, "Japan")
|
||||
- ("created_at", Date, 15420648505)
|
||||
|
||||
## Bytes-encoding and lexicographical sort
|
||||
|
||||
Like any other terms, these triplets are encoded into a binary format as follows.
|
||||
|
||||
- `json_path`: the json path is a sequence of "segments". In the example above, `address.city`
|
||||
is just a debug representation of the json path `["address", "city"]`.
|
||||
Its representation is done by separating segments by a unicode char `\x01`, and ending the path by `\x00`.
|
||||
- `value type`: One byte represents the `Value` type.
|
||||
- `value`: The value representation is just the regular Value representation.
|
||||
|
||||
This representation is designed to align the natural sort of Terms with the lexicographical sort
|
||||
of their binary representation (Tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
|
||||
|
||||
In the example above, the terms will be sorted as
|
||||
|
||||
- ("address.city", Text, "Tokyo")
|
||||
- ("address.country", Text, "Japan")
|
||||
- ("name", Text, "Masurel")
|
||||
- ("name", Text, "Paul")
|
||||
- ("created_at", Date, 15420648505)
|
||||
|
||||
As seen in "pitfalls", we may end up having to search for a value for a same path in several different fields. Putting the field code after the path makes it maximizes compression opportunities but also increases the chances for the two terms to end up in the actual same term dictionary block.
|
||||
|
||||
## Pitfalls, limitation and corner cases
|
||||
|
||||
Json gives very little information about the type of the literals it stores.
|
||||
All numeric types end up mapped as a "Number" and there are no types for dates.
|
||||
|
||||
At indexing, tantivy will try to interpret number and strings as different type with a
|
||||
priority order.
|
||||
|
||||
Numbers will be interpreted as u64, i64 and f64 in that order.
|
||||
Strings will be interpreted as rfc3999 dates or simple strings.
|
||||
|
||||
The first working type is picked and is the only term that is emitted for indexing.
|
||||
Note this interpretation happens on a per-document basis, and there is no effort to try to sniff
|
||||
a consistent field type at the scale of a segment.
|
||||
|
||||
On the query parser side on the other hand, we may end up emitting more than one type.
|
||||
For instance, we do not even know if the type is a number or string based.
|
||||
|
||||
So the query
|
||||
|
||||
```rust
|
||||
my_path.my_segment:233
|
||||
```
|
||||
|
||||
Will be interpreted as
|
||||
|
||||
```rust
|
||||
(my_path.my_segment, String, 233) or (my_path.my_segment, u64, 233)
|
||||
```
|
||||
|
||||
Likewise, we need to emit two tokens if the query contains an rfc3999 date.
|
||||
Indeed the date could have been actually a single token inside the text of a document at ingestion time. Generally speaking, we will always at least emit a string token in query parsing, and sometimes more.
|
||||
|
||||
If one more json field is defined, things get even more complicated.
|
||||
|
||||
## Default json field
|
||||
|
||||
If the schema contains a text field called "text" and a json field that is set as a default field:
|
||||
`text:hello` could be reasonably interpreted as targeting the text field or as targeting the json field called `json_dynamic` with the json_path "text".
|
||||
|
||||
If there is such an ambiguity, we decide to only search in the "text" field: `text:hello`.
|
||||
|
||||
In other words, the parser will not search in default json fields if there is a schema hit.
|
||||
This is a product decision.
|
||||
|
||||
The user can still target the JSON field by specifying its name explicitly:
|
||||
`json_dynamic.text:hello`.
|
||||
|
||||
## Range queries are not supported
|
||||
|
||||
Json field do not support range queries.
|
||||
|
||||
## Arrays do not work like nested object
|
||||
|
||||
If json object contains an array, a search query might return more documents
|
||||
than what might be expected.
|
||||
|
||||
Let's take an example.
|
||||
|
||||
```json
|
||||
{
|
||||
"cart_id": 3234234 ,
|
||||
"cart": [
|
||||
{"product_type": "sneakers", "attributes": {"color": "white"} },
|
||||
{"product_type": "t-shirt", "attributes": {"color": "red"}},
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Despite the array structure, a document in tantivy is a bag of terms.
|
||||
The query:
|
||||
|
||||
```rust
|
||||
cart.product_type:sneakers AND cart.attributes.color:red
|
||||
```
|
||||
|
||||
Actually match the document above.
|
||||
130
examples/aggregation.rs
Normal file
@@ -0,0 +1,130 @@
|
||||
// # Aggregation example
|
||||
//
|
||||
// This example shows how you can use built-in aggregations.
|
||||
// We will use range buckets and compute the average in each bucket.
|
||||
//
|
||||
|
||||
use serde_json::Value;
|
||||
use tantivy::aggregation::agg_req::{
|
||||
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
|
||||
RangeAggregation,
|
||||
};
|
||||
use tantivy::aggregation::agg_result::AggregationResults;
|
||||
use tantivy::aggregation::metric::AverageAggregation;
|
||||
use tantivy::aggregation::AggregationCollector;
|
||||
use tantivy::query::TermQuery;
|
||||
use tantivy::schema::{self, Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
|
||||
use tantivy::{doc, Index, Term};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_fieldtype = schema::TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let highscore_field = schema_builder.add_f64_field("highscore", score_fieldtype.clone());
|
||||
let price_field = schema_builder.add_f64_field("price", score_fieldtype.clone());
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
//
|
||||
// Lets index a bunch of documents for this example.
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
highscore_field => 1f64,
|
||||
price_field => 0f64,
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
highscore_field => 3f64,
|
||||
price_field => 1f64,
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
highscore_field => 5f64,
|
||||
price_field => 1f64,
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "nohit",
|
||||
highscore_field => 6f64,
|
||||
price_field => 2f64,
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
highscore_field => 7f64,
|
||||
price_field => 2f64,
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
highscore_field => 11f64,
|
||||
price_field => 10f64,
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
highscore_field => 14f64,
|
||||
price_field => 15f64,
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
highscore_field => 15f64,
|
||||
price_field => 20f64,
|
||||
))?;
|
||||
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let sub_agg_req_1: Aggregations = vec![(
|
||||
"average_price".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Average(
|
||||
AverageAggregation::from_field_name("price".to_string()),
|
||||
)),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let agg_req_1: Aggregations = vec![(
|
||||
"score_ranges".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "highscore".to_string(),
|
||||
ranges: vec![
|
||||
(-1f64..9f64).into(),
|
||||
(9f64..14f64).into(),
|
||||
(14f64..20f64).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: sub_agg_req_1.clone(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::to_value(&agg_res)?;
|
||||
println!("{}", serde_json::to_string_pretty(&res)?);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -73,7 +73,7 @@ fn main() -> tantivy::Result<()> {
|
||||
// multithreaded.
|
||||
//
|
||||
// Here we give tantivy a budget of `50MB`.
|
||||
// Using a bigger heap for the indexer may increase
|
||||
// Using a bigger memory_arena for the indexer may increase
|
||||
// throughput, but 50 MB is already plenty.
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
@@ -91,12 +91,12 @@ fn main() -> tantivy::Result<()> {
|
||||
old_man_doc.add_text(title, "The Old Man and the Sea");
|
||||
old_man_doc.add_text(
|
||||
body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.",
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
|
||||
eighty-four days now without taking a fish.",
|
||||
);
|
||||
|
||||
// ... and add it to the `IndexWriter`.
|
||||
index_writer.add_document(old_man_doc);
|
||||
index_writer.add_document(old_man_doc)?;
|
||||
|
||||
// For convenience, tantivy also comes with a macro to
|
||||
// reduce the boilerplate above.
|
||||
@@ -110,7 +110,7 @@ fn main() -> tantivy::Result<()> {
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
))?;
|
||||
|
||||
// Multivalued field just need to be repeated.
|
||||
index_writer.add_document(doc!(
|
||||
@@ -120,7 +120,7 @@ fn main() -> tantivy::Result<()> {
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
));
|
||||
))?;
|
||||
|
||||
// This is an example, so we will only index 3 documents
|
||||
// here. You can check out tantivy's tutorial to index
|
||||
|
||||
@@ -7,13 +7,13 @@
|
||||
// Of course, you can have a look at the tantivy's built-in collectors
|
||||
// such as the `CountCollector` for more examples.
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use tantivy::collector::{Collector, SegmentCollector};
|
||||
use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use tantivy::fastfield::DynamicFastFieldReader;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
||||
use tantivy::schema::{Field, Schema, FAST, INDEXED, TEXT};
|
||||
use tantivy::{doc, Index, Score, SegmentReader};
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -86,12 +86,10 @@ impl Collector for StatsCollector {
|
||||
|
||||
fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
|
||||
let mut stats = Stats::default();
|
||||
for segment_stats_opt in segment_stats {
|
||||
if let Some(segment_stats) = segment_stats_opt {
|
||||
stats.count += segment_stats.count;
|
||||
stats.sum += segment_stats.sum;
|
||||
stats.squared_sum += segment_stats.squared_sum;
|
||||
}
|
||||
for segment_stats in segment_stats.into_iter().flatten() {
|
||||
stats.count += segment_stats.count;
|
||||
stats.sum += segment_stats.sum;
|
||||
stats.squared_sum += segment_stats.squared_sum;
|
||||
}
|
||||
Ok(stats.non_zero_count())
|
||||
}
|
||||
@@ -106,7 +104,7 @@ impl SegmentCollector for StatsSegmentCollector {
|
||||
type Fruit = Option<Stats>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: Score) {
|
||||
let value = self.fast_field_reader.get(doc) as f64;
|
||||
let value = self.fast_field_reader.get_val(doc as u64) as f64;
|
||||
self.stats.count += 1;
|
||||
self.stats.sum += value;
|
||||
self.stats.squared_sum += value * value;
|
||||
@@ -147,23 +145,23 @@ fn main() -> tantivy::Result<()> {
|
||||
product_description => "While it is ok for short distance travel, this broom \
|
||||
was designed quiditch. It will up your game.",
|
||||
price => 30_200u64
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
product_name => "Turbulobroom",
|
||||
product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
|
||||
You'll enjoy its sharp turns, and rapid acceleration",
|
||||
price => 29_240u64
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
product_name => "Broomio",
|
||||
product_description => "Great value for the price. This broom is a market favorite",
|
||||
price => 21_240u64
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
product_name => "Whack a Mole",
|
||||
product_description => "Prime quality bat.",
|
||||
price => 5_200u64
|
||||
));
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
|
||||
@@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {
|
||||
// for your unit tests... Or this example.
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
// here we are registering our custome tokenizer
|
||||
// here we are registering our custom tokenizer
|
||||
// this will store tokens of 3 characters each
|
||||
index
|
||||
.tokenizers()
|
||||
@@ -62,13 +62,13 @@ fn main() -> tantivy::Result<()> {
|
||||
// multithreaded.
|
||||
//
|
||||
// Here we use a buffer of 50MB per thread. Using a bigger
|
||||
// heap for the indexer can increase its throughput.
|
||||
// memory arena for the indexer can increase its throughput.
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
index_writer.add_document(doc!(
|
||||
title => "The Old Man and the Sea",
|
||||
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish."
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
|
||||
@@ -79,14 +79,14 @@ fn main() -> tantivy::Result<()> {
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent
|
||||
limbs and branches that arch over the pool"#
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and
|
||||
increasing confidence in the success of my undertaking."#
|
||||
));
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
|
||||
69
examples/date_time_field.rs
Normal file
@@ -0,0 +1,69 @@
|
||||
// # DateTime field example
|
||||
//
|
||||
// This example shows how the DateTime field can be used
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Cardinality, DateOptions, Schema, Value, INDEXED, STORED, STRING};
|
||||
use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
let mut schema_builder = Schema::builder();
|
||||
let opts = DateOptions::from(INDEXED)
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_precision(tantivy::DatePrecision::Seconds);
|
||||
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
|
||||
let event_type = schema_builder.add_text_field("event", STRING | STORED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
let doc = schema.parse_document(
|
||||
r#"{
|
||||
"occurred_at": "2022-06-22T12:53:50.53Z",
|
||||
"event": "pull-request"
|
||||
}"#,
|
||||
)?;
|
||||
index_writer.add_document(doc)?;
|
||||
let doc = schema.parse_document(
|
||||
r#"{
|
||||
"occurred_at": "2022-06-22T13:00:00.22Z",
|
||||
"event": "comment"
|
||||
}"#,
|
||||
)?;
|
||||
index_writer.add_document(doc)?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// # Default fields: event_type
|
||||
let query_parser = QueryParser::for_index(&index, vec![event_type]);
|
||||
{
|
||||
let query = query_parser.parse_query("event:comment")?;
|
||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
|
||||
assert_eq!(count_docs.len(), 1);
|
||||
}
|
||||
{
|
||||
let query = query_parser
|
||||
.parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
|
||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
|
||||
assert_eq!(count_docs.len(), 1);
|
||||
for (_score, doc_address) in count_docs {
|
||||
let retrieved_doc = searcher.doc(doc_address)?;
|
||||
assert!(matches!(
|
||||
retrieved_doc.get_first(occurred_at),
|
||||
Some(Value::Date(_))
|
||||
));
|
||||
assert_eq!(
|
||||
schema.to_json(&retrieved_doc),
|
||||
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -56,8 +56,9 @@ fn main() -> tantivy::Result<()> {
|
||||
// If it is `text`, let's make sure to keep it `raw` and let's avoid
|
||||
// running any text processing on it.
|
||||
// This is done by associating this field to the tokenizer named `raw`.
|
||||
// Rather than building our [`TextOptions`](//docs.rs/tantivy/~0/tantivy/schema/struct.TextOptions.html) manually,
|
||||
// We use the `STRING` shortcut. `STRING` stands for indexed (without term frequency or positions)
|
||||
// Rather than building our
|
||||
// [`TextOptions`](//docs.rs/tantivy/~0/tantivy/schema/struct.TextOptions.html) manually, We
|
||||
// use the `STRING` shortcut. `STRING` stands for indexed (without term frequency or positions)
|
||||
// and untokenized.
|
||||
//
|
||||
// Because we also want to be able to see this `id` in our returned documents,
|
||||
@@ -76,15 +77,15 @@ fn main() -> tantivy::Result<()> {
|
||||
index_writer.add_document(doc!(
|
||||
isbn => "978-0099908401",
|
||||
title => "The old Man and the see"
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
isbn => "978-0140177398",
|
||||
title => "Of Mice and Men",
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankentein", //< Oops there is a typo here.
|
||||
isbn => "978-9176370711",
|
||||
));
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
|
||||
@@ -122,7 +123,7 @@ fn main() -> tantivy::Result<()> {
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
isbn => "978-9176370711",
|
||||
));
|
||||
))?;
|
||||
|
||||
// You are guaranteed that your clients will only observe your index in
|
||||
// the state it was in after a commit.
|
||||
|
||||
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
let name = schema_builder.add_text_field("felin_name", TEXT | STORED);
|
||||
// this is our faceted field: its scientific classification
|
||||
let classification = schema_builder.add_facet_field("classification", INDEXED);
|
||||
let classification = schema_builder.add_facet_field("classification", FacetOptions::default());
|
||||
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -35,35 +35,35 @@ fn main() -> tantivy::Result<()> {
|
||||
index_writer.add_document(doc!(
|
||||
name => "Cat",
|
||||
classification => Facet::from("/Felidae/Felinae/Felis")
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
name => "Canada lynx",
|
||||
classification => Facet::from("/Felidae/Felinae/Lynx")
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
name => "Cheetah",
|
||||
classification => Facet::from("/Felidae/Felinae/Acinonyx")
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
name => "Tiger",
|
||||
classification => Facet::from("/Felidae/Pantherinae/Panthera")
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
name => "Lion",
|
||||
classification => Facet::from("/Felidae/Pantherinae/Panthera")
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
name => "Jaguar",
|
||||
classification => Facet::from("/Felidae/Pantherinae/Panthera")
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
name => "Sunda clouded leopard",
|
||||
classification => Facet::from("/Felidae/Pantherinae/Neofelis")
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
name => "Fossa",
|
||||
classification => Facet::from("/Eupleridae/Cryptoprocta")
|
||||
));
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
|
||||
@@ -1,15 +1,15 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::doc;
|
||||
use tantivy::query::BooleanQuery;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::{DocId, Index, Score, SegmentReader};
|
||||
use tantivy::{doc, DocId, Index, Score, SegmentReader};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let title = schema_builder.add_text_field("title", STORED);
|
||||
let ingredient = schema_builder.add_facet_field("ingredient", INDEXED);
|
||||
let ingredient = schema_builder.add_facet_field("ingredient", FacetOptions::default());
|
||||
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -20,14 +20,14 @@ fn main() -> tantivy::Result<()> {
|
||||
title => "Fried egg",
|
||||
ingredient => Facet::from("/ingredient/egg"),
|
||||
ingredient => Facet::from("/ingredient/oil"),
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
title => "Scrambled egg",
|
||||
ingredient => Facet::from("/ingredient/egg"),
|
||||
ingredient => Facet::from("/ingredient/butter"),
|
||||
ingredient => Facet::from("/ingredient/milk"),
|
||||
ingredient => Facet::from("/ingredient/salt"),
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
title => "Egg rolls",
|
||||
ingredient => Facet::from("/ingredient/egg"),
|
||||
@@ -36,7 +36,7 @@ fn main() -> tantivy::Result<()> {
|
||||
ingredient => Facet::from("/ingredient/oil"),
|
||||
ingredient => Facet::from("/ingredient/tortilla-wrap"),
|
||||
ingredient => Facet::from("/ingredient/mushroom"),
|
||||
));
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
@@ -87,7 +87,7 @@ fn main() -> tantivy::Result<()> {
|
||||
.unwrap()
|
||||
.get_first(title)
|
||||
.unwrap()
|
||||
.text()
|
||||
.as_text()
|
||||
.unwrap()
|
||||
.to_owned()
|
||||
})
|
||||
|
||||
@@ -7,7 +7,7 @@ use tantivy::query::RangeQuery;
|
||||
use tantivy::schema::{Schema, INDEXED};
|
||||
use tantivy::{doc, Index, Result};
|
||||
|
||||
fn run() -> Result<()> {
|
||||
fn main() -> Result<()> {
|
||||
// For the sake of simplicity, this schema will only have 1 field
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -19,7 +19,7 @@ fn run() -> Result<()> {
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
|
||||
for year in 1950u64..2019u64 {
|
||||
index_writer.add_document(doc!(year_field => year));
|
||||
index_writer.add_document(doc!(year_field => year))?;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
// The index will be a range of years
|
||||
@@ -33,7 +33,3 @@ fn run() -> Result<()> {
|
||||
assert_eq!(num_60s_books, 10);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() {
|
||||
run().unwrap()
|
||||
}
|
||||
|
||||
@@ -25,9 +25,9 @@ fn main() -> tantivy::Result<()> {
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
|
||||
index_writer.add_document(doc!(title => "The Old Man and the Sea"));
|
||||
index_writer.add_document(doc!(title => "Of Mice and Men"));
|
||||
index_writer.add_document(doc!(title => "The modern Promotheus"));
|
||||
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
|
||||
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
|
||||
index_writer.add_document(doc!(title => "The modern Promotheus"))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
@@ -52,11 +52,11 @@ fn main() -> tantivy::Result<()> {
|
||||
let term_the = Term::from_field_text(title, "the");
|
||||
|
||||
// This segment posting object is like a cursor over the documents matching the term.
|
||||
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies
|
||||
// and positions.
|
||||
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term
|
||||
// frequencies and positions.
|
||||
//
|
||||
// If you don't need all this information, you may get better performance by decompressing less
|
||||
// information.
|
||||
// If you don't need all this information, you may get better performance by decompressing
|
||||
// less information.
|
||||
if let Some(mut segment_postings) =
|
||||
inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)?
|
||||
{
|
||||
@@ -109,11 +109,11 @@ fn main() -> tantivy::Result<()> {
|
||||
let inverted_index = segment_reader.inverted_index(title)?;
|
||||
|
||||
// This segment posting object is like a cursor over the documents matching the term.
|
||||
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies
|
||||
// and positions.
|
||||
// The `IndexRecordOption` arguments tells tantivy we will be interested in both term
|
||||
// frequencies and positions.
|
||||
//
|
||||
// If you don't need all this information, you may get better performance by decompressing less
|
||||
// information.
|
||||
// If you don't need all this information, you may get better performance by decompressing
|
||||
// less information.
|
||||
if let Some(mut block_segment_postings) =
|
||||
inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)?
|
||||
{
|
||||
|
||||
105
examples/json_field.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
// # Json field example
|
||||
//
|
||||
// This example shows how the json field can be used
|
||||
// to make tantivy partially schemaless by setting it as
|
||||
// default query parser field.
|
||||
|
||||
use tantivy::collector::{Count, TopDocs};
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
|
||||
use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_date_field("timestamp", FAST | STORED);
|
||||
let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
|
||||
let attributes = schema_builder.add_json_field("attributes", STORED | TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
let doc = schema.parse_document(
|
||||
r#"{
|
||||
"timestamp": "2022-02-22T23:20:50.53Z",
|
||||
"event_type": "click",
|
||||
"attributes": {
|
||||
"target": "submit-button",
|
||||
"cart": {"product_id": 103},
|
||||
"description": "the best vacuum cleaner ever"
|
||||
}
|
||||
}"#,
|
||||
)?;
|
||||
index_writer.add_document(doc)?;
|
||||
let doc = schema.parse_document(
|
||||
r#"{
|
||||
"timestamp": "2022-02-22T23:20:51.53Z",
|
||||
"event_type": "click",
|
||||
"attributes": {
|
||||
"target": "submit-button",
|
||||
"cart": {"product_id": 133},
|
||||
"description": "das keyboard",
|
||||
"event_type": "holiday-sale"
|
||||
}
|
||||
}"#,
|
||||
)?;
|
||||
index_writer.add_document(doc)?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// # Default fields: event_type and attributes
|
||||
// By setting attributes as a default field it allows omitting attributes itself, e.g. "target",
|
||||
// instead of "attributes.target"
|
||||
let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
|
||||
{
|
||||
let query = query_parser.parse_query("target:submit-button")?;
|
||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(count_docs.len(), 2);
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("target:submit")?;
|
||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(count_docs.len(), 2);
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("cart.product_id:103")?;
|
||||
let count_docs = searcher.search(&*query, &Count)?;
|
||||
assert_eq!(count_docs, 1);
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("click AND cart.product_id:133")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 1);
|
||||
}
|
||||
{
|
||||
// The sub-fields in the json field marked as default field still need to be explicitly
|
||||
// addressed
|
||||
let query = query_parser.parse_query("click AND 133")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 0);
|
||||
}
|
||||
{
|
||||
// Default json fields are ignored if they collide with the schema
|
||||
let query = query_parser.parse_query("event_type:holiday-sale")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 0);
|
||||
}
|
||||
// # Query via full attribute path
|
||||
{
|
||||
// This only searches in our schema's `event_type` field
|
||||
let query = query_parser.parse_query("event_type:click")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 2);
|
||||
}
|
||||
{
|
||||
// Default json fields can still be accessed by full path
|
||||
let query = query_parser.parse_query("attributes.event_type:holiday-sale")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 1);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -28,8 +28,9 @@
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use tantivy::schema::{Schema, STORED, TEXT};
|
||||
use tantivy::{doc, Index, IndexWriter, Opstamp};
|
||||
use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
@@ -59,10 +60,11 @@ fn main() -> tantivy::Result<()> {
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
))?;
|
||||
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(20));
|
||||
}
|
||||
Result::<(), TantivyError>::Ok(())
|
||||
});
|
||||
|
||||
// # Second indexing thread.
|
||||
@@ -78,19 +80,21 @@ fn main() -> tantivy::Result<()> {
|
||||
index_writer_rlock.add_document(doc!(
|
||||
title => "Manufacturing consent",
|
||||
body => "Some great book description..."
|
||||
))
|
||||
))?
|
||||
};
|
||||
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(10));
|
||||
}
|
||||
Result::<(), TantivyError>::Ok(())
|
||||
});
|
||||
|
||||
// # In the main thread, we commit 10 times, once every 500ms.
|
||||
for _ in 0..10 {
|
||||
let opstamp: Opstamp = {
|
||||
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
|
||||
// Committing or rollbacking on the other hand requires write lock. This will block
|
||||
// other threads.
|
||||
let mut index_writer_wlock = index_writer.write().unwrap();
|
||||
index_writer_wlock.commit().unwrap()
|
||||
index_writer_wlock.commit()?
|
||||
};
|
||||
println!("committed with opstamp {}", opstamp);
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
|
||||
@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let old_man_doc = doc!(title => title_tok, body => body_tok);
|
||||
|
||||
// ... now let's just add it to the IndexWriter
|
||||
index_writer.add_document(old_man_doc);
|
||||
index_writer.add_document(old_man_doc)?;
|
||||
|
||||
// Pretokenized text can also be fed as JSON
|
||||
let short_man_json = r#"{
|
||||
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
let short_man_doc = schema.parse_document(short_man_json)?;
|
||||
|
||||
index_writer.add_document(short_man_doc);
|
||||
index_writer.add_document(short_man_doc)?;
|
||||
|
||||
// Let's commit changes
|
||||
index_writer.commit()?;
|
||||
@@ -106,9 +106,7 @@ fn main() -> tantivy::Result<()> {
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let (top_docs, count) = searcher
|
||||
.search(&query, &(TopDocs::with_limit(2), Count))
|
||||
.unwrap();
|
||||
let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
|
||||
|
||||
assert_eq!(count, 2);
|
||||
|
||||
@@ -129,9 +127,7 @@ fn main() -> tantivy::Result<()> {
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let (_top_docs, count) = searcher
|
||||
.search(&query, &(TopDocs::with_limit(2), Count))
|
||||
.unwrap();
|
||||
let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
|
||||
|
||||
assert_eq!(count, 0);
|
||||
|
||||
|
||||
@@ -40,7 +40,7 @@ fn main() -> tantivy::Result<()> {
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
))?;
|
||||
// ...
|
||||
index_writer.commit()?;
|
||||
|
||||
@@ -57,7 +57,10 @@ fn main() -> tantivy::Result<()> {
|
||||
let doc = searcher.doc(doc_address)?;
|
||||
let snippet = snippet_generator.snippet_from_doc(&doc);
|
||||
println!("Document score {}:", score);
|
||||
println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
|
||||
println!(
|
||||
"title: {}",
|
||||
doc.get_first(title).unwrap().as_text().unwrap()
|
||||
);
|
||||
println!("snippet: {}", snippet.to_html());
|
||||
println!("custom highlighting: {}", highlight(snippet));
|
||||
}
|
||||
@@ -70,13 +73,13 @@ fn highlight(snippet: Snippet) -> String {
|
||||
let mut start_from = 0;
|
||||
|
||||
for fragment_range in snippet.highlighted() {
|
||||
result.push_str(&snippet.fragments()[start_from..fragment_range.start]);
|
||||
result.push_str(&snippet.fragment()[start_from..fragment_range.start]);
|
||||
result.push_str(" --> ");
|
||||
result.push_str(&snippet.fragments()[fragment_range.clone()]);
|
||||
result.push_str(&snippet.fragment()[fragment_range.clone()]);
|
||||
result.push_str(" <-- ");
|
||||
start_from = fragment_range.end;
|
||||
}
|
||||
|
||||
result.push_str(&snippet.fragments()[start_from..]);
|
||||
result.push_str(&snippet.fragment()[start_from..]);
|
||||
result
|
||||
}
|
||||
|
||||
@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
|
||||
title => "The Old Man and the Sea",
|
||||
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish."
|
||||
));
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
@@ -80,7 +80,7 @@ fn main() -> tantivy::Result<()> {
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
@@ -88,7 +88,7 @@ fn main() -> tantivy::Result<()> {
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
));
|
||||
))?;
|
||||
|
||||
index_writer.commit()?;
|
||||
|
||||
|
||||
220
examples/warmer.rs
Normal file
@@ -0,0 +1,220 @@
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::sync::{Arc, RwLock, Weak};
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Field, Schema, FAST, TEXT};
|
||||
use tantivy::{
|
||||
doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId,
|
||||
SegmentReader, Warmer,
|
||||
};
|
||||
|
||||
// This example shows how warmers can be used to
|
||||
// load a values from an external sources using the Warmer API.
|
||||
//
|
||||
// In this example, we assume an e-commerce search engine.
|
||||
|
||||
type ProductId = u64;
|
||||
|
||||
/// Price
|
||||
type Price = u32;
|
||||
|
||||
pub trait PriceFetcher: Send + Sync + 'static {
|
||||
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
|
||||
}
|
||||
|
||||
struct DynamicPriceColumn {
|
||||
field: Field,
|
||||
price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
|
||||
price_fetcher: Box<dyn PriceFetcher>,
|
||||
}
|
||||
|
||||
impl DynamicPriceColumn {
|
||||
pub fn with_product_id_field<T: PriceFetcher>(field: Field, price_fetcher: T) -> Self {
|
||||
DynamicPriceColumn {
|
||||
field,
|
||||
price_cache: Default::default(),
|
||||
price_fetcher: Box::new(price_fetcher),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
|
||||
let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
|
||||
self.price_cache.read().unwrap().get(&segment_key).cloned()
|
||||
}
|
||||
}
|
||||
impl Warmer for DynamicPriceColumn {
|
||||
fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
|
||||
for segment in searcher.segment_readers() {
|
||||
let key = (segment.segment_id(), segment.delete_opstamp());
|
||||
let product_id_reader = segment.fast_fields().u64(self.field)?;
|
||||
let product_ids: Vec<ProductId> = segment
|
||||
.doc_ids_alive()
|
||||
.map(|doc| product_id_reader.get_val(doc as u64))
|
||||
.collect();
|
||||
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
|
||||
let mut price_vals: Vec<Price> = Vec::new();
|
||||
for doc in 0..segment.max_doc() {
|
||||
if segment.is_deleted(doc) {
|
||||
price_vals.push(0);
|
||||
} else {
|
||||
price_vals.push(prices_it.next().unwrap())
|
||||
}
|
||||
}
|
||||
self.price_cache
|
||||
.write()
|
||||
.unwrap()
|
||||
.insert(key, Arc::new(price_vals));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn garbage_collect(&self, live_generations: &[&SearcherGeneration]) {
|
||||
let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
|
||||
live_generations
|
||||
.iter()
|
||||
.flat_map(|gen| gen.segments())
|
||||
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
|
||||
.collect();
|
||||
let mut price_cache_wrt = self.price_cache.write().unwrap();
|
||||
// let price_cache = std::mem::take(&mut *price_cache_wrt);
|
||||
// Drain would be nicer here.
|
||||
*price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
|
||||
.into_iter()
|
||||
.filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
|
||||
.collect();
|
||||
}
|
||||
}
|
||||
|
||||
/// For the sake of this example, the table is just an editable HashMap behind a RwLock.
|
||||
/// This map represents a map (ProductId -> Price)
|
||||
///
|
||||
/// In practise, it could be fetching things from an external service, like a SQL table.
|
||||
#[derive(Default, Clone)]
|
||||
pub struct ExternalPriceTable {
|
||||
prices: Arc<RwLock<HashMap<ProductId, Price>>>,
|
||||
}
|
||||
|
||||
impl ExternalPriceTable {
|
||||
pub fn update_price(&self, product_id: ProductId, price: Price) {
|
||||
let mut prices_wrt = self.prices.write().unwrap();
|
||||
prices_wrt.insert(product_id, price);
|
||||
}
|
||||
}
|
||||
|
||||
impl PriceFetcher for ExternalPriceTable {
|
||||
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
|
||||
let prices_read = self.prices.read().unwrap();
|
||||
product_ids
|
||||
.iter()
|
||||
.map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// Declaring our schema.
|
||||
let mut schema_builder = Schema::builder();
|
||||
// The product id is assumed to be a primary id for our external price source.
|
||||
let product_id = schema_builder.add_u64_field("product_id", FAST);
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema: Schema = schema_builder.build();
|
||||
|
||||
let price_table = ExternalPriceTable::default();
|
||||
let price_dynamic_column = Arc::new(DynamicPriceColumn::with_product_id_field(
|
||||
product_id,
|
||||
price_table.clone(),
|
||||
));
|
||||
price_table.update_price(OLIVE_OIL, 12);
|
||||
price_table.update_price(GLOVES, 13);
|
||||
price_table.update_price(SNEAKERS, 80);
|
||||
|
||||
const OLIVE_OIL: ProductId = 323423;
|
||||
const GLOVES: ProductId = 3966623;
|
||||
const SNEAKERS: ProductId = 23222;
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
|
||||
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
|
||||
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
|
||||
writer.commit()?;
|
||||
|
||||
let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
|
||||
&(price_dynamic_column.clone() as Arc<dyn Warmer>),
|
||||
)];
|
||||
let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
|
||||
reader.reload()?;
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![text]);
|
||||
let query = query_parser.parse_query("cooking")?;
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let score_by_price = move |segment_reader: &SegmentReader| {
|
||||
let price = price_dynamic_column
|
||||
.price_for_segment(segment_reader)
|
||||
.unwrap();
|
||||
move |doc_id: DocId| Reverse(price[doc_id as usize])
|
||||
};
|
||||
|
||||
let most_expensive_first = TopDocs::with_limit(10).custom_score(score_by_price);
|
||||
|
||||
let hits = searcher.search(&query, &most_expensive_first)?;
|
||||
assert_eq!(
|
||||
&hits,
|
||||
&[
|
||||
(
|
||||
Reverse(12u32),
|
||||
DocAddress {
|
||||
segment_ord: 0,
|
||||
doc_id: 0u32
|
||||
}
|
||||
),
|
||||
(
|
||||
Reverse(13u32),
|
||||
DocAddress {
|
||||
segment_ord: 0,
|
||||
doc_id: 1u32
|
||||
}
|
||||
),
|
||||
]
|
||||
);
|
||||
|
||||
// Olive oil just got more expensive!
|
||||
price_table.update_price(OLIVE_OIL, 15);
|
||||
|
||||
// The price update are directly reflected on `reload`.
|
||||
//
|
||||
// Be careful here though!...
|
||||
// You may have spotted that we are still using the same `Searcher`.
|
||||
//
|
||||
// It is up to the `Warmer` implementer to decide how
|
||||
// to control this behavior.
|
||||
|
||||
reader.reload()?;
|
||||
|
||||
let hits_with_new_prices = searcher.search(&query, &most_expensive_first)?;
|
||||
assert_eq!(
|
||||
&hits_with_new_prices,
|
||||
&[
|
||||
(
|
||||
Reverse(13u32),
|
||||
DocAddress {
|
||||
segment_ord: 0,
|
||||
doc_id: 1u32
|
||||
}
|
||||
),
|
||||
(
|
||||
Reverse(15u32),
|
||||
DocAddress {
|
||||
segment_ord: 0,
|
||||
doc_id: 0u32
|
||||
}
|
||||
),
|
||||
]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,21 +1,23 @@
|
||||
[package]
|
||||
name = "fastfield_codecs"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
authors = ["Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2018"
|
||||
edition = "2021"
|
||||
description = "Fast field codecs used by tantivy"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
common = { path = "../common/" }
|
||||
tantivy-bitpacker = { path = "../bitpacker/" }
|
||||
prettytable-rs = {version="0.8.0", optional= true}
|
||||
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
|
||||
ownedbytes = { version = "0.3.0", path = "../ownedbytes" }
|
||||
prettytable-rs = {version="0.9.0", optional= true}
|
||||
rand = {version="0.8.3", optional= true}
|
||||
|
||||
[dev-dependencies]
|
||||
more-asserts = "0.2.1"
|
||||
more-asserts = "0.3.0"
|
||||
proptest = "1.0.0"
|
||||
rand = "0.8.3"
|
||||
|
||||
[features]
|
||||
|
||||
@@ -4,14 +4,10 @@ extern crate test;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use fastfield_codecs::{
|
||||
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
|
||||
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
|
||||
multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
},
|
||||
*,
|
||||
};
|
||||
use fastfield_codecs::bitpacked::BitpackedCodec;
|
||||
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
|
||||
use fastfield_codecs::linear::LinearCodec;
|
||||
use fastfield_codecs::*;
|
||||
|
||||
fn get_data() -> Vec<u64> {
|
||||
let mut data: Vec<_> = (100..55000_u64)
|
||||
@@ -29,72 +25,59 @@ mod tests {
|
||||
fn value_iter() -> impl Iterator<Item = u64> {
|
||||
0..20_000
|
||||
}
|
||||
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
b: &mut Bencher,
|
||||
data: &[u64],
|
||||
) {
|
||||
fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
|
||||
let mut bytes = vec![];
|
||||
S::serialize(
|
||||
&mut bytes,
|
||||
&data,
|
||||
stats_from_vec(data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
let reader = R::open_from_bytes(&bytes).unwrap();
|
||||
Codec::serialize(&mut bytes, &data).unwrap();
|
||||
let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
|
||||
b.iter(|| {
|
||||
let mut sum = 0u64;
|
||||
for pos in value_iter() {
|
||||
reader.get_u64(pos as u64, &bytes);
|
||||
let val = reader.get_val(pos as u64);
|
||||
debug_assert_eq!(data[pos as usize], val);
|
||||
sum = sum.wrapping_add(val);
|
||||
}
|
||||
sum
|
||||
});
|
||||
}
|
||||
fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) {
|
||||
let mut bytes = vec![];
|
||||
fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
|
||||
let mut bytes = Vec::new();
|
||||
b.iter(|| {
|
||||
S::serialize(
|
||||
&mut bytes,
|
||||
&data,
|
||||
stats_from_vec(data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
bytes.clear();
|
||||
Codec::serialize(&mut bytes, &data).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
use ownedbytes::OwnedBytes;
|
||||
use test::Bencher;
|
||||
#[bench]
|
||||
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<BitpackedFastFieldSerializer>(b, &data);
|
||||
bench_create::<BitpackedCodec>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
|
||||
bench_create::<LinearCodec>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
|
||||
bench_create::<BlockwiseLinearCodec>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
|
||||
bench_get::<BitpackedCodec>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
|
||||
bench_get::<LinearCodec>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
|
||||
b, &data,
|
||||
);
|
||||
bench_get::<BlockwiseLinearCodec>(b, &data);
|
||||
}
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
|
||||
@@ -1,41 +1,26 @@
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
use common::BinarySerializable;
|
||||
use ownedbytes::OwnedBytes;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{Column, FastFieldCodec, FastFieldCodecType};
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct BitpackedFastFieldReader {
|
||||
pub struct BitpackedReader {
|
||||
data: OwnedBytes,
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub min_value_u64: u64,
|
||||
pub max_value_u64: u64,
|
||||
min_value_u64: u64,
|
||||
max_value_u64: u64,
|
||||
num_vals: u64,
|
||||
}
|
||||
|
||||
impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
|
||||
let min_value = u64::deserialize(&mut footer)?;
|
||||
let amplitude = u64::deserialize(&mut footer)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(BitpackedFastFieldReader {
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
})
|
||||
}
|
||||
impl Column for BitpackedReader {
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
self.min_value_u64 + self.bit_unpacker.get(doc, data)
|
||||
fn get_val(&self, doc: u64) -> u64 {
|
||||
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
|
||||
}
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
@@ -45,16 +30,21 @@ impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value_u64
|
||||
}
|
||||
#[inline]
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.num_vals
|
||||
}
|
||||
}
|
||||
pub struct BitpackedFastFieldSerializerLegacy<'a, W: 'a + Write> {
|
||||
pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
num_vals: u64,
|
||||
amplitude: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> {
|
||||
impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
@@ -67,15 +57,16 @@ impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> {
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'a, W>> {
|
||||
) -> io::Result<BitpackedSerializerLegacy<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
let amplitude = max_value - min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(BitpackedFastFieldSerializerLegacy {
|
||||
Ok(BitpackedSerializerLegacy {
|
||||
bit_packer,
|
||||
write,
|
||||
min_value,
|
||||
num_vals: 0,
|
||||
amplitude,
|
||||
num_bits,
|
||||
})
|
||||
@@ -86,21 +77,45 @@ impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
self.num_vals += 1;
|
||||
Ok(())
|
||||
}
|
||||
pub fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)?;
|
||||
self.min_value.serialize(&mut self.write)?;
|
||||
self.amplitude.serialize(&mut self.write)?;
|
||||
self.num_vals.serialize(&mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BitpackedFastFieldSerializer {}
|
||||
pub struct BitpackedCodec;
|
||||
|
||||
impl FastFieldCodec for BitpackedCodec {
|
||||
/// The CODEC_TYPE is an enum value used for serialization.
|
||||
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked;
|
||||
|
||||
type Reader = BitpackedReader;
|
||||
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
|
||||
let footer_offset = bytes.len() - 24;
|
||||
let (data, mut footer) = bytes.split(footer_offset);
|
||||
let min_value = u64::deserialize(&mut footer)?;
|
||||
let amplitude = u64::deserialize(&mut footer)?;
|
||||
let num_vals = u64::deserialize(&mut footer)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(BitpackedReader {
|
||||
data,
|
||||
bit_unpacker,
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
num_vals,
|
||||
})
|
||||
}
|
||||
|
||||
impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
const NAME: &'static str = "Bitpacked";
|
||||
const ID: u8 = 1;
|
||||
/// Serializes data with the BitpackedFastFieldSerializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
@@ -109,51 +124,41 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer =
|
||||
BitpackedFastFieldSerializerLegacy::open(write, stats.min_value, stats.max_value)?;
|
||||
fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
|
||||
let mut serializer = BitpackedSerializerLegacy::open(
|
||||
write,
|
||||
fastfield_accessor.min_value(),
|
||||
fastfield_accessor.max_value(),
|
||||
)?;
|
||||
|
||||
for val in data_iter {
|
||||
for val in fastfield_accessor.iter() {
|
||||
serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
_stats: FastFieldStats,
|
||||
) -> bool {
|
||||
true
|
||||
}
|
||||
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
|
||||
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
|
||||
let amplitude = fastfield_accessor.max_value() - fastfield_accessor.min_value();
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
use crate::tests::get_codec_test_datasets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
crate::tests::create_and_validate::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(
|
||||
data, name,
|
||||
);
|
||||
crate::tests::create_and_validate::<BitpackedCodec>(data, name);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
let data_sets = get_codec_test_datasets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
|
||||
@@ -1,29 +1,38 @@
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use common::CountingWriter;
|
||||
//! The BlockwiseLinear codec uses linear interpolation to guess a values and stores the
|
||||
//! offset, but in blocks of 512.
|
||||
//!
|
||||
//! With a CHUNK_SIZE of 512 and 29 byte metadata per block, we get a overhead for metadata of 232 /
|
||||
//! 512 = 0,45 bits per element. The additional space required per element in a block is the the
|
||||
//! maximum deviation of the linear interpolation estimation function.
|
||||
//!
|
||||
//! E.g. if the maximum deviation of an element is 12, all elements cost 4bits.
|
||||
//!
|
||||
//! Size per block:
|
||||
//! Num Elements * Maximum Deviation from Interpolation + 29 Byte Metadata
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::DeserializeFrom;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
use common::{BinarySerializable, CountingWriter, DeserializeFrom};
|
||||
use ownedbytes::OwnedBytes;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::linear::{get_calculated_value, get_slope};
|
||||
use crate::{Column, FastFieldCodec, FastFieldCodecType};
|
||||
|
||||
const CHUNK_SIZE: u64 = 512;
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct MultiLinearInterpolFastFieldReader {
|
||||
pub footer: MultiLinearInterpolFooter,
|
||||
pub struct BlockwiseLinearReader {
|
||||
data: OwnedBytes,
|
||||
pub footer: BlockwiseLinearFooter,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct Function {
|
||||
// The offset in the data is required, because we have diffrent bit_widths per block
|
||||
// The offset in the data is required, because we have different bit_widths per block
|
||||
data_start_offset: u64,
|
||||
// start_pos in the block will be CHUNK_SIZE * BLOCK_NUM
|
||||
start_pos: u64,
|
||||
@@ -43,7 +52,7 @@ struct Function {
|
||||
impl Function {
|
||||
fn calc_slope(&mut self) {
|
||||
let num_vals = self.end_pos - self.start_pos;
|
||||
get_slope(self.value_start_pos, self.value_end_pos, num_vals);
|
||||
self.slope = get_slope(self.value_start_pos, self.value_end_pos, num_vals);
|
||||
}
|
||||
// split the interpolation into two function, change self and return the second split
|
||||
fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function {
|
||||
@@ -93,14 +102,14 @@ impl BinarySerializable for Function {
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct MultiLinearInterpolFooter {
|
||||
pub struct BlockwiseLinearFooter {
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
interpolations: Vec<Function>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for MultiLinearInterpolFooter {
|
||||
impl BinarySerializable for BlockwiseLinearFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
let mut out = vec![];
|
||||
self.num_vals.serialize(&mut out)?;
|
||||
@@ -112,8 +121,8 @@ impl BinarySerializable for MultiLinearInterpolFooter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<MultiLinearInterpolFooter> {
|
||||
let mut footer = MultiLinearInterpolFooter {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<BlockwiseLinearFooter> {
|
||||
let mut footer = BlockwiseLinearFooter {
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
@@ -137,26 +146,20 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
|
||||
&interpolations[get_interpolation_position(doc)]
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;
|
||||
|
||||
Ok(MultiLinearInterpolFastFieldReader { footer })
|
||||
}
|
||||
|
||||
impl Column for BlockwiseLinearReader {
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
|
||||
let doc = doc - interpolation.start_pos;
|
||||
let calculated_value =
|
||||
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
|
||||
let diff = interpolation
|
||||
.bit_unpacker
|
||||
.get(doc, &data[interpolation.data_start_offset as usize..]);
|
||||
fn get_val(&self, idx: u64) -> u64 {
|
||||
let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
|
||||
let in_block_idx = idx - interpolation.start_pos;
|
||||
let calculated_value = get_calculated_value(
|
||||
interpolation.value_start_pos,
|
||||
in_block_idx,
|
||||
interpolation.slope,
|
||||
);
|
||||
let diff = interpolation.bit_unpacker.get(
|
||||
in_block_idx,
|
||||
&self.data[interpolation.data_start_offset as usize..],
|
||||
);
|
||||
(calculated_value + diff) - interpolation.positive_val_offset
|
||||
}
|
||||
|
||||
@@ -168,39 +171,38 @@ impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
|
||||
fn max_value(&self) -> u64 {
|
||||
self.footer.max_value
|
||||
}
|
||||
#[inline]
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.footer.num_vals
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
|
||||
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
|
||||
}
|
||||
/// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||
pub struct BlockwiseLinearCodec;
|
||||
|
||||
#[inline]
|
||||
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
(first_val as i64 + (pos as f32 * slope) as i64) as u64
|
||||
}
|
||||
impl FastFieldCodec for BlockwiseLinearCodec {
|
||||
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear;
|
||||
|
||||
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||
pub struct MultiLinearInterpolFastFieldSerializer {}
|
||||
type Reader = BlockwiseLinearReader;
|
||||
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
let footer_offset = bytes.len() - 4 - footer_len as usize;
|
||||
let (data, mut footer) = bytes.split(footer_offset);
|
||||
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
|
||||
Ok(BlockwiseLinearReader { data, footer })
|
||||
}
|
||||
|
||||
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "MultiLinearInterpol";
|
||||
const ID: u8 = 3;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
|
||||
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
|
||||
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
|
||||
|
||||
let mut first_function = Function {
|
||||
end_pos: stats.num_vals,
|
||||
end_pos: fastfield_accessor.num_vals(),
|
||||
value_start_pos: first_val,
|
||||
value_end_pos: last_val,
|
||||
..Default::default()
|
||||
@@ -211,7 +213,7 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
// Since we potentially apply multiple passes over the data, the data is cached.
|
||||
// Multiple iteration can be expensive (merge with index sorting can add lot of overhead per
|
||||
// iteration)
|
||||
let data = data_iter.collect::<Vec<_>>();
|
||||
let data = fastfield_accessor.iter().collect::<Vec<_>>();
|
||||
|
||||
//// let's split this into chunks of CHUNK_SIZE
|
||||
for data_pos in (0..data.len() as u64).step_by(CHUNK_SIZE as usize).skip(1) {
|
||||
@@ -238,11 +240,11 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
);
|
||||
if calculated_value > actual_value {
|
||||
// negative value we need to apply an offset
|
||||
// we ignore negative values in the max value calculation, because negative values
|
||||
// will be offset to 0
|
||||
// we ignore negative values in the max value calculation, because negative
|
||||
// values will be offset to 0
|
||||
offset = offset.max(calculated_value - actual_value);
|
||||
} else {
|
||||
//positive value no offset reuqired
|
||||
// positive value no offset reuqired
|
||||
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
|
||||
}
|
||||
}
|
||||
@@ -274,49 +276,47 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = MultiLinearInterpolFooter {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
let footer = BlockwiseLinearFooter {
|
||||
num_vals: fastfield_accessor.num_vals(),
|
||||
min_value: fastfield_accessor.min_value(),
|
||||
max_value: fastfield_accessor.max_value(),
|
||||
interpolations,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 5_000 {
|
||||
return false;
|
||||
}
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algortihm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
#[allow(clippy::question_mark)]
|
||||
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
|
||||
if fastfield_accessor.num_vals() < 10 * CHUNK_SIZE {
|
||||
return None;
|
||||
}
|
||||
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algorithm should be fine
|
||||
let theorethical_maximum_offset =
|
||||
fastfield_accessor.max_value() - fastfield_accessor.min_value();
|
||||
if fastfield_accessor
|
||||
.max_value()
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(fastfield_accessor.num_vals());
|
||||
let last_val_in_first_block =
|
||||
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
|
||||
let slope = get_slope(
|
||||
first_val_in_first_block,
|
||||
last_val_in_first_block,
|
||||
stats.num_vals,
|
||||
fastfield_accessor.num_vals(),
|
||||
);
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only
|
||||
@@ -336,18 +336,18 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and extrapolate the cost to all blocks.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within 50%
|
||||
// threshold.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within
|
||||
// 50% threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
//
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * fastfield_accessor.num_vals() as u64
|
||||
// function metadata per block
|
||||
+ 29 * (stats.num_vals / CHUNK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
+ 29 * (fastfield_accessor.num_vals() / CHUNK_SIZE);
|
||||
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
|
||||
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -362,18 +362,44 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
use crate::tests::get_codec_test_datasets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
crate::tests::create_and_validate::<
|
||||
MultiLinearInterpolFastFieldSerializer,
|
||||
MultiLinearInterpolFastFieldReader,
|
||||
>(data, name);
|
||||
fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
|
||||
crate::tests::create_and_validate::<BlockwiseLinearCodec>(data, name)
|
||||
}
|
||||
|
||||
const HIGHEST_BIT: u64 = 1 << 63;
|
||||
pub fn i64_to_u64(val: i64) -> u64 {
|
||||
(val as u64) ^ HIGHEST_BIT
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compression_i64() {
|
||||
let data = (i64::MAX - 600_000..=i64::MAX - 550_000)
|
||||
.map(i64_to_u64)
|
||||
.collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large i64").unwrap();
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.15);
|
||||
assert!(actual_compression > 0.01);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large").unwrap();
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.15);
|
||||
assert!(actual_compression > 0.01);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
let data_sets = get_codec_test_datasets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
@@ -400,10 +426,9 @@ mod tests {
|
||||
fn rand() {
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u64>() as u64)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data, "random");
|
||||
|
||||
let _ = create_and_validate(&data, "random");
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
49
fastfield_codecs/src/column.rs
Normal file
@@ -0,0 +1,49 @@
|
||||
pub trait Column<T = u64> {
|
||||
/// Return the value associated to the given idx.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `idx` is greater than the column length.
|
||||
fn get_val(&self, idx: u64) -> T;
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// Regardless of the type of `Item`, this method works
|
||||
/// - transmuting the output array
|
||||
/// - extracting the `Item`s as if they were `u64`
|
||||
/// - possibly converting the `u64` value to the right type.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
fn get_range(&self, start: u64, output: &mut [T]) {
|
||||
for (out, idx) in output.iter_mut().zip(start..) {
|
||||
*out = self.get_val(idx);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take in account of possible
|
||||
/// deleted document, and should be considered as a lower bound
|
||||
/// of the actual minimum value.
|
||||
fn min_value(&self) -> T;
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value
|
||||
fn max_value(&self) -> T;
|
||||
|
||||
fn num_vals(&self) -> u64;
|
||||
/// Returns a iterator over the data
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
|
||||
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
|
||||
}
|
||||
}
|
||||
@@ -5,126 +5,187 @@ extern crate more_asserts;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use ownedbytes::OwnedBytes;
|
||||
|
||||
pub mod bitpacked;
|
||||
pub mod linearinterpol;
|
||||
pub mod multilinearinterpol;
|
||||
pub mod blockwise_linear;
|
||||
pub mod linear;
|
||||
|
||||
pub trait FastFieldCodecReader: Sized {
|
||||
/// reads the metadata and returns the CodecReader
|
||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
|
||||
mod column;
|
||||
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64;
|
||||
pub use self::column::Column;
|
||||
|
||||
fn min_value(&self) -> u64;
|
||||
fn max_value(&self) -> u64;
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
|
||||
#[repr(u8)]
|
||||
pub enum FastFieldCodecType {
|
||||
Bitpacked = 1,
|
||||
Linear = 2,
|
||||
BlockwiseLinear = 3,
|
||||
Gcd = 4,
|
||||
}
|
||||
|
||||
impl BinarySerializable for FastFieldCodecType {
|
||||
fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
|
||||
self.to_code().serialize(wrt)
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let code = u8::deserialize(reader)?;
|
||||
let codec_type: Self = Self::from_code(code)
|
||||
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
|
||||
Ok(codec_type)
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldCodecType {
|
||||
pub fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
pub fn from_code(code: u8) -> Option<Self> {
|
||||
match code {
|
||||
1 => Some(Self::Bitpacked),
|
||||
2 => Some(Self::Linear),
|
||||
3 => Some(Self::BlockwiseLinear),
|
||||
4 => Some(Self::Gcd),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
pub trait FastFieldCodecSerializer {
|
||||
pub trait FastFieldCodec {
|
||||
/// A codex needs to provide a unique name and id, which is
|
||||
/// used for debugging and de/serialization.
|
||||
const NAME: &'static str;
|
||||
const ID: u8;
|
||||
const CODEC_TYPE: FastFieldCodecType;
|
||||
|
||||
/// Check if the Codec is able to compress the data
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool;
|
||||
type Reader: Column<u64>;
|
||||
|
||||
/// Reads the metadata and returns the CodecReader
|
||||
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader>;
|
||||
|
||||
/// Serializes the data using the serializer into write.
|
||||
///
|
||||
/// The fastfield_accessor iterator should be preferred over using fastfield_accessor for
|
||||
/// performance reasons.
|
||||
fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column<u64>) -> io::Result<()>;
|
||||
|
||||
/// Returns an estimate of the compression ratio.
|
||||
/// If the codec is not applicable, returns `None`.
|
||||
///
|
||||
/// The baseline is uncompressed 64bit data.
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
|
||||
|
||||
/// Serializes the data using the serializer into write.
|
||||
/// There are multiple iterators, in case the codec needs to read the data multiple times.
|
||||
/// The iterators should be preferred over using fastfield_accessor for performance reasons.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
|
||||
pub trait FastFieldDataAccess {
|
||||
/// Return the value associated to the given position.
|
||||
///
|
||||
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `position` is greater than the index.
|
||||
fn get_val(&self, position: u64) -> u64;
|
||||
fn estimate(fastfield_accessor: &impl Column) -> Option<f32>;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
/// Statistics are used in codec detection and stored in the fast field footer.
|
||||
pub struct FastFieldStats {
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
pub num_vals: u64,
|
||||
}
|
||||
|
||||
impl<'a> FastFieldDataAccess for &'a [u64] {
|
||||
struct VecColum<'a>(&'a [u64]);
|
||||
impl<'a> Column for VecColum<'a> {
|
||||
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
self[position as usize]
|
||||
self.0[position as usize]
|
||||
}
|
||||
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
|
||||
Box::new(self.0.iter().cloned())
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
self.0.iter().min().cloned().unwrap_or(0)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.0.iter().max().cloned().unwrap_or(0)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.0.len() as u64
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldDataAccess for Vec<u64> {
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
self[position as usize]
|
||||
impl<'a> From<&'a [u64]> for VecColum<'a> {
|
||||
fn from(data: &'a [u64]) -> Self {
|
||||
Self(data)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
|
||||
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
|
||||
multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
},
|
||||
};
|
||||
use proptest::prelude::*;
|
||||
use proptest::strategy::Strategy;
|
||||
use proptest::{prop_oneof, proptest};
|
||||
|
||||
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
use crate::bitpacked::BitpackedCodec;
|
||||
use crate::blockwise_linear::BlockwiseLinearCodec;
|
||||
use crate::linear::LinearCodec;
|
||||
|
||||
pub fn create_and_validate<Codec: FastFieldCodec>(
|
||||
data: &[u64],
|
||||
name: &str,
|
||||
) -> (f32, f32) {
|
||||
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
|
||||
return (f32::MAX, 0.0);
|
||||
}
|
||||
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
&data,
|
||||
crate::tests::stats_from_vec(data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
) -> Option<(f32, f32)> {
|
||||
let estimation = Codec::estimate(&VecColum::from(data))?;
|
||||
|
||||
let reader = R::open_from_bytes(&out).unwrap();
|
||||
for (doc, orig_val) in data.iter().enumerate() {
|
||||
let val = reader.get_u64(doc as u64, &out);
|
||||
if val != *orig_val {
|
||||
panic!(
|
||||
"val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
|
||||
val, orig_val, name, data
|
||||
);
|
||||
}
|
||||
let mut out: Vec<u8> = Vec::new();
|
||||
Codec::serialize(&mut out, &VecColum::from(data)).unwrap();
|
||||
|
||||
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
|
||||
|
||||
let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap();
|
||||
assert_eq!(reader.num_vals(), data.len() as u64);
|
||||
for (doc, orig_val) in data.iter().copied().enumerate() {
|
||||
let val = reader.get_val(doc as u64);
|
||||
assert_eq!(
|
||||
val, orig_val,
|
||||
"val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
|
||||
`{data:?}`",
|
||||
);
|
||||
}
|
||||
let actual_compression = data.len() as f32 / out.len() as f32;
|
||||
(estimation, actual_compression)
|
||||
Some((estimation, actual_compression))
|
||||
}
|
||||
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(100))]
|
||||
#[test]
|
||||
fn test_proptest_small(data in proptest::collection::vec(num_strategy(), 1..10)) {
|
||||
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
|
||||
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
|
||||
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
|
||||
}
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(10))]
|
||||
#[test]
|
||||
fn test_proptest_large(data in proptest::collection::vec(num_strategy(), 1..6000)) {
|
||||
create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
|
||||
create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
|
||||
create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
|
||||
}
|
||||
|
||||
}
|
||||
fn num_strategy() -> impl Strategy<Value = u64> {
|
||||
prop_oneof![
|
||||
1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
|
||||
1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
|
||||
20 => prop::num::u64::ANY,
|
||||
]
|
||||
}
|
||||
|
||||
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
let mut data_and_names = vec![];
|
||||
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
let data = (10..=10_000_u64).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "simple monotonically increasing"));
|
||||
|
||||
data_and_names.push((
|
||||
@@ -134,92 +195,93 @@ mod tests {
|
||||
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
|
||||
data_and_names.push((vec![10], "single value"));
|
||||
|
||||
data_and_names.push((
|
||||
vec![1572656989877777, 1170935903116329, 720575940379279, 0],
|
||||
"overflow error",
|
||||
));
|
||||
|
||||
data_and_names
|
||||
}
|
||||
|
||||
fn test_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>() {
|
||||
let codec_name = S::NAME;
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let (estimate, actual) =
|
||||
crate::tests::create_and_validate::<S, R>(&data, data_set_name);
|
||||
let result = if estimate == f32::MAX {
|
||||
"Disabled".to_string()
|
||||
fn test_codec<C: FastFieldCodec>() {
|
||||
let codec_name = format!("{:?}", C::CODEC_TYPE);
|
||||
for (data, dataset_name) in get_codec_test_datasets() {
|
||||
let estimate_actual_opt: Option<(f32, f32)> =
|
||||
crate::tests::create_and_validate::<C>(&data, dataset_name);
|
||||
let result = if let Some((estimate, actual)) = estimate_actual_opt {
|
||||
format!("Estimate `{estimate}` Actual `{actual}`")
|
||||
} else {
|
||||
format!("Estimate {:?} Actual {:?} ", estimate, actual)
|
||||
"Disabled".to_string()
|
||||
};
|
||||
println!(
|
||||
"Codec {}, DataSet {}, {}",
|
||||
codec_name, data_set_name, result
|
||||
);
|
||||
println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_codec_bitpacking() {
|
||||
test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
|
||||
test_codec::<BitpackedCodec>();
|
||||
}
|
||||
#[test]
|
||||
fn test_codec_interpolation() {
|
||||
test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
|
||||
test_codec::<LinearCodec>();
|
||||
}
|
||||
#[test]
|
||||
fn test_codec_multi_interpolation() {
|
||||
test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
|
||||
test_codec::<BlockwiseLinearCodec>();
|
||||
}
|
||||
|
||||
use super::*;
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
let max_value = data.iter().cloned().max().unwrap_or(0);
|
||||
FastFieldStats {
|
||||
min_value,
|
||||
max_value,
|
||||
num_vals: data.len() as u64,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn estimation_good_interpolation_case() {
|
||||
let data = (10..=20000_u64).collect::<Vec<_>>();
|
||||
let data: VecColum = data.as_slice().into();
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
|
||||
assert_le!(linear_interpol_estimation, 0.01);
|
||||
|
||||
let multi_linear_interpol_estimation =
|
||||
MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
|
||||
assert_le!(multi_linear_interpol_estimation, 0.2);
|
||||
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
|
||||
assert_le!(linear_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case() {
|
||||
let data = vec![200, 10, 10, 10, 10, 1000, 20];
|
||||
let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let data: VecColum = data.into();
|
||||
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
|
||||
assert_le!(linear_interpol_estimation, 0.32);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
|
||||
let mut data = (200..=20000_u64).collect::<Vec<_>>();
|
||||
let mut data: Vec<u64> = (200..=20000_u64).collect();
|
||||
data.push(1_000_000);
|
||||
let data: VecColum = data.as_slice().into();
|
||||
|
||||
// in this case the linear interpolation can't in fact not be worse than bitpacking,
|
||||
// but the estimator adds some threshold, which leads to estimated worse behavior
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
|
||||
assert_le!(linear_interpol_estimation, 0.35);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
|
||||
assert_le!(bitpacked_estimation, 0.32);
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_fast_field_codec_type_to_code() {
|
||||
let mut count_codec = 0;
|
||||
for code in 0..=255 {
|
||||
if let Some(codec_type) = FastFieldCodecType::from_code(code) {
|
||||
assert_eq!(codec_type.to_code(), code);
|
||||
count_codec += 1;
|
||||
}
|
||||
}
|
||||
assert_eq!(count_codec, 4);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,27 +1,24 @@
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::FixedSize;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use ownedbytes::OwnedBytes;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{Column, FastFieldCodec, FastFieldCodecType};
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct LinearInterpolFastFieldReader {
|
||||
pub struct LinearReader {
|
||||
data: OwnedBytes,
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub footer: LinearInterpolFooter,
|
||||
pub footer: LinearFooter,
|
||||
pub slope: f32,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LinearInterpolFooter {
|
||||
pub struct LinearFooter {
|
||||
pub relative_max_value: u64,
|
||||
pub offset: u64,
|
||||
pub first_val: u64,
|
||||
@@ -31,7 +28,7 @@ pub struct LinearInterpolFooter {
|
||||
pub max_value: u64,
|
||||
}
|
||||
|
||||
impl BinarySerializable for LinearInterpolFooter {
|
||||
impl BinarySerializable for LinearFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.relative_max_value.serialize(write)?;
|
||||
self.offset.serialize(write)?;
|
||||
@@ -43,8 +40,8 @@ impl BinarySerializable for LinearInterpolFooter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<LinearInterpolFooter> {
|
||||
Ok(LinearInterpolFooter {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<LinearFooter> {
|
||||
Ok(LinearFooter {
|
||||
relative_max_value: u64::deserialize(reader)?,
|
||||
offset: u64::deserialize(reader)?,
|
||||
first_val: u64::deserialize(reader)?,
|
||||
@@ -56,29 +53,15 @@ impl BinarySerializable for LinearInterpolFooter {
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for LinearInterpolFooter {
|
||||
impl FixedSize for LinearFooter {
|
||||
const SIZE_IN_BYTES: usize = 56;
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
|
||||
let footer = LinearInterpolFooter::deserialize(&mut footer)?;
|
||||
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
|
||||
|
||||
let num_bits = compute_num_bits(footer.relative_max_value);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(LinearInterpolFastFieldReader {
|
||||
bit_unpacker,
|
||||
footer,
|
||||
slope,
|
||||
})
|
||||
}
|
||||
impl Column for LinearReader {
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
fn get_val(&self, doc: u64) -> u64 {
|
||||
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
|
||||
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
|
||||
(calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -89,47 +72,87 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
fn max_value(&self) -> u64 {
|
||||
self.footer.max_value
|
||||
}
|
||||
#[inline]
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.footer.num_vals
|
||||
}
|
||||
}
|
||||
|
||||
/// Fastfield serializer, which tries to guess values by linear interpolation
|
||||
/// and stores the difference bitpacked.
|
||||
pub struct LinearInterpolFastFieldSerializer {}
|
||||
pub struct LinearCodec;
|
||||
|
||||
#[inline]
|
||||
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
|
||||
pub(crate) fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
|
||||
if num_vals <= 1 {
|
||||
return 0.0;
|
||||
}
|
||||
// We calculate the slope with f64 high precision and use the result in lower precision f32
|
||||
// This is done in order to handle estimations for very large values like i64::MAX
|
||||
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
|
||||
let diff = diff(last_val, first_val);
|
||||
(diff / (num_vals - 1) as f64) as f32
|
||||
}
|
||||
|
||||
/// Delay the cast, to improve precision for very large u64 values.
|
||||
///
|
||||
/// Since i64 is mapped monotonically to u64 space, 0i64 is after the mapping i64::MAX.
|
||||
/// So very large values are not uncommon.
|
||||
///
|
||||
/// ```rust
|
||||
/// let val1 = i64::MAX;
|
||||
/// let val2 = i64::MAX - 100;
|
||||
/// assert_eq!(val1 - val2, 100);
|
||||
/// assert_eq!(val1 as f64 - val2 as f64, 0.0);
|
||||
/// ```
|
||||
fn diff(val1: u64, val2: u64) -> f64 {
|
||||
if val1 >= val2 {
|
||||
(val1 - val2) as f64
|
||||
} else {
|
||||
(val2 - val1) as f64 * -1.0
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
first_val + (pos as f32 * slope) as u64
|
||||
pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
if slope < 0.0 {
|
||||
first_val.saturating_sub((pos as f32 * -slope) as u64)
|
||||
} else {
|
||||
first_val.saturating_add((pos as f32 * slope) as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "LinearInterpol";
|
||||
const ID: u8 = 2;
|
||||
impl FastFieldCodec for LinearCodec {
|
||||
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Linear;
|
||||
|
||||
type Reader = LinearReader;
|
||||
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
|
||||
let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES;
|
||||
let (data, mut footer) = bytes.split(footer_offset);
|
||||
let footer = LinearFooter::deserialize(&mut footer)?;
|
||||
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
|
||||
let num_bits = compute_num_bits(footer.relative_max_value);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(LinearReader {
|
||||
data,
|
||||
bit_unpacker,
|
||||
footer,
|
||||
slope,
|
||||
})
|
||||
}
|
||||
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
fn serialize(write: &mut impl Write, fastfield_accessor: &dyn Column) -> io::Result<()> {
|
||||
assert!(fastfield_accessor.min_value() <= fastfield_accessor.max_value());
|
||||
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());
|
||||
// calculate offset to ensure all values are positive
|
||||
let mut offset = 0;
|
||||
let mut rel_positive_max = 0;
|
||||
for (pos, actual_value) in data_iter1.enumerate() {
|
||||
for (pos, actual_value) in fastfield_accessor.iter().enumerate() {
|
||||
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
|
||||
if calculated_value > actual_value {
|
||||
// negative value we need to apply an offset
|
||||
@@ -137,7 +160,7 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
// will be offset to 0
|
||||
offset = offset.max(calculated_value - actual_value);
|
||||
} else {
|
||||
//positive value no offset reuqired
|
||||
// positive value no offset reuqired
|
||||
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
|
||||
}
|
||||
}
|
||||
@@ -147,56 +170,55 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value);
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for (pos, val) in data_iter.enumerate() {
|
||||
for (pos, val) in fastfield_accessor.iter().enumerate() {
|
||||
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
|
||||
let diff = (val + offset) - calculated_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = LinearInterpolFooter {
|
||||
let footer = LinearFooter {
|
||||
relative_max_value,
|
||||
offset,
|
||||
first_val,
|
||||
last_val,
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
num_vals: fastfield_accessor.num_vals(),
|
||||
min_value: fastfield_accessor.min_value(),
|
||||
max_value: fastfield_accessor.max_value(),
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 3 {
|
||||
return false; //disable compressor for this case
|
||||
}
|
||||
// On serialisation the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algortihm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
#[allow(clippy::question_mark)]
|
||||
fn estimate(fastfield_accessor: &impl Column) -> Option<f32> {
|
||||
if fastfield_accessor.num_vals() < 3 {
|
||||
return None; // disable compressor for this case
|
||||
}
|
||||
|
||||
// On serialisation the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algorithm should be fine
|
||||
let theorethical_maximum_offset =
|
||||
fastfield_accessor.max_value() - fastfield_accessor.min_value();
|
||||
if fastfield_accessor
|
||||
.max_value()
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return None;
|
||||
}
|
||||
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
let last_val = fastfield_accessor.get_val(fastfield_accessor.num_vals() as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, fastfield_accessor.num_vals());
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%
|
||||
let num_vals = stats.num_vals as f32 / 100.0;
|
||||
let num_vals = fastfield_accessor.num_vals() as f32 / 100.0;
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (num_vals * pos as f32 * 5.0) as usize)
|
||||
.collect::<Vec<_>>();
|
||||
@@ -211,17 +233,18 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within 50%
|
||||
// threshold.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within
|
||||
// 50% threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
//
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
+ LinearInterpolFooter::SIZE_IN_BYTES as u64;
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64
|
||||
* fastfield_accessor.num_vals()
|
||||
+ LinearFooter::SIZE_IN_BYTES as u64;
|
||||
let num_bits_uncompressed = 64 * fastfield_accessor.num_vals();
|
||||
Some(num_bits as f32 / num_bits_uncompressed as f32)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -236,19 +259,48 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
use rand::RngCore;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) {
|
||||
crate::tests::create_and_validate::<
|
||||
LinearInterpolFastFieldSerializer,
|
||||
LinearInterpolFastFieldReader,
|
||||
>(data, name);
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_datasets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> Option<(f32, f32)> {
|
||||
crate::tests::create_and_validate::<LinearCodec>(data, name)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
fn get_calculated_value_test() {
|
||||
// pos slope
|
||||
assert_eq!(get_calculated_value(100, 10, 5.0), 150);
|
||||
|
||||
// neg slope
|
||||
assert_eq!(get_calculated_value(100, 10, -5.0), 50);
|
||||
|
||||
// pos slope, very high values
|
||||
assert_eq!(
|
||||
get_calculated_value(i64::MAX as u64, 10, 5.0),
|
||||
i64::MAX as u64 + 50
|
||||
);
|
||||
// neg slope, very high values
|
||||
assert_eq!(
|
||||
get_calculated_value(i64::MAX as u64, 10, -5.0),
|
||||
i64::MAX as u64 - 50
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large").unwrap();
|
||||
|
||||
assert!(actual_compression < 0.01);
|
||||
assert!(estimate < 0.01);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_datasets() {
|
||||
let data_sets = get_codec_test_datasets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
@@ -265,6 +317,13 @@ mod tests {
|
||||
|
||||
create_and_validate(&data, "large amplitude");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn overflow_error_test() {
|
||||
let data = vec![1572656989877777, 1170935903116329, 720575940379279, 0];
|
||||
create_and_validate(&data, "overflow test");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_concave_data() {
|
||||
let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
|
||||
@@ -284,10 +343,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_rand() {
|
||||
for _ in 0..5000 {
|
||||
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 0..50 {
|
||||
let mut data = (0..10_000).map(|_| rng.next_u64()).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "random");
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
@@ -1,12 +1,35 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use fastfield_codecs::{
|
||||
linearinterpol::LinearInterpolFastFieldSerializer,
|
||||
multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer,
|
||||
FastFieldStats,
|
||||
};
|
||||
use fastfield_codecs::bitpacked::BitpackedCodec;
|
||||
use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
|
||||
use fastfield_codecs::linear::LinearCodec;
|
||||
use fastfield_codecs::{Column, FastFieldCodec, FastFieldCodecType, FastFieldStats};
|
||||
use prettytable::{Cell, Row, Table};
|
||||
|
||||
struct Data<'a>(&'a [u64]);
|
||||
|
||||
impl<'a> Column for Data<'a> {
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
self.0[position as usize]
|
||||
}
|
||||
|
||||
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = u64> + 'b> {
|
||||
Box::new(self.0.iter().cloned())
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
*self.0.iter().min().unwrap_or(&0)
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
*self.0.iter().max().unwrap_or(&0)
|
||||
}
|
||||
|
||||
fn num_vals(&self) -> u64 {
|
||||
self.0.len() as u64
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let mut table = Table::new();
|
||||
|
||||
@@ -14,42 +37,32 @@ fn main() {
|
||||
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
|
||||
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let mut results = vec![];
|
||||
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
|
||||
&data,
|
||||
);
|
||||
results.push(res);
|
||||
|
||||
//let best_estimation_codec = results
|
||||
//.iter()
|
||||
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
|
||||
//.unwrap();
|
||||
let results: Vec<(f32, f32, FastFieldCodecType)> = [
|
||||
serialize_with_codec::<LinearCodec>(&data),
|
||||
serialize_with_codec::<BlockwiseLinearCodec>(&data),
|
||||
serialize_with_codec::<BlockwiseLinearCodec>(&data),
|
||||
serialize_with_codec::<BitpackedCodec>(&data),
|
||||
]
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.collect();
|
||||
let best_compression_ratio_codec = results
|
||||
.iter()
|
||||
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
|
||||
.min_by(|&res1, &res2| res1.partial_cmp(res2).unwrap())
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||
for (is_applicable, est, comp, name) in results {
|
||||
let (est_cell, ratio_cell) = if !is_applicable {
|
||||
("Codec Disabled".to_string(), "".to_string())
|
||||
} else {
|
||||
(est.to_string(), comp.to_string())
|
||||
};
|
||||
#[allow(clippy::all)]
|
||||
for (est, comp, codec_type) in results {
|
||||
let est_cell = est.to_string();
|
||||
let ratio_cell = comp.to_string();
|
||||
let style = if comp == best_compression_ratio_codec.1 {
|
||||
"Fb"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
table.add_row(Row::new(vec![
|
||||
Cell::new(&name.to_string()).style_spec("bFg"),
|
||||
Cell::new(&format!("{codec_type:?}")).style_spec("bFg"),
|
||||
Cell::new(&ratio_cell).style_spec(style),
|
||||
Cell::new(&est_cell).style_spec(""),
|
||||
]));
|
||||
@@ -73,7 +86,7 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
//let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
||||
// let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing concave"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
@@ -94,26 +107,15 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
data_and_names
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
pub fn serialize_with_codec<C: FastFieldCodec>(
|
||||
data: &[u64],
|
||||
) -> (bool, f32, f32, &'static str) {
|
||||
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
|
||||
if !is_applicable {
|
||||
return (false, 0.0, 0.0, S::NAME);
|
||||
}
|
||||
let estimation = S::estimate(&data, stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
&data,
|
||||
stats_from_vec(data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||
(true, estimation, actual_compression, S::NAME)
|
||||
) -> Option<(f32, f32, FastFieldCodecType)> {
|
||||
let data = Data(data);
|
||||
let estimation = C::estimate(&data)?;
|
||||
let mut out = Vec::new();
|
||||
C::serialize(&mut out, &data).unwrap();
|
||||
let actual_compression = out.len() as f32 / (data.num_vals() * 8) as f32;
|
||||
Some((estimation, actual_compression, C::CODEC_TYPE))
|
||||
}
|
||||
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
[package]
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
name = "ownedbytes"
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
version = "0.3.0"
|
||||
edition = "2021"
|
||||
description = "Expose data as static slice"
|
||||
license = "MIT"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use stable_deref_trait::StableDeref;
|
||||
use std::convert::TryInto;
|
||||
use std::mem;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, io};
|
||||
use std::{fmt, io, mem};
|
||||
|
||||
use stable_deref_trait::StableDeref;
|
||||
|
||||
/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
|
||||
/// this data as a static slice.
|
||||
/// this data as a slice.
|
||||
///
|
||||
/// The backing object is required to be `StableDeref`.
|
||||
#[derive(Clone)]
|
||||
@@ -21,7 +21,7 @@ impl OwnedBytes {
|
||||
OwnedBytes::new(&[][..])
|
||||
}
|
||||
|
||||
/// Creates an `OwnedBytes` intance given a `StableDeref` object.
|
||||
/// Creates an `OwnedBytes` instance given a `StableDeref` object.
|
||||
pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
|
||||
data_holder: T,
|
||||
) -> OwnedBytes {
|
||||
@@ -35,6 +35,8 @@ impl OwnedBytes {
|
||||
}
|
||||
|
||||
/// creates a fileslice that is just a view over a slice of the data.
|
||||
#[must_use]
|
||||
#[inline]
|
||||
pub fn slice(&self, range: Range<usize>) -> Self {
|
||||
OwnedBytes {
|
||||
data: &self.data[range],
|
||||
@@ -63,6 +65,8 @@ impl OwnedBytes {
|
||||
/// On the other hand, both `left` and `right` retain a handle over
|
||||
/// the entire slice of memory. In other words, the memory will only
|
||||
/// be released when both left and right are dropped.
|
||||
#[inline]
|
||||
#[must_use]
|
||||
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
|
||||
let right_box_stable_deref = self.box_stable_deref.clone();
|
||||
let left = OwnedBytes {
|
||||
@@ -76,6 +80,19 @@ impl OwnedBytes {
|
||||
(left, right)
|
||||
}
|
||||
|
||||
/// Splits the right part of the `OwnedBytes` at the given offset.
|
||||
///
|
||||
/// `self` is truncated to `split_len`, left with the remaining bytes.
|
||||
pub fn split_off(&mut self, split_len: usize) -> OwnedBytes {
|
||||
let right_box_stable_deref = self.box_stable_deref.clone();
|
||||
let right_piece = OwnedBytes {
|
||||
data: &self.data[split_len..],
|
||||
box_stable_deref: right_box_stable_deref,
|
||||
};
|
||||
self.data = &self.data[..split_len];
|
||||
right_piece
|
||||
}
|
||||
|
||||
/// Returns true iff this `OwnedBytes` is empty.
|
||||
#[inline]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
@@ -83,8 +100,6 @@ impl OwnedBytes {
|
||||
}
|
||||
|
||||
/// Drops the left most `advance_len` bytes.
|
||||
///
|
||||
/// See also [.clip(clip_len: usize))](#method.clip).
|
||||
#[inline]
|
||||
pub fn advance(&mut self, advance_len: usize) {
|
||||
self.data = &self.data[advance_len..]
|
||||
@@ -124,6 +139,34 @@ impl fmt::Debug for OwnedBytes {
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for OwnedBytes {
|
||||
fn eq(&self, other: &OwnedBytes) -> bool {
|
||||
self.as_slice() == other.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for OwnedBytes {}
|
||||
|
||||
impl PartialEq<[u8]> for OwnedBytes {
|
||||
fn eq(&self, other: &[u8]) -> bool {
|
||||
self.as_slice() == other
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq<str> for OwnedBytes {
|
||||
fn eq(&self, other: &str) -> bool {
|
||||
self.as_slice() == other.as_bytes()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: ?Sized> PartialEq<&'a T> for OwnedBytes
|
||||
where OwnedBytes: PartialEq<T>
|
||||
{
|
||||
fn eq(&self, other: &&'a T) -> bool {
|
||||
*self == **other
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for OwnedBytes {
|
||||
type Target = [u8];
|
||||
|
||||
@@ -287,4 +330,14 @@ mod tests {
|
||||
assert_eq!(right.as_slice(), b"");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_off() {
|
||||
let mut data = OwnedBytes::new(b"abcdef".as_ref());
|
||||
assert_eq!(data, "abcdef");
|
||||
assert_eq!(data.split_off(2), "cdef");
|
||||
assert_eq!(data, "ab");
|
||||
assert_eq!(data.split_off(1), "b");
|
||||
assert_eq!(data, "a");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,18 +1,17 @@
|
||||
[package]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.15.0"
|
||||
version = "0.18.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
description = """Search engine library"""
|
||||
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
|
||||
homepage = "https://github.com/tantivy-search/tantivy"
|
||||
repository = "https://github.com/tantivy-search/tantivy"
|
||||
homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2018"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
combine = {version="4", default-features=false, features=[] }
|
||||
once_cell = "1.7.2"
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] }
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![allow(clippy::derive_partial_eq_without_eq)]
|
||||
|
||||
mod occur;
|
||||
mod query_grammar;
|
||||
mod user_input_ast;
|
||||
|
||||
@@ -2,11 +2,11 @@ use std::fmt;
|
||||
use std::fmt::Write;
|
||||
|
||||
/// Defines whether a term in a query must be present,
|
||||
/// should be present or must be not present.
|
||||
/// should be present or must not be present.
|
||||
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
|
||||
pub enum Occur {
|
||||
/// For a given document to be considered for scoring,
|
||||
/// at least one of the document with the Should or the Must
|
||||
/// at least one of the terms with the Should or the Must
|
||||
/// Occur constraint must be within the document.
|
||||
Should,
|
||||
/// Document without the term are excluded from the search.
|
||||
|
||||
@@ -1,21 +1,24 @@
|
||||
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
use crate::Occur;
|
||||
use combine::error::StringStreamError;
|
||||
use combine::parser::char::{char, digit, space, spaces, string};
|
||||
use combine::parser::combinator::recognize;
|
||||
use combine::parser::range::{take_while, take_while1};
|
||||
use combine::parser::repeat::escaped;
|
||||
use combine::parser::Parser;
|
||||
use combine::{
|
||||
attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value,
|
||||
};
|
||||
use combine::{error::StringStreamError, parser::combinator::recognize};
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
|
||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to special characters.
|
||||
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
|
||||
use crate::Occur;
|
||||
|
||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
|
||||
// special characters.
|
||||
const SPECIAL_CHARS: &[char] = &[
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
|
||||
];
|
||||
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*| )"#;
|
||||
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|!|\\|\*|\s)"#;
|
||||
|
||||
/// Parses a field_name
|
||||
/// A field name must have at least one character and be followed by a colon.
|
||||
@@ -31,7 +34,8 @@ fn field_name<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
take_while(|c| !SPECIAL_CHARS.contains(&c)),
|
||||
),
|
||||
'\\',
|
||||
satisfy(|c| SPECIAL_CHARS.contains(&c)),
|
||||
satisfy(|_| true), /* if the next character is not a special char, the \ will be treated
|
||||
* as the \ character. */
|
||||
))
|
||||
.skip(char(':'))
|
||||
.map(|s| ESCAPED_SPECIAL_CHARS_RE.replace_all(&s, "$1").to_string())
|
||||
@@ -63,8 +67,8 @@ fn word<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
/// 2021-04-13T19:46:26.266051969+00:00
|
||||
///
|
||||
/// NOTE: also accepts 999999-99-99T99:99:99.266051969+99:99
|
||||
/// We delegate rejecting such invalid dates to the logical AST compuation code
|
||||
/// which invokes chrono::DateTime::parse_from_rfc3339 on the value to actually parse
|
||||
/// We delegate rejecting such invalid dates to the logical AST computation code
|
||||
/// which invokes time::OffsetDateTime::parse(..., &Rfc3339) on the value to actually parse
|
||||
/// it (instead of merely extracting the datetime value as string as done here).
|
||||
fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
let two_digits = || recognize::<String, _, _>((digit(), digit()));
|
||||
@@ -116,22 +120,36 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
|
||||
fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
|
||||
phrase.or(word())
|
||||
negative_number().or(phrase.or(word()))
|
||||
}
|
||||
|
||||
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
|
||||
let term_val_with_field = negative_number().or(term_val());
|
||||
(field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
|
||||
(field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
slop,
|
||||
})
|
||||
}
|
||||
|
||||
fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
|
||||
let slop =
|
||||
(char('~'), many1(digit())).and_then(|(_, slop): (_, String)| match slop.parse::<u32>() {
|
||||
Ok(d) => Ok(d),
|
||||
_ => Err(StringStreamError::UnexpectedParse),
|
||||
});
|
||||
optional(slop).map(|slop| match slop {
|
||||
Some(d) => d,
|
||||
_ => 0,
|
||||
})
|
||||
}
|
||||
|
||||
fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
|
||||
let term_default_field = term_val().map(|phrase| UserInputLiteral {
|
||||
let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase,
|
||||
slop,
|
||||
});
|
||||
|
||||
attempt(term_query())
|
||||
.or(term_default_field)
|
||||
.map(UserInputLeaf::from)
|
||||
@@ -281,7 +299,7 @@ fn boost<'a>() -> impl Parser<&'a str, Output = f64> {
|
||||
|
||||
fn boosted_leaf<'a>() -> impl Parser<&'a str, Output = UserInputAst> {
|
||||
(leaf(), optional(boost())).map(|(leaf, boost_opt)| match boost_opt {
|
||||
Some(boost) if (boost - 1.0).abs() > std::f64::EPSILON => {
|
||||
Some(boost) if (boost - 1.0).abs() > f64::EPSILON => {
|
||||
UserInputAst::Boost(Box::new(leaf), boost)
|
||||
}
|
||||
_ => leaf,
|
||||
@@ -363,9 +381,10 @@ mod test {
|
||||
|
||||
type TestParseResult = Result<(), StringStreamError>;
|
||||
|
||||
use super::*;
|
||||
use combine::parser::Parser;
|
||||
|
||||
use super::*;
|
||||
|
||||
pub fn nearly_equals(a: f64, b: f64) -> bool {
|
||||
(a - b).abs() < 0.0005 * (a + b).abs()
|
||||
}
|
||||
@@ -512,14 +531,18 @@ mod test {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_field_name() -> TestParseResult {
|
||||
fn test_field_name() {
|
||||
assert_eq!(
|
||||
super::field_name().parse(".my.field.name:a"),
|
||||
Ok((".my.field.name".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse("my\\ field\\ name:a"),
|
||||
Ok(("my field name".to_string(), "a"))
|
||||
super::field_name().parse(r#"にんじん:a"#),
|
||||
Ok(("にんじん".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"my\field:a"#),
|
||||
Ok((r#"my\field"#.to_string(), "a"))
|
||||
);
|
||||
assert!(super::field_name().parse("my field:a").is_err());
|
||||
assert_eq!(
|
||||
@@ -530,14 +553,32 @@ mod test {
|
||||
super::field_name().parse("my_field_name:a"),
|
||||
Ok(("my_field_name".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse("myfield.b:hello").unwrap(),
|
||||
("myfield.b".to_string(), "hello")
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"myfield\.b:hello"#).unwrap(),
|
||||
(r#"myfield\.b"#.to_string(), "hello")
|
||||
);
|
||||
assert!(super::field_name().parse("my_field_name").is_err());
|
||||
assert!(super::field_name().parse(":a").is_err());
|
||||
assert!(super::field_name().parse("-my_field:a").is_err());
|
||||
assert_eq!(
|
||||
super::field_name().parse("_my_field:a")?,
|
||||
("_my_field".to_string(), "a")
|
||||
super::field_name().parse("_my_field:a"),
|
||||
Ok(("_my_field".to_string(), "a"))
|
||||
);
|
||||
Ok(())
|
||||
assert_eq!(
|
||||
super::field_name().parse("~my~field:a"),
|
||||
Ok(("~my~field".to_string(), "a"))
|
||||
);
|
||||
for special_char in SPECIAL_CHARS.iter() {
|
||||
let query = &format!("\\{special_char}my\\{special_char}field:a");
|
||||
assert_eq!(
|
||||
super::field_name().parse(query),
|
||||
Ok((format!("{special_char}my{special_char}field"), "a"))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -690,4 +731,22 @@ mod test {
|
||||
);
|
||||
test_is_parse_err("abc + ");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_slop() {
|
||||
assert!(parse_to_ast().parse("\"a b\"~").is_err());
|
||||
assert!(parse_to_ast().parse("foo:\"a b\"~").is_err());
|
||||
assert!(parse_to_ast().parse("\"a b\"~a").is_err());
|
||||
assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err());
|
||||
|
||||
test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")");
|
||||
test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\"");
|
||||
test_parse_query_to_ast_helper("~Document", "\"~Document\"");
|
||||
test_parse_query_to_ast_helper("a~2", "\"a~2\"");
|
||||
test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\"");
|
||||
test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1");
|
||||
test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3");
|
||||
test_parse_query_to_ast_helper("foo:\"a b\"~300", "\"foo\":\"a b\"~300");
|
||||
test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,14 +40,19 @@ impl Debug for UserInputLeaf {
|
||||
pub struct UserInputLiteral {
|
||||
pub field_name: Option<String>,
|
||||
pub phrase: String,
|
||||
pub slop: u32,
|
||||
}
|
||||
|
||||
impl fmt::Debug for UserInputLiteral {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
match self.field_name {
|
||||
Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase),
|
||||
None => write!(formatter, "\"{}\"", self.phrase),
|
||||
if let Some(ref field) = self.field_name {
|
||||
write!(formatter, "\"{}\":", field)?;
|
||||
}
|
||||
write!(formatter, "\"{}\"", self.phrase)?;
|
||||
if self.slop > 0 {
|
||||
write!(formatter, "~{}", self.slop)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,7 +64,7 @@ pub enum UserInputBound {
|
||||
}
|
||||
|
||||
impl UserInputBound {
|
||||
fn display_lower(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
|
||||
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),
|
||||
@@ -67,7 +72,7 @@ impl UserInputBound {
|
||||
}
|
||||
}
|
||||
|
||||
fn display_upper(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
|
||||
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
|
||||
@@ -91,6 +96,7 @@ pub enum UserInputAst {
|
||||
}
|
||||
|
||||
impl UserInputAst {
|
||||
#[must_use]
|
||||
pub fn unary(self, occur: Occur) -> UserInputAst {
|
||||
UserInputAst::Clause(vec![(Some(occur), self)])
|
||||
}
|
||||
|
||||
@@ -1 +1,7 @@
|
||||
use_try_shorthand = true
|
||||
comment_width = 120
|
||||
format_strings = true
|
||||
group_imports = "StdExternalCrate"
|
||||
imports_granularity = "Module"
|
||||
normalize_comments = true
|
||||
where_single_line = true
|
||||
wrap_comments = true
|
||||
|
||||
36
src/aggregation/README.md
Normal file
@@ -0,0 +1,36 @@
|
||||
# Contributing
|
||||
|
||||
When adding new bucket aggregation make sure to extend the "test_aggregation_flushing" test for at least 2 levels.
|
||||
|
||||
|
||||
|
||||
# Code Organization
|
||||
|
||||
Tantivy's aggregations have been designed to mimic the
|
||||
[aggregations of elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html).
|
||||
|
||||
The code is organized in submodules:
|
||||
|
||||
## bucket
|
||||
Contains all bucket aggregations, like range aggregation. These bucket aggregations group documents into buckets and can contain sub-aggregations.
|
||||
|
||||
## metric
|
||||
Contains all metric aggregations, like average aggregation. Metric aggregations do not have sub aggregations.
|
||||
|
||||
#### agg_req
|
||||
agg_req contains the users aggregation request. Deserialization from json is compatible with elasticsearch aggregation requests.
|
||||
|
||||
#### agg_req_with_accessor
|
||||
agg_req_with_accessor contains the users aggregation request enriched with fast field accessors etc, which are
|
||||
used during collection.
|
||||
|
||||
#### segment_agg_result
|
||||
segment_agg_result contains the aggregation result tree, which is used for collection of a segment.
|
||||
The tree from agg_req_with_accessor is passed during collection.
|
||||
|
||||
#### intermediate_agg_result
|
||||
intermediate_agg_result contains the aggregation tree for merging with other trees.
|
||||
|
||||
#### agg_result
|
||||
agg_result contains the final aggregation tree.
|
||||
|
||||
369
src/aggregation/agg_req.rs
Normal file
@@ -0,0 +1,369 @@
|
||||
//! Contains the aggregation request tree. Used to build an
|
||||
//! [AggregationCollector](super::AggregationCollector).
|
||||
//!
|
||||
//! [Aggregations] is the top level entry point to create a request, which is a `HashMap<String,
|
||||
//! Aggregation>`.
|
||||
//!
|
||||
//! Requests are compatible with the json format of elasticsearch.
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```
|
||||
//! use tantivy::aggregation::bucket::RangeAggregation;
|
||||
//! use tantivy::aggregation::agg_req::BucketAggregationType;
|
||||
//! use tantivy::aggregation::agg_req::{Aggregation, Aggregations};
|
||||
//! use tantivy::aggregation::agg_req::BucketAggregation;
|
||||
//! let agg_req1: Aggregations = vec![
|
||||
//! (
|
||||
//! "range".to_string(),
|
||||
//! Aggregation::Bucket(BucketAggregation {
|
||||
//! bucket_agg: BucketAggregationType::Range(RangeAggregation{
|
||||
//! field: "score".to_string(),
|
||||
//! ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
|
||||
//! keyed: false,
|
||||
//! }),
|
||||
//! sub_aggregation: Default::default(),
|
||||
//! }),
|
||||
//! ),
|
||||
//! ]
|
||||
//! .into_iter()
|
||||
//! .collect();
|
||||
//!
|
||||
//! let elasticsearch_compatible_json_req = r#"
|
||||
//! {
|
||||
//! "range": {
|
||||
//! "range": {
|
||||
//! "field": "score",
|
||||
//! "ranges": [
|
||||
//! { "from": 3.0, "to": 7.0 },
|
||||
//! { "from": 7.0, "to": 20.0 }
|
||||
//! ]
|
||||
//! }
|
||||
//! }
|
||||
//! }"#;
|
||||
//! let agg_req2: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
|
||||
//! assert_eq!(agg_req1, agg_req2);
|
||||
//! ```
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
pub use super::bucket::RangeAggregation;
|
||||
use super::bucket::{HistogramAggregation, TermsAggregation};
|
||||
use super::metric::{AverageAggregation, StatsAggregation};
|
||||
use super::VecWithNames;
|
||||
|
||||
/// The top-level aggregation request structure, which contains [Aggregation] and their user defined
|
||||
/// names. It is also used in [buckets](BucketAggregation) to define sub-aggregations.
|
||||
///
|
||||
/// The key is the user defined name of the aggregation.
|
||||
pub type Aggregations = HashMap<String, Aggregation>;
|
||||
|
||||
/// Like Aggregations, but optimized to work with the aggregation result
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct AggregationsInternal {
|
||||
pub(crate) metrics: VecWithNames<MetricAggregation>,
|
||||
pub(crate) buckets: VecWithNames<BucketAggregationInternal>,
|
||||
}
|
||||
|
||||
impl From<Aggregations> for AggregationsInternal {
|
||||
fn from(aggs: Aggregations) -> Self {
|
||||
let mut metrics = vec![];
|
||||
let mut buckets = vec![];
|
||||
for (key, agg) in aggs {
|
||||
match agg {
|
||||
Aggregation::Bucket(bucket) => buckets.push((
|
||||
key,
|
||||
BucketAggregationInternal {
|
||||
bucket_agg: bucket.bucket_agg,
|
||||
sub_aggregation: bucket.sub_aggregation.into(),
|
||||
},
|
||||
)),
|
||||
Aggregation::Metric(metric) => metrics.push((key, metric)),
|
||||
}
|
||||
}
|
||||
Self {
|
||||
metrics: VecWithNames::from_entries(metrics),
|
||||
buckets: VecWithNames::from_entries(buckets),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
// Like BucketAggregation, but optimized to work with the result
|
||||
pub(crate) struct BucketAggregationInternal {
|
||||
/// Bucket aggregation strategy to group documents.
|
||||
pub bucket_agg: BucketAggregationType,
|
||||
/// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
|
||||
/// bucket.
|
||||
pub sub_aggregation: AggregationsInternal,
|
||||
}
|
||||
|
||||
impl BucketAggregationInternal {
|
||||
pub(crate) fn as_range(&self) -> Option<&RangeAggregation> {
|
||||
match &self.bucket_agg {
|
||||
BucketAggregationType::Range(range) => Some(range),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_histogram(&self) -> Option<&HistogramAggregation> {
|
||||
match &self.bucket_agg {
|
||||
BucketAggregationType::Histogram(histogram) => Some(histogram),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_term(&self) -> Option<&TermsAggregation> {
|
||||
match &self.bucket_agg {
|
||||
BucketAggregationType::Terms(terms) => Some(terms),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract all fields, where the term directory is used in the tree.
|
||||
pub fn get_term_dict_field_names(aggs: &Aggregations) -> HashSet<String> {
|
||||
let mut term_dict_field_names = Default::default();
|
||||
for el in aggs.values() {
|
||||
el.get_term_dict_field_names(&mut term_dict_field_names)
|
||||
}
|
||||
term_dict_field_names
|
||||
}
|
||||
|
||||
/// Extract all fast field names used in the tree.
|
||||
pub fn get_fast_field_names(aggs: &Aggregations) -> HashSet<String> {
|
||||
let mut fast_field_names = Default::default();
|
||||
for el in aggs.values() {
|
||||
el.get_fast_field_names(&mut fast_field_names)
|
||||
}
|
||||
fast_field_names
|
||||
}
|
||||
|
||||
/// Aggregation request of [BucketAggregation] or [MetricAggregation].
|
||||
///
|
||||
/// An aggregation is either a bucket or a metric.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum Aggregation {
|
||||
/// Bucket aggregation, see [BucketAggregation] for details.
|
||||
Bucket(BucketAggregation),
|
||||
/// Metric aggregation, see [MetricAggregation] for details.
|
||||
Metric(MetricAggregation),
|
||||
}
|
||||
|
||||
impl Aggregation {
|
||||
fn get_term_dict_field_names(&self, term_field_names: &mut HashSet<String>) {
|
||||
if let Aggregation::Bucket(bucket) = self {
|
||||
bucket.get_term_dict_field_names(term_field_names)
|
||||
}
|
||||
}
|
||||
|
||||
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
|
||||
match self {
|
||||
Aggregation::Bucket(bucket) => bucket.get_fast_field_names(fast_field_names),
|
||||
Aggregation::Metric(metric) => metric.get_fast_field_names(fast_field_names),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// BucketAggregations create buckets of documents. Each bucket is associated with a rule which
|
||||
/// determines whether or not a document in the falls into it. In other words, the buckets
|
||||
/// effectively define document sets. Buckets are not necessarily disjunct, therefore a document can
|
||||
/// fall into multiple buckets. In addition to the buckets themselves, the bucket aggregations also
|
||||
/// compute and return the number of documents for each bucket. Bucket aggregations, as opposed to
|
||||
/// metric aggregations, can hold sub-aggregations. These sub-aggregations will be aggregated for
|
||||
/// the buckets created by their "parent" bucket aggregation. There are different bucket
|
||||
/// aggregators, each with a different "bucketing" strategy. Some define a single bucket, some
|
||||
/// define fixed number of multiple buckets, and others dynamically create the buckets during the
|
||||
/// aggregation process.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct BucketAggregation {
|
||||
/// Bucket aggregation strategy to group documents.
|
||||
#[serde(flatten)]
|
||||
pub bucket_agg: BucketAggregationType,
|
||||
/// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
|
||||
/// bucket.
|
||||
#[serde(rename = "aggs")]
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Aggregations::is_empty")]
|
||||
pub sub_aggregation: Aggregations,
|
||||
}
|
||||
|
||||
impl BucketAggregation {
|
||||
fn get_term_dict_field_names(&self, term_dict_field_names: &mut HashSet<String>) {
|
||||
if let BucketAggregationType::Terms(terms) = &self.bucket_agg {
|
||||
term_dict_field_names.insert(terms.field.to_string());
|
||||
}
|
||||
term_dict_field_names.extend(get_term_dict_field_names(&self.sub_aggregation));
|
||||
}
|
||||
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
|
||||
self.bucket_agg.get_fast_field_names(fast_field_names);
|
||||
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
|
||||
}
|
||||
}
|
||||
|
||||
/// The bucket aggregation types.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum BucketAggregationType {
|
||||
/// Put data into buckets of user-defined ranges.
|
||||
#[serde(rename = "range")]
|
||||
Range(RangeAggregation),
|
||||
/// Put data into buckets of user-defined ranges.
|
||||
#[serde(rename = "histogram")]
|
||||
Histogram(HistogramAggregation),
|
||||
/// Put data into buckets of terms.
|
||||
#[serde(rename = "terms")]
|
||||
Terms(TermsAggregation),
|
||||
}
|
||||
|
||||
impl BucketAggregationType {
|
||||
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
|
||||
match self {
|
||||
BucketAggregationType::Terms(terms) => fast_field_names.insert(terms.field.to_string()),
|
||||
BucketAggregationType::Range(range) => fast_field_names.insert(range.field.to_string()),
|
||||
BucketAggregationType::Histogram(histogram) => {
|
||||
fast_field_names.insert(histogram.field.to_string())
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// The aggregations in this family compute metrics based on values extracted
|
||||
/// from the documents that are being aggregated. Values are extracted from the fast field of
|
||||
/// the document.
|
||||
|
||||
/// Some aggregations output a single numeric metric (e.g. Average) and are called
|
||||
/// single-value numeric metrics aggregation, others generate multiple metrics (e.g. Stats) and are
|
||||
/// called multi-value numeric metrics aggregation.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum MetricAggregation {
|
||||
/// Calculates the average.
|
||||
#[serde(rename = "avg")]
|
||||
Average(AverageAggregation),
|
||||
/// Calculates stats sum, average, min, max, standard_deviation on a field.
|
||||
#[serde(rename = "stats")]
|
||||
Stats(StatsAggregation),
|
||||
}
|
||||
|
||||
impl MetricAggregation {
|
||||
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
|
||||
match self {
|
||||
MetricAggregation::Average(avg) => fast_field_names.insert(avg.field.to_string()),
|
||||
MetricAggregation::Stats(stats) => fast_field_names.insert(stats.field.to_string()),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn serialize_to_json_test() {
|
||||
let agg_req1: Aggregations = vec![(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "score".to_string(),
|
||||
ranges: vec![
|
||||
(f64::MIN..3f64).into(),
|
||||
(3f64..7f64).into(),
|
||||
(7f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
],
|
||||
keyed: true,
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let elasticsearch_compatible_json_req = r#"{
|
||||
"range": {
|
||||
"range": {
|
||||
"field": "score",
|
||||
"ranges": [
|
||||
{
|
||||
"to": 3.0
|
||||
},
|
||||
{
|
||||
"from": 3.0,
|
||||
"to": 7.0
|
||||
},
|
||||
{
|
||||
"from": 7.0,
|
||||
"to": 20.0
|
||||
},
|
||||
{
|
||||
"from": 20.0
|
||||
}
|
||||
],
|
||||
"keyed": true
|
||||
}
|
||||
}
|
||||
}"#;
|
||||
let agg_req2: String = serde_json::to_string_pretty(&agg_req1).unwrap();
|
||||
assert_eq!(agg_req2, elasticsearch_compatible_json_req);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_fast_field_names() {
|
||||
let agg_req2: Aggregations = vec![
|
||||
(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "score2".to_string(),
|
||||
ranges: vec![
|
||||
(f64::MIN..3f64).into(),
|
||||
(3f64..7f64).into(),
|
||||
(7f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
),
|
||||
(
|
||||
"metric".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Average(
|
||||
AverageAggregation::from_field_name("field123".to_string()),
|
||||
)),
|
||||
),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let agg_req1: Aggregations = vec![(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "score".to_string(),
|
||||
ranges: vec![
|
||||
(f64::MIN..3f64).into(),
|
||||
(3f64..7f64).into(),
|
||||
(7f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: agg_req2,
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
assert_eq!(
|
||||
get_fast_field_names(&agg_req1),
|
||||
vec![
|
||||
"score".to_string(),
|
||||
"score2".to_string(),
|
||||
"field123".to_string()
|
||||
]
|
||||
.into_iter()
|
||||
.collect()
|
||||
)
|
||||
}
|
||||
}
|
||||
221
src/aggregation/agg_req_with_accessor.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! This will enhance the request tree with access to the fastfield and metadata.
|
||||
|
||||
use std::rc::Rc;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
|
||||
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
|
||||
use super::metric::{AverageAggregation, StatsAggregation};
|
||||
use super::segment_agg_result::BucketCount;
|
||||
use super::VecWithNames;
|
||||
use crate::fastfield::{
|
||||
type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
|
||||
};
|
||||
use crate::schema::{Cardinality, Type};
|
||||
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub(crate) struct AggregationsWithAccessor {
|
||||
pub metrics: VecWithNames<MetricAggregationWithAccessor>,
|
||||
pub buckets: VecWithNames<BucketAggregationWithAccessor>,
|
||||
}
|
||||
|
||||
impl AggregationsWithAccessor {
|
||||
fn from_data(
|
||||
metrics: VecWithNames<MetricAggregationWithAccessor>,
|
||||
buckets: VecWithNames<BucketAggregationWithAccessor>,
|
||||
) -> Self {
|
||||
Self { metrics, buckets }
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.metrics.is_empty() && self.buckets.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum FastFieldAccessor {
|
||||
Multi(MultiValuedFastFieldReader<u64>),
|
||||
Single(DynamicFastFieldReader<u64>),
|
||||
}
|
||||
impl FastFieldAccessor {
|
||||
pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(_) => None,
|
||||
FastFieldAccessor::Single(reader) => Some(reader),
|
||||
}
|
||||
}
|
||||
pub fn as_multi(&self) -> Option<&MultiValuedFastFieldReader<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(reader) => Some(reader),
|
||||
FastFieldAccessor::Single(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BucketAggregationWithAccessor {
|
||||
/// In general there can be buckets without fast field access, e.g. buckets that are created
|
||||
/// based on search terms. So eventually this needs to be Option or moved.
|
||||
pub(crate) accessor: FastFieldAccessor,
|
||||
pub(crate) inverted_index: Option<Arc<InvertedIndexReader>>,
|
||||
pub(crate) field_type: Type,
|
||||
pub(crate) bucket_agg: BucketAggregationType,
|
||||
pub(crate) sub_aggregation: AggregationsWithAccessor,
|
||||
pub(crate) bucket_count: BucketCount,
|
||||
}
|
||||
|
||||
impl BucketAggregationWithAccessor {
|
||||
fn try_from_bucket(
|
||||
bucket: &BucketAggregationType,
|
||||
sub_aggregation: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
bucket_count: Rc<AtomicU32>,
|
||||
max_bucket_count: u32,
|
||||
) -> crate::Result<BucketAggregationWithAccessor> {
|
||||
let mut inverted_index = None;
|
||||
let (accessor, field_type) = match &bucket {
|
||||
BucketAggregationType::Range(RangeAggregation {
|
||||
field: field_name, ..
|
||||
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
|
||||
BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: field_name, ..
|
||||
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
|
||||
BucketAggregationType::Terms(TermsAggregation {
|
||||
field: field_name, ..
|
||||
}) => {
|
||||
let field = reader
|
||||
.schema()
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
|
||||
inverted_index = Some(reader.inverted_index(field)?);
|
||||
get_ff_reader_and_validate(reader, field_name, Cardinality::MultiValues)?
|
||||
}
|
||||
};
|
||||
let sub_aggregation = sub_aggregation.clone();
|
||||
Ok(BucketAggregationWithAccessor {
|
||||
accessor,
|
||||
field_type,
|
||||
sub_aggregation: get_aggs_with_accessor_and_validate(
|
||||
&sub_aggregation,
|
||||
reader,
|
||||
bucket_count.clone(),
|
||||
max_bucket_count,
|
||||
)?,
|
||||
bucket_agg: bucket.clone(),
|
||||
inverted_index,
|
||||
bucket_count: BucketCount {
|
||||
bucket_count,
|
||||
max_bucket_count,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Contains the metric request and the fast field accessor.
|
||||
#[derive(Clone)]
|
||||
pub struct MetricAggregationWithAccessor {
|
||||
pub metric: MetricAggregation,
|
||||
pub field_type: Type,
|
||||
pub accessor: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl MetricAggregationWithAccessor {
|
||||
fn try_from_metric(
|
||||
metric: &MetricAggregation,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<MetricAggregationWithAccessor> {
|
||||
match &metric {
|
||||
MetricAggregation::Average(AverageAggregation { field: field_name })
|
||||
| MetricAggregation::Stats(StatsAggregation { field: field_name }) => {
|
||||
let (accessor, field_type) =
|
||||
get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?;
|
||||
|
||||
Ok(MetricAggregationWithAccessor {
|
||||
accessor: accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinality")
|
||||
.clone(),
|
||||
field_type,
|
||||
metric: metric.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_aggs_with_accessor_and_validate(
|
||||
aggs: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
bucket_count: Rc<AtomicU32>,
|
||||
max_bucket_count: u32,
|
||||
) -> crate::Result<AggregationsWithAccessor> {
|
||||
let mut metrics = vec![];
|
||||
let mut buckets = vec![];
|
||||
for (key, agg) in aggs.iter() {
|
||||
match agg {
|
||||
Aggregation::Bucket(bucket) => buckets.push((
|
||||
key.to_string(),
|
||||
BucketAggregationWithAccessor::try_from_bucket(
|
||||
&bucket.bucket_agg,
|
||||
&bucket.sub_aggregation,
|
||||
reader,
|
||||
Rc::clone(&bucket_count),
|
||||
max_bucket_count,
|
||||
)?,
|
||||
)),
|
||||
Aggregation::Metric(metric) => metrics.push((
|
||||
key.to_string(),
|
||||
MetricAggregationWithAccessor::try_from_metric(metric, reader)?,
|
||||
)),
|
||||
}
|
||||
}
|
||||
Ok(AggregationsWithAccessor::from_data(
|
||||
VecWithNames::from_entries(metrics),
|
||||
VecWithNames::from_entries(buckets),
|
||||
))
|
||||
}
|
||||
|
||||
/// Get fast field reader with given cardinatility.
|
||||
fn get_ff_reader_and_validate(
|
||||
reader: &SegmentReader,
|
||||
field_name: &str,
|
||||
cardinality: Cardinality,
|
||||
) -> crate::Result<(FastFieldAccessor, Type)> {
|
||||
let field = reader
|
||||
.schema()
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
|
||||
let field_type = reader.schema().get_field_entry(field).field_type();
|
||||
|
||||
if let Some((ff_type, field_cardinality)) = type_and_cardinality(field_type) {
|
||||
if ff_type == FastType::Date {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"Unsupported field type date in aggregation".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if cardinality != field_cardinality {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Invalid field cardinality on field {} expected {:?}, but got {:?}",
|
||||
field_name, cardinality, field_cardinality
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Only fast fields of type f64, u64, i64 are supported, but got {:?} ",
|
||||
field_type.value_type()
|
||||
)));
|
||||
};
|
||||
|
||||
let ff_fields = reader.fast_fields();
|
||||
match cardinality {
|
||||
Cardinality::SingleValue => ff_fields
|
||||
.u64_lenient(field)
|
||||
.map(|field| (FastFieldAccessor::Single(field), field_type.value_type())),
|
||||
Cardinality::MultiValues => ff_fields
|
||||
.u64s_lenient(field)
|
||||
.map(|field| (FastFieldAccessor::Multi(field), field_type.value_type())),
|
||||
}
|
||||
}
|
||||
243
src/aggregation/agg_result.rs
Normal file
@@ -0,0 +1,243 @@
|
||||
//! Contains the final aggregation tree.
|
||||
//! This tree can be converted via the `into()` method from `IntermediateAggregationResults`.
|
||||
//! This conversion computes the final result. For example: The intermediate result contains
|
||||
//! intermediate average results, which is the sum and the number of values. The actual average is
|
||||
//! calculated on the step from intermediate to final aggregation result tree.
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::agg_req::BucketAggregationInternal;
|
||||
use super::bucket::GetDocCount;
|
||||
use super::intermediate_agg_result::{IntermediateBucketResult, IntermediateMetricResult};
|
||||
use super::metric::{SingleMetricResult, Stats};
|
||||
use super::Key;
|
||||
use crate::TantivyError;
|
||||
|
||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// The final aggegation result.
|
||||
pub struct AggregationResults(pub HashMap<String, AggregationResult>);
|
||||
|
||||
impl AggregationResults {
|
||||
pub(crate) fn get_value_from_aggregation(
|
||||
&self,
|
||||
name: &str,
|
||||
agg_property: &str,
|
||||
) -> crate::Result<Option<f64>> {
|
||||
if let Some(agg) = self.0.get(name) {
|
||||
agg.get_value_from_aggregation(name, agg_property)
|
||||
} else {
|
||||
// Validation is be done during request parsing, so we can't reach this state.
|
||||
Err(TantivyError::InternalError(format!(
|
||||
"Can't find aggregation {:?} in sub_aggregations",
|
||||
name
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
/// An aggregation is either a bucket or a metric.
|
||||
pub enum AggregationResult {
|
||||
/// Bucket result variant.
|
||||
BucketResult(BucketResult),
|
||||
/// Metric result variant.
|
||||
MetricResult(MetricResult),
|
||||
}
|
||||
|
||||
impl AggregationResult {
|
||||
pub(crate) fn get_value_from_aggregation(
|
||||
&self,
|
||||
_name: &str,
|
||||
agg_property: &str,
|
||||
) -> crate::Result<Option<f64>> {
|
||||
match self {
|
||||
AggregationResult::BucketResult(_bucket) => Err(TantivyError::InternalError(
|
||||
"Tried to retrieve value from bucket aggregation. This is not supported and \
|
||||
should not happen during collection phase, but should be caught during validation"
|
||||
.to_string(),
|
||||
)),
|
||||
AggregationResult::MetricResult(metric) => metric.get_value(agg_property),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
/// MetricResult
|
||||
pub enum MetricResult {
|
||||
/// Average metric result.
|
||||
Average(SingleMetricResult),
|
||||
/// Stats metric result.
|
||||
Stats(Stats),
|
||||
}
|
||||
|
||||
impl MetricResult {
|
||||
fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
|
||||
match self {
|
||||
MetricResult::Average(avg) => Ok(avg.value),
|
||||
MetricResult::Stats(stats) => stats.get_value(agg_property),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl From<IntermediateMetricResult> for MetricResult {
|
||||
fn from(metric: IntermediateMetricResult) -> Self {
|
||||
match metric {
|
||||
IntermediateMetricResult::Average(avg_data) => {
|
||||
MetricResult::Average(avg_data.finalize().into())
|
||||
}
|
||||
IntermediateMetricResult::Stats(intermediate_stats) => {
|
||||
MetricResult::Stats(intermediate_stats.finalize())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// BucketEntry holds bucket aggregation result types.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum BucketResult {
|
||||
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
|
||||
/// sub_aggregations.
|
||||
Range {
|
||||
/// The range buckets sorted by range.
|
||||
buckets: BucketEntries<RangeBucketEntry>,
|
||||
},
|
||||
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
Histogram {
|
||||
/// The buckets.
|
||||
///
|
||||
/// If there are holes depends on the request, if min_doc_count is 0, then there are no
|
||||
/// holes between the first and last bucket.
|
||||
/// See [HistogramAggregation](super::bucket::HistogramAggregation)
|
||||
buckets: BucketEntries<BucketEntry>,
|
||||
},
|
||||
/// This is the term result
|
||||
Terms {
|
||||
/// The buckets.
|
||||
///
|
||||
/// See [TermsAggregation](super::bucket::TermsAggregation)
|
||||
buckets: Vec<BucketEntry>,
|
||||
/// The number of documents that didn’t make it into to TOP N due to shard_size or size
|
||||
sum_other_doc_count: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// The upper bound error for the doc count of each term.
|
||||
doc_count_error_upper_bound: Option<u64>,
|
||||
},
|
||||
}
|
||||
|
||||
impl BucketResult {
|
||||
pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result<Self> {
|
||||
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
|
||||
empty_bucket.into_final_bucket_result(req)
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the wrapper of buckets entries, which can be vector or hashmap
|
||||
/// depending on if it's keyed or not.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum BucketEntries<T> {
|
||||
/// Vector format bucket entries
|
||||
Vec(Vec<T>),
|
||||
/// HashMap format bucket entries
|
||||
HashMap(FnvHashMap<String, T>),
|
||||
}
|
||||
|
||||
/// This is the default entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// ...
|
||||
/// "my_histogram": {
|
||||
/// "buckets": [
|
||||
/// {
|
||||
/// "key": "2.0",
|
||||
/// "doc_count": 5
|
||||
/// },
|
||||
/// {
|
||||
/// "key": "4.0",
|
||||
/// "doc_count": 2
|
||||
/// },
|
||||
/// {
|
||||
/// "key": "6.0",
|
||||
/// "doc_count": 3
|
||||
/// }
|
||||
/// ]
|
||||
/// }
|
||||
/// ...
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct BucketEntry {
|
||||
/// The identifier of the bucket.
|
||||
pub key: Key,
|
||||
/// Number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
#[serde(flatten)]
|
||||
/// Sub-aggregations in this bucket.
|
||||
pub sub_aggregation: AggregationResults,
|
||||
}
|
||||
impl GetDocCount for &BucketEntry {
|
||||
fn doc_count(&self) -> u64 {
|
||||
self.doc_count
|
||||
}
|
||||
}
|
||||
impl GetDocCount for BucketEntry {
|
||||
fn doc_count(&self) -> u64 {
|
||||
self.doc_count
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the range entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// ...
|
||||
/// "my_ranges": {
|
||||
/// "buckets": [
|
||||
/// {
|
||||
/// "key": "*-10",
|
||||
/// "to": 10,
|
||||
/// "doc_count": 5
|
||||
/// },
|
||||
/// {
|
||||
/// "key": "10-20",
|
||||
/// "from": 10,
|
||||
/// "to": 20,
|
||||
/// "doc_count": 2
|
||||
/// },
|
||||
/// {
|
||||
/// "key": "20-*",
|
||||
/// "from": 20,
|
||||
/// "doc_count": 3
|
||||
/// }
|
||||
/// ]
|
||||
/// }
|
||||
/// ...
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RangeBucketEntry {
|
||||
/// The identifier of the bucket.
|
||||
pub key: Key,
|
||||
/// Number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
#[serde(flatten)]
|
||||
/// sub-aggregations in this bucket.
|
||||
pub sub_aggregation: AggregationResults,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
1442
src/aggregation/bucket/histogram/histogram.rs
Normal file
2
src/aggregation/bucket/histogram/mod.rs
Normal file
@@ -0,0 +1,2 @@
|
||||
mod histogram;
|
||||
pub use histogram::*;
|
||||
140
src/aggregation/bucket/mod.rs
Normal file
@@ -0,0 +1,140 @@
|
||||
//! Module for all bucket aggregations.
|
||||
//!
|
||||
//! BucketAggregations create buckets of documents
|
||||
//! [BucketAggregation](super::agg_req::BucketAggregation).
|
||||
//!
|
||||
//! Results of final buckets are [BucketResult](super::agg_result::BucketResult).
|
||||
//! Results of intermediate buckets are
|
||||
//! [IntermediateBucketResult](super::intermediate_agg_result::IntermediateBucketResult)
|
||||
|
||||
mod histogram;
|
||||
mod range;
|
||||
mod term_agg;
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub(crate) use histogram::SegmentHistogramCollector;
|
||||
pub use histogram::*;
|
||||
pub(crate) use range::SegmentRangeCollector;
|
||||
pub use range::*;
|
||||
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
|
||||
pub use term_agg::*;
|
||||
|
||||
/// Order for buckets in a bucket aggregation.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum Order {
|
||||
/// Asc order
|
||||
#[serde(rename = "asc")]
|
||||
Asc,
|
||||
/// Desc order
|
||||
#[serde(rename = "desc")]
|
||||
Desc,
|
||||
}
|
||||
|
||||
impl Default for Order {
|
||||
fn default() -> Self {
|
||||
Order::Desc
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
/// Order property by which to apply the order
|
||||
pub enum OrderTarget {
|
||||
/// The key of the bucket
|
||||
Key,
|
||||
/// The doc count of the bucket
|
||||
Count,
|
||||
/// Order by value of the sub aggregation metric with identified by given `String`.
|
||||
///
|
||||
/// Only single value metrics are supported currently
|
||||
SubAggregation(String),
|
||||
}
|
||||
|
||||
impl Default for OrderTarget {
|
||||
fn default() -> Self {
|
||||
OrderTarget::Count
|
||||
}
|
||||
}
|
||||
impl From<&str> for OrderTarget {
|
||||
fn from(val: &str) -> Self {
|
||||
match val {
|
||||
"_key" => OrderTarget::Key,
|
||||
"_count" => OrderTarget::Count,
|
||||
_ => OrderTarget::SubAggregation(val.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ToString for OrderTarget {
|
||||
fn to_string(&self) -> String {
|
||||
match self {
|
||||
OrderTarget::Key => "_key".to_string(),
|
||||
OrderTarget::Count => "_count".to_string(),
|
||||
OrderTarget::SubAggregation(agg) => agg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the order. target is either "_count", "_key", or the name of
|
||||
/// a metric sub_aggregation.
|
||||
///
|
||||
/// De/Serializes to elasticsearch compatible JSON.
|
||||
///
|
||||
/// Examples in JSON format:
|
||||
/// { "_count": "asc" }
|
||||
/// { "_key": "asc" }
|
||||
/// { "average_price": "asc" }
|
||||
#[derive(Clone, Default, Debug, PartialEq)]
|
||||
pub struct CustomOrder {
|
||||
/// The target property by which to sort by
|
||||
pub target: OrderTarget,
|
||||
/// The order asc or desc
|
||||
pub order: Order,
|
||||
}
|
||||
|
||||
impl Serialize for CustomOrder {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer {
|
||||
let map: HashMap<String, Order> =
|
||||
std::iter::once((self.target.to_string(), self.order)).collect();
|
||||
map.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for CustomOrder {
|
||||
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
|
||||
where D: Deserializer<'de> {
|
||||
HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
|
||||
if let Some((key, value)) = map.into_iter().next() {
|
||||
Ok(CustomOrder {
|
||||
target: key.as_str().into(),
|
||||
order: value,
|
||||
})
|
||||
} else {
|
||||
Err(de::Error::custom(
|
||||
"unexpected empty map in order".to_string(),
|
||||
))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn custom_order_serde_test() {
|
||||
let order = CustomOrder {
|
||||
target: OrderTarget::Key,
|
||||
order: Order::Desc,
|
||||
};
|
||||
|
||||
let order_str = serde_json::to_string(&order).unwrap();
|
||||
assert_eq!(order_str, "{\"_key\":\"desc\"}");
|
||||
let order_deser = serde_json::from_str(&order_str).unwrap();
|
||||
|
||||
assert_eq!(order, order_deser);
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
|
||||
assert!(order_deser.is_err());
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
|
||||
assert!(order_deser.is_err());
|
||||
}
|
||||
788
src/aggregation/bucket/range.rs
Normal file
@@ -0,0 +1,788 @@
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Range;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
use fnv::FnvHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor,
|
||||
};
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector};
|
||||
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key, SerializedKey};
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
/// Provide user-defined buckets to aggregate on.
|
||||
/// Two special buckets will automatically be created to cover the whole range of values.
|
||||
/// The provided buckets have to be continuous.
|
||||
/// During the aggregation, the values extracted from the fast_field `field` will be checked
|
||||
/// against each bucket range. Note that this aggregation includes the from value and excludes the
|
||||
/// to value for each range.
|
||||
///
|
||||
/// Result type is [BucketResult](crate::aggregation::agg_result::BucketResult) with
|
||||
/// [RangeBucketEntry](crate::aggregation::agg_result::RangeBucketEntry) on the
|
||||
/// AggregationCollector.
|
||||
///
|
||||
/// Result type is
|
||||
/// [crate::aggregation::intermediate_agg_result::IntermediateBucketResult] with
|
||||
/// [crate::aggregation::intermediate_agg_result::IntermediateRangeBucketEntry] on the
|
||||
/// DistributedAggregationCollector.
|
||||
///
|
||||
/// # Limitations/Compatibility
|
||||
/// Overlapping ranges are not yet supported.
|
||||
///
|
||||
/// # Request JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "my_ranges": {
|
||||
/// "field": "score",
|
||||
/// "ranges": [
|
||||
/// { "to": 3.0 },
|
||||
/// { "from": 3.0, "to": 7.0 },
|
||||
/// { "from": 7.0, "to": 20.0 },
|
||||
/// { "from": 20.0 }
|
||||
/// ]
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RangeAggregation {
|
||||
/// The field to aggregate on.
|
||||
pub field: String,
|
||||
/// Note that this aggregation includes the from value and excludes the to value for each
|
||||
/// range. Extra buckets will be created until the first to, and last from, if necessary.
|
||||
pub ranges: Vec<RangeAggregationRange>,
|
||||
/// Whether to return the buckets as a hash map
|
||||
#[serde(default)]
|
||||
pub keyed: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// The range for one range bucket.
|
||||
pub struct RangeAggregationRange {
|
||||
/// Custom key for the range bucket
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub key: Option<String>,
|
||||
/// The from range value, which is inclusive in the range.
|
||||
/// None equals to an open ended interval.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub from: Option<f64>,
|
||||
/// The to range value, which is not inclusive in the range.
|
||||
/// None equals to an open ended interval.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl From<Range<f64>> for RangeAggregationRange {
|
||||
fn from(range: Range<f64>) -> Self {
|
||||
let from = if range.start == f64::MIN {
|
||||
None
|
||||
} else {
|
||||
Some(range.start)
|
||||
};
|
||||
let to = if range.end == f64::MAX {
|
||||
None
|
||||
} else {
|
||||
Some(range.end)
|
||||
};
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
from,
|
||||
to,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
/// Internally used u64 range for one range bucket.
|
||||
pub(crate) struct InternalRangeAggregationRange {
|
||||
/// Custom key for the range bucket
|
||||
key: Option<String>,
|
||||
/// u64 range value
|
||||
range: Range<u64>,
|
||||
}
|
||||
|
||||
impl From<Range<u64>> for InternalRangeAggregationRange {
|
||||
fn from(range: Range<u64>) -> Self {
|
||||
InternalRangeAggregationRange { key: None, range }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) struct SegmentRangeAndBucketEntry {
|
||||
range: Range<u64>,
|
||||
bucket: SegmentRangeBucketEntry,
|
||||
}
|
||||
|
||||
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||
/// the correct datatype.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub struct SegmentRangeCollector {
|
||||
/// The buckets containing the aggregation data.
|
||||
buckets: Vec<SegmentRangeAndBucketEntry>,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentRangeBucketEntry {
|
||||
pub key: Key,
|
||||
pub doc_count: u64,
|
||||
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None. Open interval, `to` is not
|
||||
/// inclusive.
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl Debug for SegmentRangeBucketEntry {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentRangeBucketEntry")
|
||||
.field("key", &self.key)
|
||||
.field("doc_count", &self.doc_count)
|
||||
.field("from", &self.from)
|
||||
.field("to", &self.to)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
impl SegmentRangeBucketEntry {
|
||||
pub(crate) fn into_intermediate_bucket_entry(
|
||||
self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<IntermediateRangeBucketEntry> {
|
||||
let sub_aggregation = if let Some(sub_aggregation) = self.sub_aggregation {
|
||||
sub_aggregation.into_intermediate_aggregations_result(agg_with_accessor)?
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
Ok(IntermediateRangeBucketEntry {
|
||||
key: self.key,
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation,
|
||||
from: self.from,
|
||||
to: self.to,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentRangeCollector {
|
||||
pub fn into_intermediate_bucket_result(
|
||||
self,
|
||||
agg_with_accessor: &BucketAggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
let field_type = self.field_type;
|
||||
|
||||
let buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
|
||||
.buckets
|
||||
.into_iter()
|
||||
.map(move |range_bucket| {
|
||||
Ok((
|
||||
range_to_string(&range_bucket.range, &field_type),
|
||||
range_bucket
|
||||
.bucket
|
||||
.into_intermediate_bucket_entry(&agg_with_accessor.sub_aggregation)?,
|
||||
))
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
|
||||
Ok(IntermediateBucketResult::Range(
|
||||
IntermediateRangeBucketResult { buckets },
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(
|
||||
req: &RangeAggregation,
|
||||
sub_aggregation: &AggregationsWithAccessor,
|
||||
bucket_count: &BucketCount,
|
||||
field_type: Type,
|
||||
) -> crate::Result<Self> {
|
||||
// The range input on the request is f64.
|
||||
// We need to convert to u64 ranges, because we read the values as u64.
|
||||
// The mapping from the conversion is monotonic so ordering is preserved.
|
||||
let buckets: Vec<_> = extend_validate_ranges(&req.ranges, &field_type)?
|
||||
.iter()
|
||||
.map(|range| {
|
||||
let key = range
|
||||
.key
|
||||
.clone()
|
||||
.map(Key::Str)
|
||||
.unwrap_or_else(|| range_to_key(&range.range, &field_type));
|
||||
let to = if range.range.end == u64::MAX {
|
||||
None
|
||||
} else {
|
||||
Some(f64_from_fastfield_u64(range.range.end, &field_type))
|
||||
};
|
||||
let from = if range.range.start == u64::MIN {
|
||||
None
|
||||
} else {
|
||||
Some(f64_from_fastfield_u64(range.range.start, &field_type))
|
||||
};
|
||||
let sub_aggregation = if sub_aggregation.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(SegmentAggregationResultsCollector::from_req_and_validate(
|
||||
sub_aggregation,
|
||||
)?)
|
||||
};
|
||||
Ok(SegmentRangeAndBucketEntry {
|
||||
range: range.range.clone(),
|
||||
bucket: SegmentRangeBucketEntry {
|
||||
doc_count: 0,
|
||||
sub_aggregation,
|
||||
key,
|
||||
from,
|
||||
to,
|
||||
},
|
||||
})
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
|
||||
bucket_count.add_count(buckets.len() as u32);
|
||||
bucket_count.validate_bucket_count()?;
|
||||
|
||||
Ok(SegmentRangeCollector {
|
||||
buckets,
|
||||
field_type,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn collect_block(
|
||||
&mut self,
|
||||
doc: &[DocId],
|
||||
bucket_with_accessor: &BucketAggregationWithAccessor,
|
||||
force_flush: bool,
|
||||
) -> crate::Result<()> {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
let accessor = bucket_with_accessor
|
||||
.accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinatility");
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = accessor.get_val(docs[0] as u64);
|
||||
let val2 = accessor.get_val(docs[1] as u64);
|
||||
let val3 = accessor.get_val(docs[2] as u64);
|
||||
let val4 = accessor.get_val(docs[3] as u64);
|
||||
let bucket_pos1 = self.get_bucket_pos(val1);
|
||||
let bucket_pos2 = self.get_bucket_pos(val2);
|
||||
let bucket_pos3 = self.get_bucket_pos(val3);
|
||||
let bucket_pos4 = self.get_bucket_pos(val4);
|
||||
|
||||
self.increment_bucket(bucket_pos1, docs[0], &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos2, docs[1], &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?;
|
||||
}
|
||||
for &doc in iter.remainder() {
|
||||
let val = accessor.get_val(doc as u64);
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
|
||||
}
|
||||
if force_flush {
|
||||
for bucket in &mut self.buckets {
|
||||
if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation {
|
||||
sub_aggregation
|
||||
.flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn increment_bucket(
|
||||
&mut self,
|
||||
bucket_pos: usize,
|
||||
doc: DocId,
|
||||
bucket_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<()> {
|
||||
let bucket = &mut self.buckets[bucket_pos];
|
||||
|
||||
bucket.bucket.doc_count += 1;
|
||||
if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation {
|
||||
sub_aggregation.collect(doc, bucket_with_accessor)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_bucket_pos(&self, val: u64) -> usize {
|
||||
let pos = self
|
||||
.buckets
|
||||
.binary_search_by_key(&val, |probe| probe.range.start)
|
||||
.unwrap_or_else(|pos| pos - 1);
|
||||
debug_assert!(self.buckets[pos].range.contains(&val));
|
||||
pos
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts the user provided f64 range value to fast field value space.
|
||||
///
|
||||
/// Internally fast field values are always stored as u64.
|
||||
/// If the fast field has u64 [1,2,5], these values are stored as is in the fast field.
|
||||
/// A fast field with f64 [1.0, 2.0, 5.0] is converted to u64 space, using a
|
||||
/// monotonic mapping function, so the order is preserved.
|
||||
///
|
||||
/// Consequently, a f64 user range 1.0..3.0 needs to be converted to fast field value space using
|
||||
/// the same monotonic mapping function, so that the provided ranges contain the u64 values in the
|
||||
/// fast field.
|
||||
/// The alternative would be that every value read would be converted to the f64 range, but that is
|
||||
/// more computational expensive when many documents are hit.
|
||||
fn to_u64_range(
|
||||
range: &RangeAggregationRange,
|
||||
field_type: &Type,
|
||||
) -> crate::Result<InternalRangeAggregationRange> {
|
||||
let start = if let Some(from) = range.from {
|
||||
f64_to_fastfield_u64(from, field_type)
|
||||
.ok_or_else(|| TantivyError::InvalidArgument("invalid field type".to_string()))?
|
||||
} else {
|
||||
u64::MIN
|
||||
};
|
||||
|
||||
let end = if let Some(to) = range.to {
|
||||
f64_to_fastfield_u64(to, field_type)
|
||||
.ok_or_else(|| TantivyError::InvalidArgument("invalid field type".to_string()))?
|
||||
} else {
|
||||
u64::MAX
|
||||
};
|
||||
|
||||
Ok(InternalRangeAggregationRange {
|
||||
key: range.key.clone(),
|
||||
range: start..end,
|
||||
})
|
||||
}
|
||||
|
||||
/// Extends the provided buckets to contain the whole value range, by inserting buckets at the
|
||||
/// beginning and end and filling gaps.
|
||||
fn extend_validate_ranges(
|
||||
buckets: &[RangeAggregationRange],
|
||||
field_type: &Type,
|
||||
) -> crate::Result<Vec<InternalRangeAggregationRange>> {
|
||||
let mut converted_buckets = buckets
|
||||
.iter()
|
||||
.map(|range| to_u64_range(range, field_type))
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
converted_buckets.sort_by_key(|bucket| bucket.range.start);
|
||||
if converted_buckets[0].range.start != u64::MIN {
|
||||
converted_buckets.insert(0, (u64::MIN..converted_buckets[0].range.start).into());
|
||||
}
|
||||
|
||||
if converted_buckets[converted_buckets.len() - 1].range.end != u64::MAX {
|
||||
converted_buckets
|
||||
.push((converted_buckets[converted_buckets.len() - 1].range.end..u64::MAX).into());
|
||||
}
|
||||
|
||||
// fill up holes in the ranges
|
||||
let find_hole = |converted_buckets: &[InternalRangeAggregationRange]| {
|
||||
for (pos, ranges) in converted_buckets.windows(2).enumerate() {
|
||||
if ranges[0].range.end > ranges[1].range.start {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Overlapping ranges not supported range {:?}, range+1 {:?}",
|
||||
ranges[0], ranges[1]
|
||||
)));
|
||||
}
|
||||
if ranges[0].range.end != ranges[1].range.start {
|
||||
return Ok(Some(pos));
|
||||
}
|
||||
}
|
||||
Ok(None)
|
||||
};
|
||||
|
||||
while let Some(hole_pos) = find_hole(&converted_buckets)? {
|
||||
let new_range =
|
||||
converted_buckets[hole_pos].range.end..converted_buckets[hole_pos + 1].range.start;
|
||||
converted_buckets.insert(hole_pos + 1, new_range.into());
|
||||
}
|
||||
|
||||
Ok(converted_buckets)
|
||||
}
|
||||
|
||||
pub(crate) fn range_to_string(range: &Range<u64>, field_type: &Type) -> String {
|
||||
// is_start is there for malformed requests, e.g. ig the user passes the range u64::MIN..0.0,
|
||||
// it should be rendered as "*-0" and not "*-*"
|
||||
let to_str = |val: u64, is_start: bool| {
|
||||
if (is_start && val == u64::MIN) || (!is_start && val == u64::MAX) {
|
||||
"*".to_string()
|
||||
} else {
|
||||
f64_from_fastfield_u64(val, field_type).to_string()
|
||||
}
|
||||
};
|
||||
|
||||
format!("{}-{}", to_str(range.start, true), to_str(range.end, false))
|
||||
}
|
||||
|
||||
pub(crate) fn range_to_key(range: &Range<u64>, field_type: &Type) -> Key {
|
||||
Key::Str(range_to_string(range, field_type))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::aggregation::agg_req::{
|
||||
Aggregation, Aggregations, BucketAggregation, BucketAggregationType,
|
||||
};
|
||||
use crate::aggregation::tests::{exec_request_with_query, get_test_index_with_num_docs};
|
||||
use crate::fastfield::FastValue;
|
||||
|
||||
pub fn get_collector_from_ranges(
|
||||
ranges: Vec<RangeAggregationRange>,
|
||||
field_type: Type,
|
||||
) -> SegmentRangeCollector {
|
||||
let req = RangeAggregation {
|
||||
field: "dummy".to_string(),
|
||||
ranges,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
SegmentRangeCollector::from_req_and_validate(
|
||||
&req,
|
||||
&Default::default(),
|
||||
&Default::default(),
|
||||
field_type,
|
||||
)
|
||||
.expect("unexpected error")
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_fraction_test() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(false, 100)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "fraction_f64".to_string(),
|
||||
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
assert_eq!(res["range"]["buckets"][0]["key"], "*-0");
|
||||
assert_eq!(res["range"]["buckets"][0]["doc_count"], 0);
|
||||
assert_eq!(res["range"]["buckets"][1]["key"], "0-0.1");
|
||||
assert_eq!(res["range"]["buckets"][1]["doc_count"], 10);
|
||||
assert_eq!(res["range"]["buckets"][2]["key"], "0.1-0.2");
|
||||
assert_eq!(res["range"]["buckets"][2]["doc_count"], 10);
|
||||
assert_eq!(res["range"]["buckets"][3]["key"], "0.2-*");
|
||||
assert_eq!(res["range"]["buckets"][3]["doc_count"], 80);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_keyed_buckets_test() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(false, 100)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "fraction_f64".to_string(),
|
||||
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
|
||||
keyed: true,
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
assert_eq!(
|
||||
res,
|
||||
json!({
|
||||
"range": {
|
||||
"buckets": {
|
||||
"*-0": { "key": "*-0", "doc_count": 0, "to": 0.0},
|
||||
"0-0.1": {"key": "0-0.1", "doc_count": 10, "from": 0.0, "to": 0.1},
|
||||
"0.1-0.2": {"key": "0.1-0.2", "doc_count": 10, "from": 0.1, "to": 0.2},
|
||||
"0.2-*": {"key": "0.2-*", "doc_count": 80, "from": 0.2},
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_custom_key_test() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(false, 100)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "fraction_f64".to_string(),
|
||||
ranges: vec![
|
||||
RangeAggregationRange {
|
||||
key: Some("custom-key-0-to-0.1".to_string()),
|
||||
from: Some(0f64),
|
||||
to: Some(0.1f64),
|
||||
},
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
from: Some(0.1f64),
|
||||
to: Some(0.2f64),
|
||||
},
|
||||
],
|
||||
keyed: false,
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
assert_eq!(
|
||||
res,
|
||||
json!({
|
||||
"range": {
|
||||
"buckets": [
|
||||
{"key": "*-0", "doc_count": 0, "to": 0.0},
|
||||
{"key": "custom-key-0-to-0.1", "doc_count": 10, "from": 0.0, "to": 0.1},
|
||||
{"key": "0.1-0.2", "doc_count": 10, "from": 0.1, "to": 0.2},
|
||||
{"key": "0.2-*", "doc_count": 80, "from": 0.2}
|
||||
]
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_custom_key_keyed_buckets_test() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(false, 100)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "fraction_f64".to_string(),
|
||||
ranges: vec![RangeAggregationRange {
|
||||
key: Some("custom-key-0-to-0.1".to_string()),
|
||||
from: Some(0f64),
|
||||
to: Some(0.1f64),
|
||||
}],
|
||||
keyed: true,
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
assert_eq!(
|
||||
res,
|
||||
json!({
|
||||
"range": {
|
||||
"buckets": {
|
||||
"*-0": { "key": "*-0", "doc_count": 0, "to": 0.0},
|
||||
"custom-key-0-to-0.1": {"key": "custom-key-0-to-0.1", "doc_count": 10, "from": 0.0, "to": 0.1},
|
||||
"0.1-*": {"key": "0.1-*", "doc_count": 90, "from": 0.1},
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bucket_test_extend_range_hole() {
|
||||
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, Type::F64);
|
||||
|
||||
let buckets = collector.buckets;
|
||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.end, 20f64.to_u64());
|
||||
// Added bucket to fill hole
|
||||
assert_eq!(buckets[2].range.start, 20f64.to_u64());
|
||||
assert_eq!(buckets[2].range.end, 30f64.to_u64());
|
||||
assert_eq!(buckets[3].range.start, 30f64.to_u64());
|
||||
assert_eq!(buckets[3].range.end, 40f64.to_u64());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bucket_test_range_conversion_special_case() {
|
||||
// the monotonic conversion between f64 and u64, does not map f64::MIN.to_u64() ==
|
||||
// u64::MIN, but the into trait converts f64::MIN/MAX to None
|
||||
let buckets = vec![
|
||||
(f64::MIN..10f64).into(),
|
||||
(10f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
];
|
||||
let collector = get_collector_from_ranges(buckets, Type::F64);
|
||||
|
||||
let buckets = collector.buckets;
|
||||
assert_eq!(buckets[0].range.start, u64::MIN);
|
||||
assert_eq!(buckets[0].range.end, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.start, 10f64.to_u64());
|
||||
assert_eq!(buckets[1].range.end, 20f64.to_u64());
|
||||
assert_eq!(buckets[2].range.start, 20f64.to_u64());
|
||||
assert_eq!(buckets[2].range.end, u64::MAX);
|
||||
assert_eq!(buckets.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bucket_range_test_negative_vals() {
|
||||
let buckets = vec![(-10f64..-1f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, Type::F64);
|
||||
|
||||
let buckets = collector.buckets;
|
||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
|
||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
|
||||
}
|
||||
#[test]
|
||||
fn bucket_range_test_positive_vals() {
|
||||
let buckets = vec![(0f64..10f64).into()];
|
||||
let collector = get_collector_from_ranges(buckets, Type::F64);
|
||||
|
||||
let buckets = collector.buckets;
|
||||
assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
|
||||
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_u64() {
|
||||
let check_ranges = |ranges: Vec<RangeAggregationRange>| {
|
||||
let collector = get_collector_from_ranges(ranges, Type::U64);
|
||||
let search = |val: u64| collector.get_bucket_pos(val);
|
||||
|
||||
assert_eq!(search(u64::MIN), 0);
|
||||
assert_eq!(search(9), 0);
|
||||
assert_eq!(search(10), 1);
|
||||
assert_eq!(search(11), 1);
|
||||
assert_eq!(search(99), 1);
|
||||
assert_eq!(search(100), 2);
|
||||
assert_eq!(search(u64::MAX - 1), 2); // Since the end range is never included, the max
|
||||
// value
|
||||
};
|
||||
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
check_ranges(ranges);
|
||||
|
||||
let ranges = vec![
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
to: Some(10.0),
|
||||
from: None,
|
||||
},
|
||||
(10.0..100.0).into(),
|
||||
];
|
||||
check_ranges(ranges);
|
||||
|
||||
let ranges = vec![
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
to: Some(10.0),
|
||||
from: None,
|
||||
},
|
||||
(10.0..100.0).into(),
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
to: None,
|
||||
from: Some(100.0),
|
||||
},
|
||||
];
|
||||
check_ranges(ranges);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_f64() {
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
|
||||
let collector = get_collector_from_ranges(ranges, Type::F64);
|
||||
let search = |val: u64| collector.get_bucket_pos(val);
|
||||
|
||||
assert_eq!(search(u64::MIN), 0);
|
||||
assert_eq!(search(9f64.to_u64()), 0);
|
||||
assert_eq!(search(10f64.to_u64()), 1);
|
||||
assert_eq!(search(11f64.to_u64()), 1);
|
||||
assert_eq!(search(99f64.to_u64()), 1);
|
||||
assert_eq!(search(100f64.to_u64()), 2);
|
||||
assert_eq!(search(u64::MAX - 1), 2); // Since the end range is never included,
|
||||
// the max value
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use itertools::Itertools;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
|
||||
use super::*;
|
||||
use crate::aggregation::bucket::range::tests::get_collector_from_ranges;
|
||||
|
||||
const TOTAL_DOCS: u64 = 1_000_000u64;
|
||||
const NUM_DOCS: u64 = 50_000u64;
|
||||
|
||||
fn get_collector_with_buckets(num_buckets: u64, num_docs: u64) -> SegmentRangeCollector {
|
||||
let bucket_size = num_docs / num_buckets;
|
||||
let mut buckets: Vec<RangeAggregationRange> = vec![];
|
||||
for i in 0..num_buckets {
|
||||
let bucket_start = (i * bucket_size) as f64;
|
||||
buckets.push((bucket_start..bucket_start + bucket_size as f64).into())
|
||||
}
|
||||
|
||||
get_collector_from_ranges(buckets, Type::U64)
|
||||
}
|
||||
|
||||
fn get_rand_docs(total_docs: u64, num_docs_returned: u64) -> Vec<u64> {
|
||||
let mut rng = thread_rng();
|
||||
|
||||
let all_docs = (0..total_docs - 1).collect_vec();
|
||||
let mut vals = all_docs
|
||||
.as_slice()
|
||||
.choose_multiple(&mut rng, num_docs_returned as usize)
|
||||
.cloned()
|
||||
.collect_vec();
|
||||
vals.sort();
|
||||
vals
|
||||
}
|
||||
|
||||
fn bench_range_binary_search(b: &mut test::Bencher, num_buckets: u64) {
|
||||
let collector = get_collector_with_buckets(num_buckets, TOTAL_DOCS);
|
||||
let vals = get_rand_docs(TOTAL_DOCS, NUM_DOCS);
|
||||
b.iter(|| {
|
||||
let mut bucket_pos = 0;
|
||||
for val in &vals {
|
||||
bucket_pos = collector.get_bucket_pos(*val);
|
||||
}
|
||||
bucket_pos
|
||||
})
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_range_100_buckets(b: &mut test::Bencher) {
|
||||
bench_range_binary_search(b, 100)
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_range_10_buckets(b: &mut test::Bencher) {
|
||||
bench_range_binary_search(b, 10)
|
||||
}
|
||||
}
|
||||
1364
src/aggregation/bucket/term_agg.rs
Normal file
183
src/aggregation/collector.rs
Normal file
@@ -0,0 +1,183 @@
|
||||
use std::rc::Rc;
|
||||
|
||||
use super::agg_req::Aggregations;
|
||||
use super::agg_req_with_accessor::AggregationsWithAccessor;
|
||||
use super::agg_result::AggregationResults;
|
||||
use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use super::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_validate;
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::{SegmentReader, TantivyError};
|
||||
|
||||
/// The default max bucket count, before the aggregation fails.
|
||||
pub const MAX_BUCKET_COUNT: u32 = 65000;
|
||||
|
||||
/// Collector for aggregations.
|
||||
///
|
||||
/// The collector collects all aggregations by the underlying aggregation request.
|
||||
pub struct AggregationCollector {
|
||||
agg: Aggregations,
|
||||
max_bucket_count: u32,
|
||||
}
|
||||
|
||||
impl AggregationCollector {
|
||||
/// Create collector from aggregation request.
|
||||
///
|
||||
/// Aggregation fails when the total bucket count is higher than max_bucket_count.
|
||||
/// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset
|
||||
pub fn from_aggs(agg: Aggregations, max_bucket_count: Option<u32>) -> Self {
|
||||
Self {
|
||||
agg,
|
||||
max_bucket_count: max_bucket_count.unwrap_or(MAX_BUCKET_COUNT),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Collector for distributed aggregations.
|
||||
///
|
||||
/// The collector collects all aggregations by the underlying aggregation request.
|
||||
///
|
||||
/// # Purpose
|
||||
/// AggregationCollector returns `IntermediateAggregationResults` and not the final
|
||||
/// `AggregationResults`, so that results from different indices can be merged and then converted
|
||||
/// into the final `AggregationResults` via the `into_final_result()` method.
|
||||
pub struct DistributedAggregationCollector {
|
||||
agg: Aggregations,
|
||||
max_bucket_count: u32,
|
||||
}
|
||||
|
||||
impl DistributedAggregationCollector {
|
||||
/// Create collector from aggregation request.
|
||||
///
|
||||
/// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset
|
||||
pub fn from_aggs(agg: Aggregations, max_bucket_count: Option<u32>) -> Self {
|
||||
Self {
|
||||
agg,
|
||||
max_bucket_count: max_bucket_count.unwrap_or(MAX_BUCKET_COUNT),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for DistributedAggregationCollector {
|
||||
type Fruit = IntermediateAggregationResults;
|
||||
|
||||
type Child = AggregationSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
AggregationSegmentCollector::from_agg_req_and_reader(
|
||||
&self.agg,
|
||||
reader,
|
||||
self.max_bucket_count,
|
||||
)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<Self::Fruit> {
|
||||
merge_fruits(segment_fruits)
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for AggregationCollector {
|
||||
type Fruit = AggregationResults;
|
||||
|
||||
type Child = AggregationSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
AggregationSegmentCollector::from_agg_req_and_reader(
|
||||
&self.agg,
|
||||
reader,
|
||||
self.max_bucket_count,
|
||||
)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<Self::Fruit> {
|
||||
let res = merge_fruits(segment_fruits)?;
|
||||
res.into_final_bucket_result(self.agg.clone())
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
mut segment_fruits: Vec<crate::Result<IntermediateAggregationResults>>,
|
||||
) -> crate::Result<IntermediateAggregationResults> {
|
||||
if let Some(fruit) = segment_fruits.pop() {
|
||||
let mut fruit = fruit?;
|
||||
for next_fruit in segment_fruits {
|
||||
fruit.merge_fruits(next_fruit?);
|
||||
}
|
||||
Ok(fruit)
|
||||
} else {
|
||||
Ok(IntermediateAggregationResults::default())
|
||||
}
|
||||
}
|
||||
|
||||
/// AggregationSegmentCollector does the aggregation collection on a segment.
|
||||
pub struct AggregationSegmentCollector {
|
||||
aggs_with_accessor: AggregationsWithAccessor,
|
||||
result: SegmentAggregationResultsCollector,
|
||||
error: Option<TantivyError>,
|
||||
}
|
||||
|
||||
impl AggregationSegmentCollector {
|
||||
/// Creates an AggregationSegmentCollector from an [Aggregations] request and a segment reader.
|
||||
/// Also includes validation, e.g. checking field types and existence.
|
||||
pub fn from_agg_req_and_reader(
|
||||
agg: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
max_bucket_count: u32,
|
||||
) -> crate::Result<Self> {
|
||||
let aggs_with_accessor =
|
||||
get_aggs_with_accessor_and_validate(agg, reader, Rc::default(), max_bucket_count)?;
|
||||
let result =
|
||||
SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?;
|
||||
Ok(AggregationSegmentCollector {
|
||||
aggs_with_accessor,
|
||||
result,
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for AggregationSegmentCollector {
|
||||
type Fruit = crate::Result<IntermediateAggregationResults>;
|
||||
|
||||
#[inline]
|
||||
fn collect(&mut self, doc: crate::DocId, _score: crate::Score) {
|
||||
if self.error.is_some() {
|
||||
return;
|
||||
}
|
||||
if let Err(err) = self.result.collect(doc, &self.aggs_with_accessor) {
|
||||
self.error = Some(err);
|
||||
}
|
||||
}
|
||||
|
||||
fn harvest(mut self) -> Self::Fruit {
|
||||
if let Some(err) = self.error {
|
||||
return Err(err);
|
||||
}
|
||||
self.result
|
||||
.flush_staged_docs(&self.aggs_with_accessor, true)?;
|
||||
self.result
|
||||
.into_intermediate_aggregations_result(&self.aggs_with_accessor)
|
||||
}
|
||||
}
|
||||
738
src/aggregation/intermediate_agg_result.rs
Normal file
@@ -0,0 +1,738 @@
|
||||
//! Contains the intermediate aggregation tree, that can be merged.
|
||||
//! Intermediate aggregation results can be used to merge results between segments or between
|
||||
//! indices.
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::agg_req::{
|
||||
Aggregations, AggregationsInternal, BucketAggregationInternal, BucketAggregationType,
|
||||
MetricAggregation,
|
||||
};
|
||||
use super::agg_result::{AggregationResult, BucketResult, RangeBucketEntry};
|
||||
use super::bucket::{
|
||||
cut_off_buckets, get_agg_name_and_property, intermediate_histogram_buckets_to_final_buckets,
|
||||
GetDocCount, Order, OrderTarget, SegmentHistogramBucketEntry, TermsAggregation,
|
||||
};
|
||||
use super::metric::{IntermediateAverage, IntermediateStats};
|
||||
use super::segment_agg_result::SegmentMetricResultCollector;
|
||||
use super::{Key, SerializedKey, VecWithNames};
|
||||
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||
|
||||
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
||||
/// intermediate results.
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateAggregationResults {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(crate) metrics: Option<VecWithNames<IntermediateMetricResult>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(crate) buckets: Option<VecWithNames<IntermediateBucketResult>>,
|
||||
}
|
||||
|
||||
impl IntermediateAggregationResults {
|
||||
/// Convert intermediate result and its aggregation request to the final result.
|
||||
pub fn into_final_bucket_result(self, req: Aggregations) -> crate::Result<AggregationResults> {
|
||||
self.into_final_bucket_result_internal(&(req.into()))
|
||||
}
|
||||
|
||||
/// Convert intermediate result and its aggregation request to the final result.
|
||||
///
|
||||
/// Internal function, AggregationsInternal is used instead Aggregations, which is optimized
|
||||
/// for internal processing, by splitting metric and buckets into separate groups.
|
||||
pub(crate) fn into_final_bucket_result_internal(
|
||||
self,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<AggregationResults> {
|
||||
// Important assumption:
|
||||
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
|
||||
// request
|
||||
let mut results: HashMap<String, AggregationResult> = HashMap::new();
|
||||
|
||||
if let Some(buckets) = self.buckets {
|
||||
convert_and_add_final_buckets_to_result(&mut results, buckets, &req.buckets)?
|
||||
} else {
|
||||
// When there are no buckets, we create empty buckets, so that the serialized json
|
||||
// format is constant
|
||||
add_empty_final_buckets_to_result(&mut results, &req.buckets)?
|
||||
};
|
||||
|
||||
if let Some(metrics) = self.metrics {
|
||||
convert_and_add_final_metrics_to_result(&mut results, metrics);
|
||||
} else {
|
||||
// When there are no metrics, we create empty metric results, so that the serialized
|
||||
// json format is constant
|
||||
add_empty_final_metrics_to_result(&mut results, &req.metrics)?;
|
||||
}
|
||||
|
||||
Ok(AggregationResults(results))
|
||||
}
|
||||
|
||||
pub(crate) fn empty_from_req(req: &AggregationsInternal) -> Self {
|
||||
let metrics = if req.metrics.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let metrics = req
|
||||
.metrics
|
||||
.iter()
|
||||
.map(|(key, req)| {
|
||||
(
|
||||
key.to_string(),
|
||||
IntermediateMetricResult::empty_from_req(req),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
Some(VecWithNames::from_entries(metrics))
|
||||
};
|
||||
|
||||
let buckets = if req.buckets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let buckets = req
|
||||
.buckets
|
||||
.iter()
|
||||
.map(|(key, req)| {
|
||||
(
|
||||
key.to_string(),
|
||||
IntermediateBucketResult::empty_from_req(&req.bucket_agg),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
Some(VecWithNames::from_entries(buckets))
|
||||
};
|
||||
|
||||
Self { metrics, buckets }
|
||||
}
|
||||
|
||||
/// Merge an other intermediate aggregation result into this result.
|
||||
///
|
||||
/// The order of the values need to be the same on both results. This is ensured when the same
|
||||
/// (key values) are present on the underlying VecWithNames struct.
|
||||
pub fn merge_fruits(&mut self, other: IntermediateAggregationResults) {
|
||||
if let (Some(buckets_left), Some(buckets_right)) = (&mut self.buckets, other.buckets) {
|
||||
for (bucket_left, bucket_right) in
|
||||
buckets_left.values_mut().zip(buckets_right.into_values())
|
||||
{
|
||||
bucket_left.merge_fruits(bucket_right);
|
||||
}
|
||||
}
|
||||
|
||||
if let (Some(metrics_left), Some(metrics_right)) = (&mut self.metrics, other.metrics) {
|
||||
for (metric_left, metric_right) in
|
||||
metrics_left.values_mut().zip(metrics_right.into_values())
|
||||
{
|
||||
metric_left.merge_fruits(metric_right);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn convert_and_add_final_metrics_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
metrics: VecWithNames<IntermediateMetricResult>,
|
||||
) {
|
||||
results.extend(
|
||||
metrics
|
||||
.into_iter()
|
||||
.map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))),
|
||||
);
|
||||
}
|
||||
|
||||
fn add_empty_final_metrics_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
req_metrics: &VecWithNames<MetricAggregation>,
|
||||
) -> crate::Result<()> {
|
||||
results.extend(req_metrics.iter().map(|(key, req)| {
|
||||
let empty_bucket = IntermediateMetricResult::empty_from_req(req);
|
||||
(
|
||||
key.to_string(),
|
||||
AggregationResult::MetricResult(empty_bucket.into()),
|
||||
)
|
||||
}));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn add_empty_final_buckets_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
req_buckets: &VecWithNames<BucketAggregationInternal>,
|
||||
) -> crate::Result<()> {
|
||||
let requested_buckets = req_buckets.iter();
|
||||
for (key, req) in requested_buckets {
|
||||
let empty_bucket = AggregationResult::BucketResult(BucketResult::empty_from_req(req)?);
|
||||
results.insert(key.to_string(), empty_bucket);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn convert_and_add_final_buckets_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
buckets: VecWithNames<IntermediateBucketResult>,
|
||||
req_buckets: &VecWithNames<BucketAggregationInternal>,
|
||||
) -> crate::Result<()> {
|
||||
assert_eq!(buckets.len(), req_buckets.len());
|
||||
|
||||
let buckets_with_request = buckets.into_iter().zip(req_buckets.values());
|
||||
for ((key, bucket), req) in buckets_with_request {
|
||||
let result = AggregationResult::BucketResult(bucket.into_final_bucket_result(req)?);
|
||||
results.insert(key, result);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// An aggregation is either a bucket or a metric.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateAggregationResult {
|
||||
/// Bucket variant
|
||||
Bucket(IntermediateBucketResult),
|
||||
/// Metric variant
|
||||
Metric(IntermediateMetricResult),
|
||||
}
|
||||
|
||||
/// Holds the intermediate data for metric results
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateMetricResult {
|
||||
/// Average containing intermediate average data result
|
||||
Average(IntermediateAverage),
|
||||
/// AverageData variant
|
||||
Stats(IntermediateStats),
|
||||
}
|
||||
|
||||
impl From<SegmentMetricResultCollector> for IntermediateMetricResult {
|
||||
fn from(tree: SegmentMetricResultCollector) -> Self {
|
||||
match tree {
|
||||
SegmentMetricResultCollector::Average(collector) => {
|
||||
IntermediateMetricResult::Average(IntermediateAverage::from_collector(collector))
|
||||
}
|
||||
SegmentMetricResultCollector::Stats(collector) => {
|
||||
IntermediateMetricResult::Stats(collector.stats)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateMetricResult {
|
||||
pub(crate) fn empty_from_req(req: &MetricAggregation) -> Self {
|
||||
match req {
|
||||
MetricAggregation::Average(_) => {
|
||||
IntermediateMetricResult::Average(IntermediateAverage::default())
|
||||
}
|
||||
MetricAggregation::Stats(_) => {
|
||||
IntermediateMetricResult::Stats(IntermediateStats::default())
|
||||
}
|
||||
}
|
||||
}
|
||||
fn merge_fruits(&mut self, other: IntermediateMetricResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateMetricResult::Average(avg_data_left),
|
||||
IntermediateMetricResult::Average(avg_data_right),
|
||||
) => {
|
||||
avg_data_left.merge_fruits(avg_data_right);
|
||||
}
|
||||
(
|
||||
IntermediateMetricResult::Stats(stats_left),
|
||||
IntermediateMetricResult::Stats(stats_right),
|
||||
) => {
|
||||
stats_left.merge_fruits(stats_right);
|
||||
}
|
||||
_ => {
|
||||
panic!("incompatible fruit types in tree");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The intermediate bucket results. Internally they can be easily merged via the keys of the
|
||||
/// buckets.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateBucketResult {
|
||||
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
|
||||
/// sub_aggregations.
|
||||
Range(IntermediateRangeBucketResult),
|
||||
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
Histogram {
|
||||
/// The buckets
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
},
|
||||
/// Term aggregation
|
||||
Terms(IntermediateTermBucketResult),
|
||||
}
|
||||
|
||||
impl IntermediateBucketResult {
|
||||
pub(crate) fn into_final_bucket_result(
|
||||
self,
|
||||
req: &BucketAggregationInternal,
|
||||
) -> crate::Result<BucketResult> {
|
||||
match self {
|
||||
IntermediateBucketResult::Range(range_res) => {
|
||||
let mut buckets: Vec<RangeBucketEntry> = range_res
|
||||
.buckets
|
||||
.into_iter()
|
||||
.map(|(_, bucket)| bucket.into_final_bucket_entry(&req.sub_aggregation))
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
buckets.sort_by(|left, right| {
|
||||
left.from
|
||||
.unwrap_or(f64::MIN)
|
||||
.total_cmp(&right.from.unwrap_or(f64::MIN))
|
||||
});
|
||||
|
||||
let is_keyed = req
|
||||
.as_range()
|
||||
.expect("unexpected aggregation, expected range aggregation")
|
||||
.keyed;
|
||||
let buckets = if is_keyed {
|
||||
let mut bucket_map =
|
||||
FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
|
||||
for bucket in buckets {
|
||||
bucket_map.insert(bucket.key.to_string(), bucket);
|
||||
}
|
||||
BucketEntries::HashMap(bucket_map)
|
||||
} else {
|
||||
BucketEntries::Vec(buckets)
|
||||
};
|
||||
Ok(BucketResult::Range { buckets })
|
||||
}
|
||||
IntermediateBucketResult::Histogram { buckets } => {
|
||||
let buckets = intermediate_histogram_buckets_to_final_buckets(
|
||||
buckets,
|
||||
req.as_histogram()
|
||||
.expect("unexpected aggregation, expected histogram aggregation"),
|
||||
&req.sub_aggregation,
|
||||
)?;
|
||||
|
||||
let buckets = if req.as_histogram().unwrap().keyed {
|
||||
let mut bucket_map =
|
||||
FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
|
||||
for bucket in buckets {
|
||||
bucket_map.insert(bucket.key.to_string(), bucket);
|
||||
}
|
||||
BucketEntries::HashMap(bucket_map)
|
||||
} else {
|
||||
BucketEntries::Vec(buckets)
|
||||
};
|
||||
Ok(BucketResult::Histogram { buckets })
|
||||
}
|
||||
IntermediateBucketResult::Terms(terms) => terms.into_final_result(
|
||||
req.as_term()
|
||||
.expect("unexpected aggregation, expected term aggregation"),
|
||||
&req.sub_aggregation,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn empty_from_req(req: &BucketAggregationType) -> Self {
|
||||
match req {
|
||||
BucketAggregationType::Terms(_) => IntermediateBucketResult::Terms(Default::default()),
|
||||
BucketAggregationType::Range(_) => IntermediateBucketResult::Range(Default::default()),
|
||||
BucketAggregationType::Histogram(_) => {
|
||||
IntermediateBucketResult::Histogram { buckets: vec![] }
|
||||
}
|
||||
}
|
||||
}
|
||||
fn merge_fruits(&mut self, other: IntermediateBucketResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateBucketResult::Terms(term_res_left),
|
||||
IntermediateBucketResult::Terms(term_res_right),
|
||||
) => {
|
||||
merge_maps(&mut term_res_left.entries, term_res_right.entries);
|
||||
term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count;
|
||||
term_res_left.doc_count_error_upper_bound +=
|
||||
term_res_right.doc_count_error_upper_bound;
|
||||
}
|
||||
|
||||
(
|
||||
IntermediateBucketResult::Range(range_res_left),
|
||||
IntermediateBucketResult::Range(range_res_right),
|
||||
) => {
|
||||
merge_maps(&mut range_res_left.buckets, range_res_right.buckets);
|
||||
}
|
||||
(
|
||||
IntermediateBucketResult::Histogram {
|
||||
buckets: buckets_left,
|
||||
..
|
||||
},
|
||||
IntermediateBucketResult::Histogram {
|
||||
buckets: buckets_right,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
let buckets = buckets_left
|
||||
.drain(..)
|
||||
.merge_join_by(buckets_right.into_iter(), |left, right| {
|
||||
left.key.partial_cmp(&right.key).unwrap_or(Ordering::Equal)
|
||||
})
|
||||
.map(|either| match either {
|
||||
itertools::EitherOrBoth::Both(mut left, right) => {
|
||||
left.merge_fruits(right);
|
||||
left
|
||||
}
|
||||
itertools::EitherOrBoth::Left(left) => left,
|
||||
itertools::EitherOrBoth::Right(right) => right,
|
||||
})
|
||||
.collect();
|
||||
|
||||
*buckets_left = buckets;
|
||||
}
|
||||
(IntermediateBucketResult::Range(_), _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
(IntermediateBucketResult::Histogram { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
(IntermediateBucketResult::Terms { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// Range aggregation including error counts
|
||||
pub struct IntermediateRangeBucketResult {
|
||||
pub(crate) buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// Term aggregation including error counts
|
||||
pub struct IntermediateTermBucketResult {
|
||||
pub(crate) entries: FnvHashMap<String, IntermediateTermBucketEntry>,
|
||||
pub(crate) sum_other_doc_count: u64,
|
||||
pub(crate) doc_count_error_upper_bound: u64,
|
||||
}
|
||||
|
||||
impl IntermediateTermBucketResult {
|
||||
pub(crate) fn into_final_result(
|
||||
self,
|
||||
req: &TermsAggregation,
|
||||
sub_aggregation_req: &AggregationsInternal,
|
||||
) -> crate::Result<BucketResult> {
|
||||
let req = TermsAggregationInternal::from_req(req);
|
||||
let mut buckets: Vec<BucketEntry> = self
|
||||
.entries
|
||||
.into_iter()
|
||||
.filter(|bucket| bucket.1.doc_count >= req.min_doc_count)
|
||||
.map(|(key, entry)| {
|
||||
Ok(BucketEntry {
|
||||
key: Key::Str(key),
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: entry
|
||||
.sub_aggregation
|
||||
.into_final_bucket_result_internal(sub_aggregation_req)?,
|
||||
})
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
|
||||
let order = req.order.order;
|
||||
match req.order.target {
|
||||
OrderTarget::Key => {
|
||||
buckets.sort_by(|left, right| {
|
||||
if req.order.order == Order::Desc {
|
||||
left.key.partial_cmp(&right.key)
|
||||
} else {
|
||||
right.key.partial_cmp(&left.key)
|
||||
}
|
||||
.expect("expected type string, which is always sortable")
|
||||
});
|
||||
}
|
||||
OrderTarget::Count => {
|
||||
if req.order.order == Order::Desc {
|
||||
buckets.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.doc_count()));
|
||||
} else {
|
||||
buckets.sort_unstable_by_key(|bucket| bucket.doc_count());
|
||||
}
|
||||
}
|
||||
OrderTarget::SubAggregation(name) => {
|
||||
let (agg_name, agg_property) = get_agg_name_and_property(&name);
|
||||
let mut buckets_with_val = buckets
|
||||
.into_iter()
|
||||
.map(|bucket| {
|
||||
let val = bucket
|
||||
.sub_aggregation
|
||||
.get_value_from_aggregation(agg_name, agg_property)?
|
||||
.unwrap_or(f64::NAN);
|
||||
Ok((bucket, val))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
buckets_with_val.sort_by(|(_, val1), (_, val2)| match &order {
|
||||
Order::Desc => val2.total_cmp(val1),
|
||||
Order::Asc => val1.total_cmp(val2),
|
||||
});
|
||||
buckets = buckets_with_val
|
||||
.into_iter()
|
||||
.map(|(bucket, _val)| bucket)
|
||||
.collect_vec();
|
||||
}
|
||||
}
|
||||
|
||||
// We ignore _term_doc_count_before_cutoff here, because it increases the upperbound error
|
||||
// only for terms that didn't make it into the top N.
|
||||
//
|
||||
// This can be interesting, as a value of quality of the results, but not good to check the
|
||||
// actual error count for the returned terms.
|
||||
let (_term_doc_count_before_cutoff, sum_other_doc_count) =
|
||||
cut_off_buckets(&mut buckets, req.size as usize);
|
||||
|
||||
let doc_count_error_upper_bound = if req.show_term_doc_count_error {
|
||||
Some(self.doc_count_error_upper_bound)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(BucketResult::Terms {
|
||||
buckets,
|
||||
sum_other_doc_count: self.sum_other_doc_count + sum_other_doc_count,
|
||||
doc_count_error_upper_bound,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
trait MergeFruits {
|
||||
fn merge_fruits(&mut self, other: Self);
|
||||
}
|
||||
|
||||
fn merge_maps<V: MergeFruits + Clone>(
|
||||
entries_left: &mut FnvHashMap<SerializedKey, V>,
|
||||
mut entries_right: FnvHashMap<SerializedKey, V>,
|
||||
) {
|
||||
for (name, entry_left) in entries_left.iter_mut() {
|
||||
if let Some(entry_right) = entries_right.remove(name) {
|
||||
entry_left.merge_fruits(entry_right);
|
||||
}
|
||||
}
|
||||
|
||||
for (key, res) in entries_right.into_iter() {
|
||||
entries_left.entry(key).or_insert(res);
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateHistogramBucketEntry {
|
||||
/// The unique the bucket is identified.
|
||||
pub key: f64,
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
}
|
||||
|
||||
impl IntermediateHistogramBucketEntry {
|
||||
pub(crate) fn into_final_bucket_entry(
|
||||
self,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<BucketEntry> {
|
||||
Ok(BucketEntry {
|
||||
key: Key::F64(self.key),
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation: self
|
||||
.sub_aggregation
|
||||
.into_final_bucket_result_internal(req)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SegmentHistogramBucketEntry> for IntermediateHistogramBucketEntry {
|
||||
fn from(entry: SegmentHistogramBucketEntry) -> Self {
|
||||
IntermediateHistogramBucketEntry {
|
||||
key: entry.key,
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the range entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateRangeBucketEntry {
|
||||
/// The unique the bucket is identified.
|
||||
pub key: Key,
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl IntermediateRangeBucketEntry {
|
||||
pub(crate) fn into_final_bucket_entry(
|
||||
self,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<RangeBucketEntry> {
|
||||
Ok(RangeBucketEntry {
|
||||
key: self.key,
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation: self
|
||||
.sub_aggregation
|
||||
.into_final_bucket_result_internal(req)?,
|
||||
to: self.to,
|
||||
from: self.from,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the term entry for a bucket, which contains a count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateTermBucketEntry {
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
}
|
||||
|
||||
impl MergeFruits for IntermediateTermBucketEntry {
|
||||
fn merge_fruits(&mut self, other: IntermediateTermBucketEntry) {
|
||||
self.doc_count += other.doc_count;
|
||||
self.sub_aggregation.merge_fruits(other.sub_aggregation);
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeFruits for IntermediateRangeBucketEntry {
|
||||
fn merge_fruits(&mut self, other: IntermediateRangeBucketEntry) {
|
||||
self.doc_count += other.doc_count;
|
||||
self.sub_aggregation.merge_fruits(other.sub_aggregation);
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeFruits for IntermediateHistogramBucketEntry {
|
||||
fn merge_fruits(&mut self, other: IntermediateHistogramBucketEntry) {
|
||||
self.doc_count += other.doc_count;
|
||||
self.sub_aggregation.merge_fruits(other.sub_aggregation);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
fn get_sub_test_tree(data: &[(String, u64)]) -> IntermediateAggregationResults {
|
||||
let mut map = HashMap::new();
|
||||
let mut buckets = FnvHashMap::default();
|
||||
for (key, doc_count) in data {
|
||||
buckets.insert(
|
||||
key.to_string(),
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
sub_aggregation: Default::default(),
|
||||
from: None,
|
||||
to: None,
|
||||
},
|
||||
);
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level2".to_string(),
|
||||
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
|
||||
);
|
||||
IntermediateAggregationResults {
|
||||
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
|
||||
metrics: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_intermediat_tree_with_ranges(
|
||||
data: &[(String, u64, String, u64)],
|
||||
) -> IntermediateAggregationResults {
|
||||
let mut map = HashMap::new();
|
||||
let mut buckets: FnvHashMap<_, _> = Default::default();
|
||||
for (key, doc_count, sub_aggregation_key, sub_aggregation_count) in data {
|
||||
buckets.insert(
|
||||
key.to_string(),
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
from: None,
|
||||
to: None,
|
||||
sub_aggregation: get_sub_test_tree(&[(
|
||||
sub_aggregation_key.to_string(),
|
||||
*sub_aggregation_count,
|
||||
)]),
|
||||
},
|
||||
);
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level1".to_string(),
|
||||
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
|
||||
);
|
||||
IntermediateAggregationResults {
|
||||
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
|
||||
metrics: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_1() {
|
||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
let tree_right = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 60, "1900".to_string(), 30),
|
||||
("blue".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
tree_left.merge_fruits(tree_right);
|
||||
|
||||
let tree_expected = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 110, "1900".to_string(), 55),
|
||||
("blue".to_string(), 55, "1900".to_string(), 80),
|
||||
]);
|
||||
|
||||
assert_eq!(tree_left, tree_expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_2() {
|
||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
let tree_right = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 60, "1900".to_string(), 30),
|
||||
("green".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
tree_left.merge_fruits(tree_right);
|
||||
|
||||
let tree_expected = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 110, "1900".to_string(), 55),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
("green".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
assert_eq!(tree_left, tree_expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_empty() {
|
||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
|
||||
let orig = tree_left.clone();
|
||||
|
||||
tree_left.merge_fruits(IntermediateAggregationResults::default());
|
||||
|
||||
assert_eq!(tree_left, orig);
|
||||
}
|
||||
}
|
||||
115
src/aggregation/metric/average.rs
Normal file
@@ -0,0 +1,115 @@
|
||||
use std::fmt::Debug;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::schema::Type;
|
||||
use crate::DocId;
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// A single-value metric aggregation that computes the average of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [super::SingleMetricResult] for return value.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "avg": {
|
||||
/// "field": "score",
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
pub struct AverageAggregation {
|
||||
/// The field name to compute the stats on.
|
||||
pub field: String,
|
||||
}
|
||||
impl AverageAggregation {
|
||||
/// Create new AverageAggregation from a field.
|
||||
pub fn from_field_name(field_name: String) -> Self {
|
||||
AverageAggregation { field: field_name }
|
||||
}
|
||||
/// Return the field name.
|
||||
pub fn field_name(&self) -> &str {
|
||||
&self.field
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentAverageCollector {
|
||||
pub data: IntermediateAverage,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
impl Debug for SegmentAverageCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("AverageCollector")
|
||||
.field("data", &self.data)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentAverageCollector {
|
||||
pub fn from_req(field_type: Type) -> Self {
|
||||
Self {
|
||||
field_type,
|
||||
data: Default::default(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = field.get_val(docs[0] as u64);
|
||||
let val2 = field.get_val(docs[1] as u64);
|
||||
let val3 = field.get_val(docs[2] as u64);
|
||||
let val4 = field.get_val(docs[3] as u64);
|
||||
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
|
||||
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
|
||||
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
|
||||
let val4 = f64_from_fastfield_u64(val4, &self.field_type);
|
||||
self.data.collect(val1);
|
||||
self.data.collect(val2);
|
||||
self.data.collect(val3);
|
||||
self.data.collect(val4);
|
||||
}
|
||||
for &doc in iter.remainder() {
|
||||
let val = field.get_val(doc as u64);
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.data.collect(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Contains mergeable version of average data.
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateAverage {
|
||||
pub(crate) sum: f64,
|
||||
pub(crate) doc_count: u64,
|
||||
}
|
||||
|
||||
impl IntermediateAverage {
|
||||
pub(crate) fn from_collector(collector: SegmentAverageCollector) -> Self {
|
||||
collector.data
|
||||
}
|
||||
|
||||
/// Merge average data into this instance.
|
||||
pub fn merge_fruits(&mut self, other: IntermediateAverage) {
|
||||
self.sum += other.sum;
|
||||
self.doc_count += other.doc_count;
|
||||
}
|
||||
/// compute final result
|
||||
pub fn finalize(&self) -> Option<f64> {
|
||||
if self.doc_count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(self.sum / self.doc_count as f64)
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
fn collect(&mut self, val: f64) {
|
||||
self.doc_count += 1;
|
||||
self.sum += val;
|
||||
}
|
||||
}
|
||||
30
src/aggregation/metric/mod.rs
Normal file
@@ -0,0 +1,30 @@
|
||||
//! Module for all metric aggregations.
|
||||
//!
|
||||
//! The aggregations in this family compute metrics, see [super::agg_req::MetricAggregation] for
|
||||
//! details.
|
||||
mod average;
|
||||
mod stats;
|
||||
pub use average::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
pub use stats::*;
|
||||
|
||||
/// Single-metric aggregations use this common result structure.
|
||||
///
|
||||
/// Main reason to wrap it in value is to match elasticsearch output structure.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SingleMetricResult {
|
||||
/// The value of the single value metric.
|
||||
pub value: Option<f64>,
|
||||
}
|
||||
|
||||
impl From<f64> for SingleMetricResult {
|
||||
fn from(value: f64) -> Self {
|
||||
Self { value: Some(value) }
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Option<f64>> for SingleMetricResult {
|
||||
fn from(value: Option<f64>) -> Self {
|
||||
Self { value }
|
||||
}
|
||||
}
|
||||
372
src/aggregation/metric/stats.rs
Normal file
@@ -0,0 +1,372 @@
|
||||
use fastfield_codecs::Column;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
/// A multi-value metric aggregation that computes stats of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
/// Supported field types are u64, i64, and f64.
|
||||
/// See [Stats] for returned statistics.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "stats": {
|
||||
/// "field": "score",
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct StatsAggregation {
|
||||
/// The field name to compute the stats on.
|
||||
pub field: String,
|
||||
}
|
||||
|
||||
impl StatsAggregation {
|
||||
/// Create new StatsAggregation from a field.
|
||||
pub fn from_field_name(field_name: String) -> Self {
|
||||
StatsAggregation { field: field_name }
|
||||
}
|
||||
/// Return the field name.
|
||||
pub fn field_name(&self) -> &str {
|
||||
&self.field
|
||||
}
|
||||
}
|
||||
|
||||
/// Stats contains a collection of statistics.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Stats {
|
||||
/// The number of documents.
|
||||
pub count: usize,
|
||||
/// The sum of the fast field values.
|
||||
pub sum: f64,
|
||||
/// The standard deviation of the fast field values. None for count == 0.
|
||||
pub standard_deviation: Option<f64>,
|
||||
/// The min value of the fast field values.
|
||||
pub min: Option<f64>,
|
||||
/// The max value of the fast field values.
|
||||
pub max: Option<f64>,
|
||||
/// The average of the values. None for count == 0.
|
||||
pub avg: Option<f64>,
|
||||
}
|
||||
|
||||
impl Stats {
|
||||
pub(crate) fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
|
||||
match agg_property {
|
||||
"count" => Ok(Some(self.count as f64)),
|
||||
"sum" => Ok(Some(self.sum)),
|
||||
"standard_deviation" => Ok(self.standard_deviation),
|
||||
"min" => Ok(self.min),
|
||||
"max" => Ok(self.max),
|
||||
"avg" => Ok(self.avg),
|
||||
_ => Err(TantivyError::InvalidArgument(format!(
|
||||
"unknown property {} on stats metric aggregation",
|
||||
agg_property
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// IntermediateStats contains the mergeable version for stats.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateStats {
|
||||
count: usize,
|
||||
sum: f64,
|
||||
squared_sum: f64,
|
||||
min: f64,
|
||||
max: f64,
|
||||
}
|
||||
impl Default for IntermediateStats {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
count: 0,
|
||||
sum: 0.0,
|
||||
squared_sum: 0.0,
|
||||
min: f64::MAX,
|
||||
max: f64::MIN,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateStats {
|
||||
pub(crate) fn avg(&self) -> Option<f64> {
|
||||
if self.count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(self.sum / (self.count as f64))
|
||||
}
|
||||
}
|
||||
|
||||
fn square_mean(&self) -> f64 {
|
||||
self.squared_sum / (self.count as f64)
|
||||
}
|
||||
|
||||
pub(crate) fn standard_deviation(&self) -> Option<f64> {
|
||||
self.avg()
|
||||
.map(|average| (self.square_mean() - average * average).sqrt())
|
||||
}
|
||||
|
||||
/// Merge data from other stats into this instance.
|
||||
pub fn merge_fruits(&mut self, other: IntermediateStats) {
|
||||
self.count += other.count;
|
||||
self.sum += other.sum;
|
||||
self.squared_sum += other.squared_sum;
|
||||
self.min = self.min.min(other.min);
|
||||
self.max = self.max.max(other.max);
|
||||
}
|
||||
|
||||
/// compute final resultimprove_docs
|
||||
pub fn finalize(&self) -> Stats {
|
||||
let min = if self.count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(self.min)
|
||||
};
|
||||
let max = if self.count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(self.max)
|
||||
};
|
||||
Stats {
|
||||
count: self.count,
|
||||
sum: self.sum,
|
||||
standard_deviation: self.standard_deviation(),
|
||||
min,
|
||||
max,
|
||||
avg: self.avg(),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn collect(&mut self, value: f64) {
|
||||
self.count += 1;
|
||||
self.sum += value;
|
||||
self.squared_sum += value * value;
|
||||
self.min = self.min.min(value);
|
||||
self.max = self.max.max(value);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) struct SegmentStatsCollector {
|
||||
pub(crate) stats: IntermediateStats,
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
impl SegmentStatsCollector {
|
||||
pub fn from_req(field_type: Type) -> Self {
|
||||
Self {
|
||||
field_type,
|
||||
stats: IntermediateStats::default(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader<u64>) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = field.get_val(docs[0] as u64);
|
||||
let val2 = field.get_val(docs[1] as u64);
|
||||
let val3 = field.get_val(docs[2] as u64);
|
||||
let val4 = field.get_val(docs[3] as u64);
|
||||
let val1 = f64_from_fastfield_u64(val1, &self.field_type);
|
||||
let val2 = f64_from_fastfield_u64(val2, &self.field_type);
|
||||
let val3 = f64_from_fastfield_u64(val3, &self.field_type);
|
||||
let val4 = f64_from_fastfield_u64(val4, &self.field_type);
|
||||
self.stats.collect(val1);
|
||||
self.stats.collect(val2);
|
||||
self.stats.collect(val3);
|
||||
self.stats.collect(val4);
|
||||
}
|
||||
for &doc in iter.remainder() {
|
||||
let val = field.get_val(doc as u64);
|
||||
let val = f64_from_fastfield_u64(val, &self.field_type);
|
||||
self.stats.collect(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::iter;
|
||||
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::aggregation::agg_req::{
|
||||
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
|
||||
RangeAggregation,
|
||||
};
|
||||
use crate::aggregation::agg_result::AggregationResults;
|
||||
use crate::aggregation::metric::StatsAggregation;
|
||||
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values};
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::Term;
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_stats_empty_index() -> crate::Result<()> {
|
||||
// test index without segments
|
||||
let values = vec![];
|
||||
|
||||
let index = get_test_index_from_values(false, &values)?;
|
||||
|
||||
let agg_req_1: Aggregations = vec![(
|
||||
"stats".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score".to_string(),
|
||||
))),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
assert_eq!(
|
||||
res["stats"],
|
||||
json!({
|
||||
"avg": Value::Null,
|
||||
"count": 0,
|
||||
"max": Value::Null,
|
||||
"min": Value::Null,
|
||||
"standard_deviation": Value::Null,
|
||||
"sum": 0.0
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_stats() -> crate::Result<()> {
|
||||
let index = get_test_index_2_segments(false)?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let agg_req_1: Aggregations = vec![
|
||||
(
|
||||
"stats_i64".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score_i64".to_string(),
|
||||
))),
|
||||
),
|
||||
(
|
||||
"stats_f64".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score_f64".to_string(),
|
||||
))),
|
||||
),
|
||||
(
|
||||
"stats".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
|
||||
"score".to_string(),
|
||||
))),
|
||||
),
|
||||
(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "score".to_string(),
|
||||
ranges: vec![
|
||||
(3f64..7f64).into(),
|
||||
(7f64..19f64).into(),
|
||||
(19f64..20f64).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: iter::once((
|
||||
"stats".to_string(),
|
||||
Aggregation::Metric(MetricAggregation::Stats(
|
||||
StatsAggregation::from_field_name("score".to_string()),
|
||||
)),
|
||||
))
|
||||
.collect(),
|
||||
}),
|
||||
),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
assert_eq!(
|
||||
res["stats"],
|
||||
json!({
|
||||
"avg": 12.142857142857142,
|
||||
"count": 7,
|
||||
"max": 44.0,
|
||||
"min": 1.0,
|
||||
"standard_deviation": 13.65313748796613,
|
||||
"sum": 85.0
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["stats_i64"],
|
||||
json!({
|
||||
"avg": 12.142857142857142,
|
||||
"count": 7,
|
||||
"max": 44.0,
|
||||
"min": 1.0,
|
||||
"standard_deviation": 13.65313748796613,
|
||||
"sum": 85.0
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["stats_f64"],
|
||||
json!({
|
||||
"avg": 12.214285714285714,
|
||||
"count": 7,
|
||||
"max": 44.5,
|
||||
"min": 1.0,
|
||||
"standard_deviation": 13.819905785437443,
|
||||
"sum": 85.5
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["range"]["buckets"][2]["stats"],
|
||||
json!({
|
||||
"avg": 10.666666666666666,
|
||||
"count": 3,
|
||||
"max": 14.0,
|
||||
"min": 7.0,
|
||||
"standard_deviation": 2.867441755680877,
|
||||
"sum": 32.0
|
||||
})
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
res["range"]["buckets"][3]["stats"],
|
||||
json!({
|
||||
"avg": serde_json::Value::Null,
|
||||
"count": 0,
|
||||
"max": serde_json::Value::Null,
|
||||
"min": serde_json::Value::Null,
|
||||
"standard_deviation": serde_json::Value::Null,
|
||||
"sum": 0.0,
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
1605
src/aggregation/mod.rs
Normal file
313
src/aggregation/segment_agg_result.rs
Normal file
@@ -0,0 +1,313 @@
|
||||
//! Contains aggregation trees which is used during collection in a segment.
|
||||
//! This tree contains datastructrues optimized for fast collection.
|
||||
//! The tree can be converted to an intermediate tree, which contains datastructrues optimized for
|
||||
//! merging.
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::rc::Rc;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
|
||||
use super::agg_req::MetricAggregation;
|
||||
use super::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor, MetricAggregationWithAccessor,
|
||||
};
|
||||
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
|
||||
use super::collector::MAX_BUCKET_COUNT;
|
||||
use super::intermediate_agg_result::{IntermediateAggregationResults, IntermediateBucketResult};
|
||||
use super::metric::{
|
||||
AverageAggregation, SegmentAverageCollector, SegmentStatsCollector, StatsAggregation,
|
||||
};
|
||||
use super::VecWithNames;
|
||||
use crate::aggregation::agg_req::BucketAggregationType;
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
pub(crate) const DOC_BLOCK_SIZE: usize = 64;
|
||||
pub(crate) type DocBlock = [DocId; DOC_BLOCK_SIZE];
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentAggregationResultsCollector {
|
||||
pub(crate) metrics: Option<VecWithNames<SegmentMetricResultCollector>>,
|
||||
pub(crate) buckets: Option<VecWithNames<SegmentBucketResultCollector>>,
|
||||
staged_docs: DocBlock,
|
||||
num_staged_docs: usize,
|
||||
}
|
||||
|
||||
impl Default for SegmentAggregationResultsCollector {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
metrics: Default::default(),
|
||||
buckets: Default::default(),
|
||||
staged_docs: [0; DOC_BLOCK_SIZE],
|
||||
num_staged_docs: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for SegmentAggregationResultsCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentAggregationResultsCollector")
|
||||
.field("metrics", &self.metrics)
|
||||
.field("buckets", &self.buckets)
|
||||
.field("staged_docs", &&self.staged_docs[..self.num_staged_docs])
|
||||
.field("num_staged_docs", &self.num_staged_docs)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentAggregationResultsCollector {
|
||||
pub fn into_intermediate_aggregations_result(
|
||||
self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<IntermediateAggregationResults> {
|
||||
let buckets = if let Some(buckets) = self.buckets {
|
||||
let entries = buckets
|
||||
.into_iter()
|
||||
.zip(agg_with_accessor.buckets.values())
|
||||
.map(|((key, bucket), acc)| Ok((key, bucket.into_intermediate_bucket_result(acc)?)))
|
||||
.collect::<crate::Result<Vec<(String, _)>>>()?;
|
||||
Some(VecWithNames::from_entries(entries))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let metrics = self.metrics.map(VecWithNames::from_other);
|
||||
|
||||
Ok(IntermediateAggregationResults { metrics, buckets })
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(req: &AggregationsWithAccessor) -> crate::Result<Self> {
|
||||
let buckets = req
|
||||
.buckets
|
||||
.entries()
|
||||
.map(|(key, req)| {
|
||||
Ok((
|
||||
key.to_string(),
|
||||
SegmentBucketResultCollector::from_req_and_validate(req)?,
|
||||
))
|
||||
})
|
||||
.collect::<crate::Result<Vec<(String, _)>>>()?;
|
||||
let metrics = req
|
||||
.metrics
|
||||
.entries()
|
||||
.map(|(key, req)| {
|
||||
Ok((
|
||||
key.to_string(),
|
||||
SegmentMetricResultCollector::from_req_and_validate(req)?,
|
||||
))
|
||||
})
|
||||
.collect::<crate::Result<Vec<(String, _)>>>()?;
|
||||
let metrics = if metrics.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(VecWithNames::from_entries(metrics))
|
||||
};
|
||||
let buckets = if buckets.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(VecWithNames::from_entries(buckets))
|
||||
};
|
||||
Ok(SegmentAggregationResultsCollector {
|
||||
metrics,
|
||||
buckets,
|
||||
staged_docs: [0; DOC_BLOCK_SIZE],
|
||||
num_staged_docs: 0,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn collect(
|
||||
&mut self,
|
||||
doc: crate::DocId,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<()> {
|
||||
self.staged_docs[self.num_staged_docs] = doc;
|
||||
self.num_staged_docs += 1;
|
||||
if self.num_staged_docs == self.staged_docs.len() {
|
||||
self.flush_staged_docs(agg_with_accessor, false)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn flush_staged_docs(
|
||||
&mut self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
force_flush: bool,
|
||||
) -> crate::Result<()> {
|
||||
if self.num_staged_docs == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
if let Some(metrics) = &mut self.metrics {
|
||||
for (collector, agg_with_accessor) in
|
||||
metrics.values_mut().zip(agg_with_accessor.metrics.values())
|
||||
{
|
||||
collector
|
||||
.collect_block(&self.staged_docs[..self.num_staged_docs], agg_with_accessor);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(buckets) = &mut self.buckets {
|
||||
for (collector, agg_with_accessor) in
|
||||
buckets.values_mut().zip(agg_with_accessor.buckets.values())
|
||||
{
|
||||
collector.collect_block(
|
||||
&self.staged_docs[..self.num_staged_docs],
|
||||
agg_with_accessor,
|
||||
force_flush,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
self.num_staged_docs = 0;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) enum SegmentMetricResultCollector {
|
||||
Average(SegmentAverageCollector),
|
||||
Stats(SegmentStatsCollector),
|
||||
}
|
||||
|
||||
impl SegmentMetricResultCollector {
|
||||
pub fn from_req_and_validate(req: &MetricAggregationWithAccessor) -> crate::Result<Self> {
|
||||
match &req.metric {
|
||||
MetricAggregation::Average(AverageAggregation { field: _ }) => {
|
||||
Ok(SegmentMetricResultCollector::Average(
|
||||
SegmentAverageCollector::from_req(req.field_type),
|
||||
))
|
||||
}
|
||||
MetricAggregation::Stats(StatsAggregation { field: _ }) => {
|
||||
Ok(SegmentMetricResultCollector::Stats(
|
||||
SegmentStatsCollector::from_req(req.field_type),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
pub(crate) fn collect_block(&mut self, doc: &[DocId], metric: &MetricAggregationWithAccessor) {
|
||||
match self {
|
||||
SegmentMetricResultCollector::Average(avg_collector) => {
|
||||
avg_collector.collect_block(doc, &metric.accessor);
|
||||
}
|
||||
SegmentMetricResultCollector::Stats(stats_collector) => {
|
||||
stats_collector.collect_block(doc, &metric.accessor);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// SegmentBucketAggregationResultCollectors will have specialized buckets for collection inside
|
||||
/// segments.
|
||||
/// The typical structure of Map<Key, Bucket> is not suitable during collection for performance
|
||||
/// reasons.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) enum SegmentBucketResultCollector {
|
||||
Range(SegmentRangeCollector),
|
||||
Histogram(Box<SegmentHistogramCollector>),
|
||||
Terms(Box<SegmentTermCollector>),
|
||||
}
|
||||
|
||||
impl SegmentBucketResultCollector {
|
||||
pub fn into_intermediate_bucket_result(
|
||||
self,
|
||||
agg_with_accessor: &BucketAggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
match self {
|
||||
SegmentBucketResultCollector::Terms(terms) => {
|
||||
terms.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
SegmentBucketResultCollector::Range(range) => {
|
||||
range.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
SegmentBucketResultCollector::Histogram(histogram) => {
|
||||
histogram.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_req_and_validate(req: &BucketAggregationWithAccessor) -> crate::Result<Self> {
|
||||
match &req.bucket_agg {
|
||||
BucketAggregationType::Terms(terms_req) => Ok(Self::Terms(Box::new(
|
||||
SegmentTermCollector::from_req_and_validate(
|
||||
terms_req,
|
||||
&req.sub_aggregation,
|
||||
req.field_type,
|
||||
req.accessor
|
||||
.as_multi()
|
||||
.expect("unexpected fast field cardinality"),
|
||||
)?,
|
||||
))),
|
||||
BucketAggregationType::Range(range_req) => {
|
||||
Ok(Self::Range(SegmentRangeCollector::from_req_and_validate(
|
||||
range_req,
|
||||
&req.sub_aggregation,
|
||||
&req.bucket_count,
|
||||
req.field_type,
|
||||
)?))
|
||||
}
|
||||
BucketAggregationType::Histogram(histogram) => Ok(Self::Histogram(Box::new(
|
||||
SegmentHistogramCollector::from_req_and_validate(
|
||||
histogram,
|
||||
&req.sub_aggregation,
|
||||
req.field_type,
|
||||
req.accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinality"),
|
||||
)?,
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn collect_block(
|
||||
&mut self,
|
||||
doc: &[DocId],
|
||||
bucket_with_accessor: &BucketAggregationWithAccessor,
|
||||
force_flush: bool,
|
||||
) -> crate::Result<()> {
|
||||
match self {
|
||||
SegmentBucketResultCollector::Range(range) => {
|
||||
range.collect_block(doc, bucket_with_accessor, force_flush)?;
|
||||
}
|
||||
SegmentBucketResultCollector::Histogram(histogram) => {
|
||||
histogram.collect_block(doc, bucket_with_accessor, force_flush)?;
|
||||
}
|
||||
SegmentBucketResultCollector::Terms(terms) => {
|
||||
terms.collect_block(doc, bucket_with_accessor, force_flush)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct BucketCount {
|
||||
/// The counter which is shared between the aggregations for one request.
|
||||
pub(crate) bucket_count: Rc<AtomicU32>,
|
||||
pub(crate) max_bucket_count: u32,
|
||||
}
|
||||
|
||||
impl Default for BucketCount {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
bucket_count: Default::default(),
|
||||
max_bucket_count: MAX_BUCKET_COUNT,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BucketCount {
|
||||
pub(crate) fn validate_bucket_count(&self) -> crate::Result<()> {
|
||||
if self.get_count() > self.max_bucket_count {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"Aborting aggregation because too many buckets were created".to_string(),
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
pub(crate) fn add_count(&self, count: u32) {
|
||||
self.bucket_count
|
||||
.fetch_add(count as u32, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
pub(crate) fn get_count(&self) -> u32 {
|
||||
self.bucket_count.load(std::sync::atomic::Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,6 @@
|
||||
use super::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
|
||||
|
||||
/// `CountCollector` collector only counts how many
|
||||
/// documents match the query.
|
||||
@@ -20,10 +17,10 @@ use crate::SegmentReader;
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
/// let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl")).unwrap();
|
||||
/// assert!(index_writer.commit().is_ok());
|
||||
///
|
||||
/// let reader = index.reader().unwrap();
|
||||
@@ -80,8 +77,7 @@ impl SegmentCollector for SegmentCountCollector {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{Count, SegmentCountCollector};
|
||||
use crate::collector::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
|
||||
#[test]
|
||||
fn test_count_collect_does_not_requires_scoring() {
|
||||
|
||||
@@ -8,8 +8,7 @@ pub(crate) struct CustomScoreTopCollector<TCustomScorer, TScore = Score> {
|
||||
}
|
||||
|
||||
impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore>
|
||||
where
|
||||
TScore: Clone + PartialOrd,
|
||||
where TScore: Clone + PartialOrd
|
||||
{
|
||||
pub(crate) fn new(
|
||||
custom_scorer: TCustomScorer,
|
||||
@@ -114,8 +113,7 @@ where
|
||||
}
|
||||
|
||||
impl<F, TScore> CustomSegmentScorer<TScore> for F
|
||||
where
|
||||
F: 'static + FnMut(DocId) -> TScore,
|
||||
where F: 'static + FnMut(DocId) -> TScore
|
||||
{
|
||||
fn score(&mut self, doc: DocId) -> TScore {
|
||||
(self)(doc)
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use crate::{DocAddress, DocId, Score};
|
||||
|
||||
use super::{Collector, SegmentCollector};
|
||||
use crate::{DocAddress, DocId, Score};
|
||||
|
||||
/// Collectors that returns the set of DocAddress that matches the query.
|
||||
///
|
||||
|
||||
@@ -1,21 +1,14 @@
|
||||
use crate::collector::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::Field;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::btree_map;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::{btree_map, BTreeMap, BTreeSet, BinaryHeap};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Bound;
|
||||
use std::{u64, usize};
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::schema::{Facet, Field};
|
||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
facet: &'a Facet,
|
||||
@@ -83,7 +76,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// ```rust
|
||||
/// use tantivy::collector::FacetCollector;
|
||||
/// use tantivy::query::AllQuery;
|
||||
/// use tantivy::schema::{Facet, Schema, INDEXED, TEXT};
|
||||
/// use tantivy::schema::{Facet, Schema, FacetOptions, TEXT};
|
||||
/// use tantivy::{doc, Index};
|
||||
///
|
||||
/// fn example() -> tantivy::Result<()> {
|
||||
@@ -92,7 +85,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// // Facet have their own specific type.
|
||||
/// // It is not a bad practise to put all of your
|
||||
/// // facet information in the same field.
|
||||
/// let facet = schema_builder.add_facet_field("facet", INDEXED);
|
||||
/// let facet = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
@@ -103,23 +96,23 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// title => "The Name of the Wind",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/fiction/fantasy")
|
||||
/// ));
|
||||
/// ))?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "Dune",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/fiction/sci-fi")
|
||||
/// ));
|
||||
/// ))?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "La Vénus d'Ille",
|
||||
/// facet => Facet::from("/lang/fr"),
|
||||
/// facet => Facet::from("/category/fiction/fantasy"),
|
||||
/// facet => Facet::from("/category/fiction/horror")
|
||||
/// ));
|
||||
/// ))?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/biography")
|
||||
/// ));
|
||||
/// ))?;
|
||||
/// index_writer.commit()?;
|
||||
/// }
|
||||
/// let reader = index.reader()?;
|
||||
@@ -240,9 +233,7 @@ impl FacetCollector {
|
||||
/// If you need the correct number of unique documents for two such facets,
|
||||
/// just add them in separate `FacetCollector`.
|
||||
pub fn add_facet<T>(&mut self, facet_from: T)
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
where Facet: From<T> {
|
||||
let facet = Facet::from(facet_from);
|
||||
for old_facet in &self.facets {
|
||||
assert!(
|
||||
@@ -280,8 +271,8 @@ impl Collector for FacetCollector {
|
||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream()?;
|
||||
if facet_streamer.advance() {
|
||||
'outer: loop {
|
||||
// at the begining of this loop, facet_streamer
|
||||
// is positionned on a term that has not been processed yet.
|
||||
// at the beginning of this loop, facet_streamer
|
||||
// is positioned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Found => {
|
||||
@@ -400,11 +391,9 @@ impl<'a> Iterator for FacetChildIterator<'a> {
|
||||
|
||||
impl FacetCounts {
|
||||
/// Returns an iterator over all of the facet count pairs inside this result.
|
||||
/// See the documentation for `FacetCollector` for a usage example.
|
||||
/// See the documentation for [FacetCollector] for a usage example.
|
||||
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_>
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
where Facet: From<T> {
|
||||
let facet = Facet::from(facet_from);
|
||||
let left_bound = Bound::Excluded(facet.clone());
|
||||
let right_bound = if facet.is_root() {
|
||||
@@ -421,11 +410,9 @@ impl FacetCounts {
|
||||
}
|
||||
|
||||
/// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts.
|
||||
/// See the documentation for `FacetCollector` for a usage example.
|
||||
/// See the documentation for [FacetCollector] for a usage example.
|
||||
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
where Facet: From<T> {
|
||||
let mut heap = BinaryHeap::with_capacity(k);
|
||||
let mut it = self.get(facet);
|
||||
|
||||
@@ -458,25 +445,27 @@ impl FacetCounts {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::iter;
|
||||
|
||||
use rand::distributions::Uniform;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use crate::collector::Count;
|
||||
use crate::core::Index;
|
||||
use crate::query::{AllQuery, QueryParser, TermQuery};
|
||||
use crate::schema::{Document, Facet, Field, IndexRecordOption, Schema, INDEXED};
|
||||
use crate::schema::{Document, Facet, FacetOptions, Field, IndexRecordOption, Schema};
|
||||
use crate::Term;
|
||||
use rand::distributions::Uniform;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::{thread_rng, Rng};
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_drilldown() {
|
||||
fn test_facet_collector_drilldown() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let num_facets: usize = 3 * 4 * 5;
|
||||
let facets: Vec<Facet> = (0..num_facets)
|
||||
.map(|mut n| {
|
||||
@@ -491,14 +480,14 @@ mod tests {
|
||||
for i in 0..num_facets * 10 {
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, facets[i % num_facets].clone());
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc)?;
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet(Facet::from("/top1"));
|
||||
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
let counts = searcher.search(&AllQuery, &facet_collector)?;
|
||||
|
||||
{
|
||||
let facets: Vec<(String, u64)> = counts
|
||||
@@ -518,11 +507,13 @@ mod tests {
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet.")]
|
||||
#[should_panic(
|
||||
expected = "Tried to add a facet which is a descendant of an already added facet."
|
||||
)]
|
||||
fn test_misused_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field::from_field_id(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
@@ -530,48 +521,49 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_unsorted_multifacet() {
|
||||
fn test_doc_unsorted_multifacet() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facets", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
|
||||
facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
|
||||
facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
|
||||
facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
|
||||
));
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 1);
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet("/subjects");
|
||||
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
let counts = searcher.search(&AllQuery, &facet_collector)?;
|
||||
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
|
||||
assert_eq!(facets[0].1, 1);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_search_by_facet() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/A/A").unwrap(),
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/A/B").unwrap(),
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/A/C/A").unwrap(),
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/D/C/A").unwrap(),
|
||||
));
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
@@ -613,7 +605,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_facet_collector_topk() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
@@ -637,7 +629,7 @@ mod tests {
|
||||
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
@@ -662,7 +654,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_facet_collector_topk_tie_break() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
@@ -677,7 +669,7 @@ mod tests {
|
||||
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc)?;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
|
||||
@@ -698,13 +690,14 @@ mod tests {
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use test::Bencher;
|
||||
|
||||
use crate::collector::FacetCollector;
|
||||
use crate::query::AllQuery;
|
||||
use crate::schema::{Facet, Schema, INDEXED};
|
||||
use crate::Index;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_facet_collector(b: &mut Bencher) {
|
||||
@@ -725,7 +718,7 @@ mod bench {
|
||||
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc).unwrap();
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
@@ -11,13 +11,16 @@
|
||||
// Importing tantivy...
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastValue};
|
||||
use crate::schema::Field;
|
||||
use crate::{Score, SegmentReader, TantivyError};
|
||||
|
||||
/// The `FilterCollector` collector filters docs using a fast field value and a predicate.
|
||||
/// Only the documents for which the predicate returned "true" will be passed on to the next collector.
|
||||
/// The `FilterCollector` filters docs using a fast field value and a predicate.
|
||||
/// Only the documents for which the predicate returned "true" will be passed on to the next
|
||||
/// collector.
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::collector::{TopDocs, FilterCollector};
|
||||
@@ -25,38 +28,40 @@ use crate::{Score, SegmentReader, TantivyError};
|
||||
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
|
||||
/// use tantivy::{doc, DocAddress, Index};
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64));
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64));
|
||||
/// assert!(index_writer.commit().is_ok());
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64))?;
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64))?;
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let reader = index.reader().unwrap();
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary").unwrap();
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
|
||||
/// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
|
||||
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
|
||||
///
|
||||
/// assert_eq!(top_docs.len(), 1);
|
||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
|
||||
///
|
||||
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
|
||||
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
|
||||
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
|
||||
///
|
||||
/// assert_eq!(filtered_top_docs.len(), 0);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
|
||||
where
|
||||
TPredicate: 'static + Clone,
|
||||
where TPredicate: 'static + Clone
|
||||
{
|
||||
field: Field,
|
||||
collector: TCollector,
|
||||
@@ -171,7 +176,7 @@ where
|
||||
type Fruit = TSegmentCollector::Fruit;
|
||||
|
||||
fn collect(&mut self, doc: u32, score: Score) {
|
||||
let value = self.fast_field_reader.get(doc);
|
||||
let value = self.fast_field_reader.get_val(doc as u64);
|
||||
if (self.predicate)(value) {
|
||||
self.segment_collector.collect(doc, score)
|
||||
}
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
use fastdivide::DividerU64;
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastValue};
|
||||
use crate::schema::{Field, Type};
|
||||
use crate::{DocId, Score};
|
||||
use fastdivide::DividerU64;
|
||||
|
||||
/// Histogram builds an histogram of the values of a fastfield for the
|
||||
/// collected DocSet.
|
||||
@@ -18,7 +20,7 @@ use fastdivide::DividerU64;
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// f64 field. are not supported.
|
||||
/// f64 fields are not supported.
|
||||
#[derive(Clone)]
|
||||
pub struct HistogramCollector {
|
||||
min_value: u64,
|
||||
@@ -36,8 +38,8 @@ impl HistogramCollector {
|
||||
/// - `bucket_width`: the length of the interval that is associated to each buckets.
|
||||
/// - `num_buckets`: The overall number of buckets.
|
||||
///
|
||||
/// Together, this parameters define a partition of `[min_value, min_value + num_buckets * bucket_width)`
|
||||
/// into `num_buckets` intervals of width bucket that we call `bucket`.
|
||||
/// Together, this parameters define a partition of `[min_value, min_value + num_buckets *
|
||||
/// bucket_width)` into `num_buckets` intervals of width bucket that we call `bucket`.
|
||||
///
|
||||
/// # Disclaimer
|
||||
/// This function panics if the field given is of type f64.
|
||||
@@ -71,8 +73,7 @@ impl HistogramComputer {
|
||||
return;
|
||||
}
|
||||
let delta = value - self.min_value;
|
||||
let delta_u64 = delta.to_u64();
|
||||
let bucket_id: usize = self.divider.divide(delta_u64) as usize;
|
||||
let bucket_id: usize = self.divider.divide(delta) as usize;
|
||||
if bucket_id < self.counts.len() {
|
||||
self.counts[bucket_id] += 1;
|
||||
}
|
||||
@@ -91,7 +92,7 @@ impl SegmentCollector for SegmentHistogramCollector {
|
||||
type Fruit = Vec<u64>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let value = self.ff_reader.get(doc);
|
||||
let value = self.ff_reader.get_val(doc as u64);
|
||||
self.histogram_computer.add_value(value);
|
||||
}
|
||||
|
||||
@@ -147,13 +148,14 @@ fn add_vecs(mut vals_list: Vec<Vec<u64>>, len: usize) -> Vec<u64> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{add_vecs, HistogramCollector, HistogramComputer};
|
||||
use crate::chrono::{TimeZone, Utc};
|
||||
use crate::schema::{Schema, FAST};
|
||||
use crate::{doc, query, Index};
|
||||
use fastdivide::DividerU64;
|
||||
use query::AllQuery;
|
||||
|
||||
use super::{add_vecs, HistogramCollector, HistogramComputer};
|
||||
use crate::schema::{Schema, FAST};
|
||||
use crate::time::{Date, Month};
|
||||
use crate::{doc, query, DateTime, Index};
|
||||
|
||||
#[test]
|
||||
fn test_add_histograms_simple() {
|
||||
assert_eq!(
|
||||
@@ -226,10 +228,10 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
|
||||
writer.add_document(doc!(val_field=>12i64));
|
||||
writer.add_document(doc!(val_field=>-30i64));
|
||||
writer.add_document(doc!(val_field=>-12i64));
|
||||
writer.add_document(doc!(val_field=>-10i64));
|
||||
writer.add_document(doc!(val_field=>12i64))?;
|
||||
writer.add_document(doc!(val_field=>-30i64))?;
|
||||
writer.add_document(doc!(val_field=>-12i64))?;
|
||||
writer.add_document(doc!(val_field=>-10i64))?;
|
||||
writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
@@ -247,13 +249,13 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
|
||||
writer.add_document(doc!(val_field=>12i64));
|
||||
writer.add_document(doc!(val_field=>12i64))?;
|
||||
writer.commit()?;
|
||||
writer.add_document(doc!(val_field=>-30i64));
|
||||
writer.add_document(doc!(val_field=>-30i64))?;
|
||||
writer.commit()?;
|
||||
writer.add_document(doc!(val_field=>-12i64));
|
||||
writer.add_document(doc!(val_field=>-12i64))?;
|
||||
writer.commit()?;
|
||||
writer.add_document(doc!(val_field=>-10i64));
|
||||
writer.add_document(doc!(val_field=>-10i64))?;
|
||||
writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
@@ -271,17 +273,21 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
|
||||
writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)));
|
||||
writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)));
|
||||
writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)));
|
||||
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.add_document(
|
||||
doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1986, Month::March, 9)?.with_hms(0, 0, 0)?)),
|
||||
)?;
|
||||
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let all_query = AllQuery;
|
||||
let week_histogram_collector = HistogramCollector::new(
|
||||
date_field,
|
||||
Utc.ymd(1980, 1, 1).and_hms(0, 0, 0),
|
||||
3600 * 24 * 365, // it is just for a unit test... sorry leap years.
|
||||
DateTime::from_primitive(
|
||||
Date::from_calendar_date(1980, Month::January, 1)?.with_hms(0, 0, 0)?,
|
||||
),
|
||||
3_600_000_000 * 24 * 365, // it is just for a unit test... sorry leap years.
|
||||
10,
|
||||
);
|
||||
let week_histogram = searcher.search(&all_query, &week_histogram_collector)?;
|
||||
|
||||
@@ -1,95 +1,90 @@
|
||||
/*!
|
||||
//! # Collectors
|
||||
//!
|
||||
//! Collectors define the information you want to extract from the documents matching the queries.
|
||||
//! In tantivy jargon, we call this information your search "fruit".
|
||||
//!
|
||||
//! Your fruit could for instance be :
|
||||
//! - [the count of matching documents](./struct.Count.html)
|
||||
//! - [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html)
|
||||
//! - [facet counts](./struct.FacetCollector.html)
|
||||
//!
|
||||
//! At one point in your code, you will trigger the actual search operation by calling
|
||||
//! [the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search).
|
||||
//! This call will look like this.
|
||||
//!
|
||||
//! ```verbatim
|
||||
//! let fruit = searcher.search(&query, &collector)?;
|
||||
//! ```
|
||||
//!
|
||||
//! Here the type of fruit is actually determined as an associated type of the collector
|
||||
//! (`Collector::Fruit`).
|
||||
//!
|
||||
//!
|
||||
//! # Combining several collectors
|
||||
//!
|
||||
//! A rich search experience often requires to run several collectors on your search query.
|
||||
//! For instance,
|
||||
//! - selecting the top-K products matching your query
|
||||
//! - counting the matching documents
|
||||
//! - computing several facets
|
||||
//! - computing statistics about the matching product prices
|
||||
//!
|
||||
//! A simple and efficient way to do that is to pass your collectors as one tuple.
|
||||
//! The resulting `Fruit` will then be a typed tuple with each collector's original fruits
|
||||
//! in their respective position.
|
||||
//!
|
||||
//! ```rust
|
||||
//! # use tantivy::schema::*;
|
||||
//! # use tantivy::*;
|
||||
//! # use tantivy::query::*;
|
||||
//! use tantivy::collector::{Count, TopDocs};
|
||||
//! #
|
||||
//! # fn main() -> tantivy::Result<()> {
|
||||
//! # let mut schema_builder = Schema::builder();
|
||||
//! # let title = schema_builder.add_text_field("title", TEXT);
|
||||
//! # let schema = schema_builder.build();
|
||||
//! # let index = Index::create_in_ram(schema);
|
||||
//! # let mut index_writer = index.writer(3_000_000)?;
|
||||
//! # index_writer.add_document(doc!(
|
||||
//! # title => "The Name of the Wind",
|
||||
//! # ))?;
|
||||
//! # index_writer.add_document(doc!(
|
||||
//! # title => "The Diary of Muadib",
|
||||
//! # ))?;
|
||||
//! # index_writer.commit()?;
|
||||
//! # let reader = index.reader()?;
|
||||
//! # let searcher = reader.searcher();
|
||||
//! # let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
//! # let query = query_parser.parse_query("diary")?;
|
||||
//! let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
|
||||
//! searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
|
||||
//! # Ok(())
|
||||
//! # }
|
||||
//! ```
|
||||
//!
|
||||
//! The `Collector` trait is implemented for up to 4 collectors.
|
||||
//! If you have more than 4 collectors, you can either group them into
|
||||
//! tuples of tuples `(a,(b,(c,d)))`, or rely on [`MultiCollector`](./struct.MultiCollector.html).
|
||||
//!
|
||||
//! # Combining several collectors dynamically
|
||||
//!
|
||||
//! Combining collectors into a tuple is a zero-cost abstraction: everything
|
||||
//! happens as if you had manually implemented a single collector
|
||||
//! combining all of our features.
|
||||
//!
|
||||
//! Unfortunately it requires you to know at compile time your collector types.
|
||||
//! If on the other hand, the collectors depend on some query parameter,
|
||||
//! you can rely on `MultiCollector`'s.
|
||||
//!
|
||||
//!
|
||||
//! # Implementing your own collectors.
|
||||
//!
|
||||
//! See the `custom_collector` example.
|
||||
|
||||
# Collectors
|
||||
|
||||
Collectors define the information you want to extract from the documents matching the queries.
|
||||
In tantivy jargon, we call this information your search "fruit".
|
||||
|
||||
Your fruit could for instance be :
|
||||
- [the count of matching documents](./struct.Count.html)
|
||||
- [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html)
|
||||
- [facet counts](./struct.FacetCollector.html)
|
||||
|
||||
At one point in your code, you will trigger the actual search operation by calling
|
||||
[the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search).
|
||||
This call will look like this.
|
||||
|
||||
```verbatim
|
||||
let fruit = searcher.search(&query, &collector)?;
|
||||
```
|
||||
|
||||
Here the type of fruit is actually determined as an associated type of the collector (`Collector::Fruit`).
|
||||
|
||||
|
||||
# Combining several collectors
|
||||
|
||||
A rich search experience often requires to run several collectors on your search query.
|
||||
For instance,
|
||||
- selecting the top-K products matching your query
|
||||
- counting the matching documents
|
||||
- computing several facets
|
||||
- computing statistics about the matching product prices
|
||||
|
||||
A simple and efficient way to do that is to pass your collectors as one tuple.
|
||||
The resulting `Fruit` will then be a typed tuple with each collector's original fruits
|
||||
in their respective position.
|
||||
|
||||
```rust
|
||||
# use tantivy::schema::*;
|
||||
# use tantivy::*;
|
||||
# use tantivy::query::*;
|
||||
use tantivy::collector::{Count, TopDocs};
|
||||
#
|
||||
# fn main() -> tantivy::Result<()> {
|
||||
# let mut schema_builder = Schema::builder();
|
||||
# let title = schema_builder.add_text_field("title", TEXT);
|
||||
# let schema = schema_builder.build();
|
||||
# let index = Index::create_in_ram(schema);
|
||||
# let mut index_writer = index.writer(3_000_000)?;
|
||||
# index_writer.add_document(doc!(
|
||||
# title => "The Name of the Wind",
|
||||
# ));
|
||||
# index_writer.add_document(doc!(
|
||||
# title => "The Diary of Muadib",
|
||||
# ));
|
||||
# index_writer.commit()?;
|
||||
# let reader = index.reader()?;
|
||||
# let searcher = reader.searcher();
|
||||
# let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
# let query = query_parser.parse_query("diary")?;
|
||||
let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
|
||||
searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
|
||||
# Ok(())
|
||||
# }
|
||||
```
|
||||
|
||||
The `Collector` trait is implemented for up to 4 collectors.
|
||||
If you have more than 4 collectors, you can either group them into
|
||||
tuples of tuples `(a,(b,(c,d)))`, or rely on [`MultiCollector`](./struct.MultiCollector.html).
|
||||
|
||||
# Combining several collectors dynamically
|
||||
|
||||
Combining collectors into a tuple is a zero-cost abstraction: everything
|
||||
happens as if you had manually implemented a single collector
|
||||
combining all of our features.
|
||||
|
||||
Unfortunately it requires you to know at compile time your collector types.
|
||||
If on the other hand, the collectors depend on some query parameter,
|
||||
you can rely on `MultiCollector`'s.
|
||||
|
||||
|
||||
# Implementing your own collectors.
|
||||
|
||||
See the `custom_collector` example.
|
||||
|
||||
*/
|
||||
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use downcast_rs::impl_downcast;
|
||||
|
||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader};
|
||||
|
||||
mod count_collector;
|
||||
pub use self::count_collector::Count;
|
||||
|
||||
@@ -97,7 +92,7 @@ mod histogram_collector;
|
||||
pub use histogram_collector::HistogramCollector;
|
||||
|
||||
mod multi_collector;
|
||||
pub use self::multi_collector::MultiCollector;
|
||||
pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
|
||||
|
||||
mod top_collector;
|
||||
|
||||
@@ -111,8 +106,7 @@ mod tweak_score_top_collector;
|
||||
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
|
||||
|
||||
mod facet_collector;
|
||||
pub use self::facet_collector::FacetCollector;
|
||||
pub use self::facet_collector::FacetCounts;
|
||||
pub use self::facet_collector::{FacetCollector, FacetCounts};
|
||||
use crate::query::Weight;
|
||||
|
||||
mod docset_collector;
|
||||
@@ -178,9 +172,9 @@ pub trait Collector: Sync + Send {
|
||||
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
|
||||
let mut segment_collector = self.for_segment(segment_ord as u32, reader)?;
|
||||
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
if let Some(alive_bitset) = reader.alive_bitset() {
|
||||
weight.for_each(reader, &mut |doc, score| {
|
||||
if delete_bitset.is_alive(doc) {
|
||||
if alive_bitset.is_alive(doc) {
|
||||
segment_collector.collect(doc, score);
|
||||
}
|
||||
})?;
|
||||
|
||||
@@ -1,14 +1,11 @@
|
||||
use super::Collector;
|
||||
use super::SegmentCollector;
|
||||
use crate::collector::Fruit;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use crate::TantivyError;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::Deref;
|
||||
|
||||
use super::{Collector, SegmentCollector};
|
||||
use crate::collector::Fruit;
|
||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
|
||||
/// MultiFruit keeps Fruits from every nested Collector
|
||||
pub struct MultiFruit {
|
||||
sub_fruits: Vec<Option<Box<dyn Fruit>>>,
|
||||
}
|
||||
@@ -83,12 +80,17 @@ impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
|
||||
}
|
||||
}
|
||||
|
||||
/// FruitHandle stores reference to the corresponding collector inside MultiCollector
|
||||
pub struct FruitHandle<TFruit: Fruit> {
|
||||
pos: usize,
|
||||
_phantom: PhantomData<TFruit>,
|
||||
}
|
||||
|
||||
impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
/// Extract a typed fruit off a multifruit.
|
||||
///
|
||||
/// This function involves downcasting and can panic if the multifruit was
|
||||
/// created using faulty code.
|
||||
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
|
||||
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
|
||||
*boxed_fruit
|
||||
@@ -104,7 +106,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
///
|
||||
/// If the type of the collectors is known, you can just group yours collectors
|
||||
/// in a tuple. See the
|
||||
/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors).
|
||||
/// [Combining several collectors section of the collector
|
||||
/// documentation](./index.html#combining-several-collectors).
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::collector::{Count, TopDocs, MultiCollector};
|
||||
@@ -112,19 +115,19 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{doc, Index};
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
|
||||
/// assert!(index_writer.commit().is_ok());
|
||||
///
|
||||
/// let reader = index.reader().unwrap();
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// let mut collectors = MultiCollector::new();
|
||||
@@ -139,6 +142,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
///
|
||||
/// assert_eq!(count, 2);
|
||||
/// assert_eq!(top_docs.len(), 2);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
#[allow(clippy::type_complexity)]
|
||||
#[derive(Default)]
|
||||
@@ -246,30 +251,28 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::collector::{Count, TopDocs};
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
use crate::schema::{IndexRecordOption, Schema, TEXT};
|
||||
use crate::{Index, Term};
|
||||
|
||||
#[test]
|
||||
fn test_multi_collector() {
|
||||
fn test_multi_collector() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.add_document(doc!(text=>"abc abc abc"));
|
||||
index_writer.add_document(doc!(text=>"abc abc"));
|
||||
index_writer.commit().unwrap();
|
||||
index_writer.add_document(doc!(text=>""));
|
||||
index_writer.add_document(doc!(text=>"abc abc abc abc"));
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.commit().unwrap();
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(text=>"abc"))?;
|
||||
index_writer.add_document(doc!(text=>"abc abc abc"))?;
|
||||
index_writer.add_document(doc!(text=>"abc abc"))?;
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!(text=>""))?;
|
||||
index_writer.add_document(doc!(text=>"abc abc abc abc"))?;
|
||||
index_writer.add_document(doc!(text=>"abc"))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let term = Term::from_field_text(text, "abc");
|
||||
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
|
||||
@@ -280,5 +283,6 @@ mod tests {
|
||||
|
||||
assert_eq!(count_handler.extract(&mut multifruits), 5);
|
||||
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,20 +1,14 @@
|
||||
use super::*;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fastfield::BytesFastFieldReader;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Field;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::{DocAddress, Document, Searcher};
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use super::*;
|
||||
use crate::collector::{Count, FilterCollector, TopDocs};
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fastfield::{BytesFastFieldReader, DynamicFastFieldReader};
|
||||
use crate::query::{AllQuery, QueryParser};
|
||||
use crate::schema::{Schema, FAST, TEXT};
|
||||
use crate::DateTime;
|
||||
use crate::{doc, Index};
|
||||
use std::str::FromStr;
|
||||
use crate::schema::{Field, Schema, FAST, TEXT};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{doc, DateTime, DocAddress, DocId, Document, Index, Score, Searcher, SegmentOrdinal};
|
||||
|
||||
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
|
||||
compute_score: true,
|
||||
@@ -25,7 +19,7 @@ pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
|
||||
};
|
||||
|
||||
#[test]
|
||||
pub fn test_filter_collector() {
|
||||
pub fn test_filter_collector() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let price = schema_builder.add_u64_field("price", FAST);
|
||||
@@ -33,25 +27,25 @@ pub fn test_filter_collector() {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()));
|
||||
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()));
|
||||
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()));
|
||||
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()));
|
||||
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_utc(OffsetDateTime::parse("1898-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2020-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-20T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_utc(OffsetDateTime::parse("2018-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
let query = query_parser.parse_query("diary").unwrap();
|
||||
let query = query_parser.parse_query("diary")?;
|
||||
let filter_some_collector = FilterCollector::new(
|
||||
price,
|
||||
&|value: u64| value > 20_120u64,
|
||||
TopDocs::with_limit(2),
|
||||
);
|
||||
let top_docs = searcher.search(&query, &filter_some_collector).unwrap();
|
||||
let top_docs = searcher.search(&query, &filter_some_collector)?;
|
||||
|
||||
assert_eq!(top_docs.len(), 1);
|
||||
assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
|
||||
@@ -63,21 +57,22 @@ pub fn test_filter_collector() {
|
||||
assert_eq!(filtered_top_docs.len(), 0);
|
||||
|
||||
fn date_filter(value: DateTime) -> bool {
|
||||
(value - DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()).num_weeks() > 0
|
||||
(value.into_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
|
||||
.whole_weeks()
|
||||
> 0
|
||||
}
|
||||
|
||||
let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
|
||||
let filtered_date_docs = searcher.search(&query, &filter_dates_collector).unwrap();
|
||||
let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;
|
||||
|
||||
assert_eq!(filtered_date_docs.len(), 2);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
/// It is unusable in pr
|
||||
///
|
||||
/// actise, as it does not store
|
||||
/// the segment ordinals
|
||||
/// It is unusable in practise, as it does
|
||||
/// not store the segment ordinals
|
||||
pub struct TestCollector {
|
||||
pub compute_score: bool,
|
||||
}
|
||||
@@ -204,7 +199,7 @@ impl SegmentCollector for FastFieldSegmentCollector {
|
||||
type Fruit = Vec<u64>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let val = self.reader.get(doc);
|
||||
let val = self.reader.get_val(doc as u64);
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
@@ -270,12 +265,12 @@ impl SegmentCollector for BytesFastFieldSegmentCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn make_test_searcher() -> crate::Result<crate::LeasedItem<Searcher>> {
|
||||
fn make_test_searcher() -> crate::Result<Searcher> {
|
||||
let schema = Schema::builder().build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(Document::default());
|
||||
index_writer.add_document(Document::default());
|
||||
index_writer.add_document(Document::default())?;
|
||||
index_writer.add_document(Document::default())?;
|
||||
index_writer.commit()?;
|
||||
Ok(index.reader()?.searcher())
|
||||
}
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use crate::{DocAddress, DocId, SegmentOrdinal, SegmentReader};
|
||||
|
||||
/// Contains a feature (field, score, etc.) of a document along with the document address.
|
||||
///
|
||||
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
|
||||
@@ -62,17 +60,14 @@ pub(crate) struct TopCollector<T> {
|
||||
}
|
||||
|
||||
impl<T> TopCollector<T>
|
||||
where
|
||||
T: PartialOrd + Clone,
|
||||
where T: PartialOrd + Clone
|
||||
{
|
||||
/// Creates a top collector, with a number of documents equal to "limit".
|
||||
///
|
||||
/// # Panics
|
||||
/// The method panics if limit is 0
|
||||
pub fn with_limit(limit: usize) -> TopCollector<T> {
|
||||
if limit < 1 {
|
||||
panic!("Limit must be strictly greater than 0.");
|
||||
}
|
||||
assert!(limit >= 1, "Limit must be strictly greater than 0.");
|
||||
Self {
|
||||
limit,
|
||||
offset: 0,
|
||||
@@ -142,7 +137,7 @@ where
|
||||
/// sorted by type `T`.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
pub(crate) struct TopSegmentCollector<T> {
|
||||
limit: usize,
|
||||
@@ -178,8 +173,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Return true iff at least K documents have gone through
|
||||
/// the collector.
|
||||
/// Return true if more documents have been collected than the limit.
|
||||
#[inline]
|
||||
pub(crate) fn at_capacity(&self) -> bool {
|
||||
self.heap.len() >= self.limit
|
||||
@@ -255,7 +249,7 @@ mod tests {
|
||||
// when harvesting we have to guarantee stable sorting in case of a tie
|
||||
// on the score
|
||||
let doc_ids_collection = [4, 5, 6];
|
||||
let score = 3.14;
|
||||
let score = 3.3f32;
|
||||
|
||||
let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2);
|
||||
for id in &doc_ids_collection {
|
||||
@@ -324,9 +318,10 @@ mod tests {
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::TopSegmentCollector;
|
||||
use test::Bencher;
|
||||
|
||||
use super::TopSegmentCollector;
|
||||
|
||||
#[bench]
|
||||
fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
|
||||
let mut top_collector = TopSegmentCollector::new(0, 400);
|
||||
|
||||
@@ -1,21 +1,20 @@
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use fastfield_codecs::Column;
|
||||
|
||||
use super::Collector;
|
||||
use crate::collector::top_collector::{ComparableDoc, TopCollector};
|
||||
use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
|
||||
use crate::collector::top_collector::{ComparableDoc, TopCollector, TopSegmentCollector};
|
||||
use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
|
||||
use crate::collector::{
|
||||
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
|
||||
};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastValue};
|
||||
use crate::query::Weight;
|
||||
use crate::schema::Field;
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentOrdinal;
|
||||
use crate::SegmentReader;
|
||||
use crate::{collector::custom_score_top_collector::CustomScoreTopCollector, fastfield::FastValue};
|
||||
use crate::{collector::top_collector::TopSegmentCollector, TantivyError};
|
||||
use std::fmt;
|
||||
use std::{collections::BinaryHeap, marker::PhantomData};
|
||||
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
|
||||
struct FastFieldConvertCollector<
|
||||
TCollector: Collector<Fruit = Vec<(u64, DocAddress)>>,
|
||||
@@ -82,7 +81,7 @@ where
|
||||
/// sorted by their score.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// This collector guarantees a stable sorting in case of a tie on the
|
||||
@@ -94,27 +93,30 @@ where
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{doc, DocAddress, Index};
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
|
||||
/// assert!(index_writer.commit().is_ok());
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let reader = index.reader().unwrap();
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary").unwrap();
|
||||
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
|
||||
///
|
||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
|
||||
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
pub struct TopDocs(TopCollector<Score>);
|
||||
|
||||
@@ -134,7 +136,7 @@ struct ScorerByFastFieldReader {
|
||||
|
||||
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
|
||||
fn score(&mut self, doc: DocId) -> u64 {
|
||||
self.ff_reader.get(doc)
|
||||
self.ff_reader.get_val(doc as u64)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,41 +182,46 @@ impl TopDocs {
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{doc, DocAddress, Index};
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"));
|
||||
/// assert!(index_writer.commit().is_ok());
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"))?;
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let reader = index.reader().unwrap();
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary").unwrap();
|
||||
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1))?;
|
||||
///
|
||||
/// assert_eq!(top_docs.len(), 2);
|
||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 4));
|
||||
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
|
||||
/// Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
#[must_use]
|
||||
pub fn and_offset(self, offset: usize) -> TopDocs {
|
||||
TopDocs(self.0.and_offset(offset))
|
||||
}
|
||||
|
||||
/// Set top-K to rank documents by a given fast field.
|
||||
///
|
||||
/// If the field is not a fast or does not exist, this method returns successfully (it is not aware of any schema).
|
||||
/// An error will be returned at the moment of search.
|
||||
/// If the field is not a fast or does not exist, this method returns successfully (it is not
|
||||
/// aware of any schema). An error will be returned at the moment of search.
|
||||
///
|
||||
/// If the field is a FAST field but not a u64 field, search will return successfully but it will return
|
||||
/// returns a monotonic u64-representation (ie. the order is still correct) of the requested field type.
|
||||
/// If the field is a FAST field but not a u64 field, search will return successfully but it
|
||||
/// will return returns a monotonic u64-representation (ie. the order is still correct) of
|
||||
/// the requested field type.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
@@ -234,11 +241,11 @@ impl TopDocs {
|
||||
/// #
|
||||
/// # let index = Index::create_in_ram(schema);
|
||||
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
|
||||
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
|
||||
/// # assert!(index_writer.commit().is_ok());
|
||||
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64))?;
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64))?;
|
||||
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64))?;
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64))?;
|
||||
/// # index_writer.commit()?;
|
||||
/// # let reader = index.reader()?;
|
||||
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
|
||||
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
|
||||
@@ -278,7 +285,7 @@ impl TopDocs {
|
||||
///
|
||||
/// # See also
|
||||
///
|
||||
/// To confortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
|
||||
/// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
|
||||
/// [.order_by_fast_field(...)](#method.order_by_fast_field) method.
|
||||
pub fn order_by_u64_field(
|
||||
self,
|
||||
@@ -289,14 +296,15 @@ impl TopDocs {
|
||||
|
||||
/// Set top-K to rank documents by a given fast field.
|
||||
///
|
||||
/// If the field is not a fast field, or its field type does not match the generic type, this method does not panic,
|
||||
/// but an explicit error will be returned at the moment of collection.
|
||||
/// If the field is not a fast field, or its field type does not match the generic type, this
|
||||
/// method does not panic, but an explicit error will be returned at the moment of
|
||||
/// collection.
|
||||
///
|
||||
/// Note that this method is a generic. The requested fast field type will be often
|
||||
/// inferred in your code by the rust compiler.
|
||||
///
|
||||
/// Implementation-wise, for performance reason, tantivy will manipulate the u64 representation of your fast
|
||||
/// field until the last moment.
|
||||
/// Implementation-wise, for performance reason, tantivy will manipulate the u64 representation
|
||||
/// of your fast field until the last moment.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
@@ -316,9 +324,9 @@ impl TopDocs {
|
||||
/// #
|
||||
/// # let index = Index::create_in_ram(schema);
|
||||
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64));
|
||||
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64));
|
||||
/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64));
|
||||
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64))?;
|
||||
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64))?;
|
||||
/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64))?;
|
||||
/// # assert!(index_writer.commit().is_ok());
|
||||
/// # let reader = index.reader()?;
|
||||
/// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?;
|
||||
@@ -401,7 +409,7 @@ impl TopDocs {
|
||||
/// # use tantivy::query::QueryParser;
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::fastfield::FastFieldReader;
|
||||
/// use tantivy::fastfield::Column;
|
||||
/// use tantivy::schema::Field;
|
||||
///
|
||||
/// fn create_schema() -> Schema {
|
||||
@@ -417,9 +425,9 @@ impl TopDocs {
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
/// let product_name = index.schema().get_field("product_name").unwrap();
|
||||
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
|
||||
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
|
||||
/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64));
|
||||
/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64));
|
||||
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64))?;
|
||||
/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64))?;
|
||||
/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64))?;
|
||||
/// index_writer.commit()?;
|
||||
/// Ok(index)
|
||||
/// }
|
||||
@@ -450,7 +458,7 @@ impl TopDocs {
|
||||
///
|
||||
/// // We can now define our actual scoring function
|
||||
/// move |doc: DocId, original_score: Score| {
|
||||
/// let popularity: u64 = popularity_reader.get(doc);
|
||||
/// let popularity: u64 = popularity_reader.get_val(doc as u64);
|
||||
/// // Well.. For the sake of the example we use a simple logarithm
|
||||
/// // function.
|
||||
/// let popularity_boost_score = ((2u64 + popularity) as Score).log2();
|
||||
@@ -493,7 +501,7 @@ impl TopDocs {
|
||||
///
|
||||
/// This method only makes it possible to compute the score from a given
|
||||
/// `DocId`, fastfield values for the doc and any information you could
|
||||
/// have precomputed beforehands. It does not make it possible for instance
|
||||
/// have precomputed beforehand. It does not make it possible for instance
|
||||
/// to compute something like TfIdf as it does not have access to the list of query
|
||||
/// terms present in the document, nor the term frequencies for the different terms.
|
||||
///
|
||||
@@ -509,7 +517,7 @@ impl TopDocs {
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::schema::Field;
|
||||
/// use tantivy::fastfield::FastFieldReader;
|
||||
/// use fastfield_codecs::Column;
|
||||
///
|
||||
/// # fn create_schema() -> Schema {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
@@ -527,9 +535,9 @@ impl TopDocs {
|
||||
/// #
|
||||
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
|
||||
/// let boosted: Field = index.schema().get_field("boosted").unwrap();
|
||||
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
|
||||
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64));
|
||||
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64));
|
||||
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64))?;
|
||||
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64))?;
|
||||
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64))?;
|
||||
/// # index_writer.commit()?;
|
||||
/// // ...
|
||||
/// # let user_query = "diary";
|
||||
@@ -561,8 +569,8 @@ impl TopDocs {
|
||||
///
|
||||
/// // We can now define our actual scoring function
|
||||
/// move |doc: DocId| {
|
||||
/// let popularity: u64 = popularity_reader.get(doc);
|
||||
/// let boosted: u64 = boosted_reader.get(doc);
|
||||
/// let popularity: u64 = popularity_reader.get_val(doc as u64);
|
||||
/// let boosted: u64 = boosted_reader.get_val(doc as u64);
|
||||
/// // Score do not have to be `f64` in tantivy.
|
||||
/// // Here we return a couple to get lexicographical order
|
||||
/// // for free.
|
||||
@@ -629,10 +637,10 @@ impl Collector for TopDocs {
|
||||
let heap_len = self.0.limit + self.0.offset;
|
||||
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);
|
||||
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
if let Some(alive_bitset) = reader.alive_bitset() {
|
||||
let mut threshold = Score::MIN;
|
||||
weight.for_each_pruning(threshold, reader, &mut |doc, score| {
|
||||
if delete_bitset.is_deleted(doc) {
|
||||
if alive_bitset.is_deleted(doc) {
|
||||
return threshold;
|
||||
}
|
||||
let heap_item = ComparableDoc {
|
||||
@@ -708,25 +716,22 @@ mod tests {
|
||||
use crate::collector::Collector;
|
||||
use crate::query::{AllQuery, Query, QueryParser};
|
||||
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
||||
use crate::Index;
|
||||
use crate::IndexWriter;
|
||||
use crate::Score;
|
||||
use crate::{DocAddress, DocId, SegmentReader};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Score, SegmentReader};
|
||||
|
||||
fn make_index() -> Index {
|
||||
fn make_index() -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
|
||||
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
|
||||
index_writer.add_document(doc!(text_field=>"I like Droopy"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."))?;
|
||||
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"))?;
|
||||
index_writer.add_document(doc!(text_field=>"I like Droopy"))?;
|
||||
index_writer.commit()?;
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
fn assert_results_equals(results: &[(Score, DocAddress)], expected: &[(Score, DocAddress)]) {
|
||||
@@ -737,17 +742,15 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity_without_offset() {
|
||||
let index = make_index();
|
||||
fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> {
|
||||
let index = make_index()?;
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let text_query = query_parser.parse_query("droopy tax")?;
|
||||
let score_docs: Vec<(Score, DocAddress)> = index
|
||||
.reader()
|
||||
.unwrap()
|
||||
.reader()?
|
||||
.searcher()
|
||||
.search(&text_query, &TopDocs::with_limit(4))
|
||||
.unwrap();
|
||||
.search(&text_query, &TopDocs::with_limit(4))?;
|
||||
assert_results_equals(
|
||||
&score_docs,
|
||||
&[
|
||||
@@ -756,11 +759,12 @@ mod tests {
|
||||
(0.48527452, DocAddress::new(0, 0)),
|
||||
],
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity_with_offset() {
|
||||
let index = make_index();
|
||||
let index = make_index().unwrap();
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
@@ -775,7 +779,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_at_capacity() {
|
||||
let index = make_index();
|
||||
let index = make_index().unwrap();
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
@@ -796,7 +800,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_at_capacity_with_offset() {
|
||||
let index = make_index();
|
||||
let index = make_index().unwrap();
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
@@ -817,7 +821,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_stable_sorting() {
|
||||
let index = make_index();
|
||||
let index = make_index().unwrap();
|
||||
|
||||
// using AllQuery to get a constant score
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
@@ -848,29 +852,35 @@ mod tests {
|
||||
const SIZE: &str = "size";
|
||||
|
||||
#[test]
|
||||
fn test_top_field_collector_not_at_capacity() {
|
||||
fn test_top_field_collector_not_at_capacity() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||
let schema = schema_builder.build();
|
||||
let (index, query) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "growler of beer",
|
||||
size => 64u64,
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "pint of beer",
|
||||
size => 16u64,
|
||||
));
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "growler of beer",
|
||||
size => 64u64,
|
||||
))
|
||||
.unwrap();
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "pint of beer",
|
||||
size => 16u64,
|
||||
))
|
||||
.unwrap();
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
&[
|
||||
@@ -879,32 +889,37 @@ mod tests {
|
||||
(12, DocAddress::new(0, 0))
|
||||
]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_field_collector_datetime() -> crate::Result<()> {
|
||||
use std::str::FromStr;
|
||||
let mut schema_builder = Schema::builder();
|
||||
let name = schema_builder.add_text_field("name", TEXT);
|
||||
let birthday = schema_builder.add_date_field("birthday", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let pr_birthday = crate::DateTime::from_str("1898-04-09T00:00:00+00:00")?;
|
||||
let pr_birthday = DateTime::from_utc(OffsetDateTime::parse(
|
||||
"1898-04-09T00:00:00+00:00",
|
||||
&Rfc3339,
|
||||
)?);
|
||||
index_writer.add_document(doc!(
|
||||
name => "Paul Robeson",
|
||||
birthday => pr_birthday
|
||||
));
|
||||
let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?;
|
||||
birthday => pr_birthday,
|
||||
))?;
|
||||
let mr_birthday = DateTime::from_utc(OffsetDateTime::parse(
|
||||
"1947-11-08T00:00:00+00:00",
|
||||
&Rfc3339,
|
||||
)?);
|
||||
index_writer.add_document(doc!(
|
||||
name => "Minnie Riperton",
|
||||
birthday => mr_birthday
|
||||
));
|
||||
birthday => mr_birthday,
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
|
||||
let top_docs: Vec<(crate::DateTime, DocAddress)> =
|
||||
searcher.search(&AllQuery, &top_collector)?;
|
||||
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
|
||||
assert_eq!(
|
||||
&top_docs[..],
|
||||
&[
|
||||
@@ -926,11 +941,11 @@ mod tests {
|
||||
index_writer.add_document(doc!(
|
||||
city => "georgetown",
|
||||
altitude => -1i64,
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
city => "tokyo",
|
||||
altitude => 40i64,
|
||||
));
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
|
||||
@@ -956,11 +971,11 @@ mod tests {
|
||||
index_writer.add_document(doc!(
|
||||
city => "georgetown",
|
||||
altitude => -1.0f64,
|
||||
));
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
city => "tokyo",
|
||||
altitude => 40f64,
|
||||
));
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
|
||||
@@ -983,10 +998,12 @@ mod tests {
|
||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||
let schema = schema_builder.build();
|
||||
let (index, _) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
index_writer
|
||||
.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
))
|
||||
.unwrap();
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
|
||||
@@ -1003,7 +1020,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(size=>1u64));
|
||||
index_writer.add_document(doc!(size=>1u64))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
@@ -1020,7 +1037,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(size=>1u64));
|
||||
index_writer.add_document(doc!(size=>1u64))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
@@ -1033,30 +1050,26 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tweak_score_top_collector_with_offset() {
|
||||
let index = make_index();
|
||||
fn test_tweak_score_top_collector_with_offset() -> crate::Result<()> {
|
||||
let index = make_index()?;
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let text_query = query_parser.parse_query("droopy tax")?;
|
||||
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
|
||||
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
|
||||
);
|
||||
let score_docs: Vec<(u32, DocAddress)> = index
|
||||
.reader()
|
||||
.unwrap()
|
||||
.searcher()
|
||||
.search(&text_query, &collector)
|
||||
.unwrap();
|
||||
|
||||
let score_docs: Vec<(u32, DocAddress)> =
|
||||
index.reader()?.searcher().search(&text_query, &collector)?;
|
||||
assert_eq!(
|
||||
score_docs,
|
||||
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_custom_score_top_collector_with_offset() {
|
||||
let index = make_index();
|
||||
let index = make_index().unwrap();
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use crate::collector::top_collector::{TopCollector, TopSegmentCollector};
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::DocAddress;
|
||||
use crate::{DocId, Result, Score, SegmentReader};
|
||||
use crate::{DocAddress, DocId, Result, Score, SegmentReader};
|
||||
|
||||
pub(crate) struct TweakedScoreTopCollector<TScoreTweaker, TScore = Score> {
|
||||
score_tweaker: TScoreTweaker,
|
||||
@@ -9,8 +8,7 @@ pub(crate) struct TweakedScoreTopCollector<TScoreTweaker, TScore = Score> {
|
||||
}
|
||||
|
||||
impl<TScoreTweaker, TScore> TweakedScoreTopCollector<TScoreTweaker, TScore>
|
||||
where
|
||||
TScore: Clone + PartialOrd,
|
||||
where TScore: Clone + PartialOrd
|
||||
{
|
||||
pub fn new(
|
||||
score_tweaker: TScoreTweaker,
|
||||
@@ -118,8 +116,7 @@ where
|
||||
}
|
||||
|
||||
impl<F, TScore> ScoreSegmentTweaker<TScore> for F
|
||||
where
|
||||
F: 'static + FnMut(DocId, Score) -> TScore,
|
||||
where F: 'static + FnMut(DocId, Score) -> TScore
|
||||
{
|
||||
fn score(&mut self, doc: DocId, score: Score) -> TScore {
|
||||
(self)(doc, score)
|
||||
|
||||
@@ -1,396 +0,0 @@
|
||||
use std::fmt;
|
||||
use std::u64;
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
pub(crate) struct TinySet(u64);
|
||||
|
||||
impl fmt::Debug for TinySet {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
self.into_iter().collect::<Vec<u32>>().fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TinySetIterator(TinySet);
|
||||
impl Iterator for TinySetIterator {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.0.pop_lowest()
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoIterator for TinySet {
|
||||
type Item = u32;
|
||||
type IntoIter = TinySetIterator;
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
TinySetIterator(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl TinySet {
|
||||
/// Returns an empty `TinySet`.
|
||||
pub fn empty() -> TinySet {
|
||||
TinySet(0u64)
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
/// Returns the complement of the set in `[0, 64[`.
|
||||
fn complement(self) -> TinySet {
|
||||
TinySet(!self.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` contains the element `el`.
|
||||
pub fn contains(self, el: u32) -> bool {
|
||||
!self.intersect(TinySet::singleton(el)).is_empty()
|
||||
}
|
||||
|
||||
/// Returns the number of elements in the TinySet.
|
||||
pub fn len(self) -> u32 {
|
||||
self.0.count_ones()
|
||||
}
|
||||
|
||||
/// Returns the intersection of `self` and `other`
|
||||
pub fn intersect(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 & other.0)
|
||||
}
|
||||
|
||||
/// Creates a new `TinySet` containing only one element
|
||||
/// within `[0; 64[`
|
||||
#[inline]
|
||||
pub fn singleton(el: u32) -> TinySet {
|
||||
TinySet(1u64 << u64::from(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline]
|
||||
pub fn insert(self, el: u32) -> TinySet {
|
||||
self.union(TinySet::singleton(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline]
|
||||
pub fn insert_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
*self = old.insert(el);
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Returns the union of two tinysets
|
||||
#[inline]
|
||||
pub fn union(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 | other.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` is empty.
|
||||
#[inline]
|
||||
pub fn is_empty(self) -> bool {
|
||||
self.0 == 0u64
|
||||
}
|
||||
|
||||
/// Returns the lowest element in the `TinySet`
|
||||
/// and removes it.
|
||||
#[inline]
|
||||
pub fn pop_lowest(&mut self) -> Option<u32> {
|
||||
if self.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let lowest = self.0.trailing_zeros() as u32;
|
||||
self.0 ^= TinySet::singleton(lowest).0;
|
||||
Some(lowest)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a `TinySet` than contains all values up
|
||||
/// to limit excluded.
|
||||
///
|
||||
/// The limit is assumed to be strictly lower than 64.
|
||||
pub fn range_lower(upper_bound: u32) -> TinySet {
|
||||
TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
|
||||
}
|
||||
|
||||
/// Returns a `TinySet` that contains all values greater
|
||||
/// or equal to the given limit, included. (and up to 63)
|
||||
///
|
||||
/// The limit is assumed to be strictly lower than 64.
|
||||
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
|
||||
TinySet::range_lower(from_included).complement()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BitSet {
|
||||
tinysets: Box<[TinySet]>,
|
||||
len: usize,
|
||||
max_value: u32,
|
||||
}
|
||||
|
||||
fn num_buckets(max_val: u32) -> u32 {
|
||||
(max_val + 63u32) / 64u32
|
||||
}
|
||||
|
||||
impl BitSet {
|
||||
/// Create a new `BitSet` that may contain elements
|
||||
/// within `[0, max_val[`.
|
||||
pub fn with_max_value(max_value: u32) -> BitSet {
|
||||
let num_buckets = num_buckets(max_value);
|
||||
let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
|
||||
BitSet {
|
||||
tinysets: tinybisets,
|
||||
len: 0,
|
||||
max_value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes all elements from the `BitSet`.
|
||||
pub fn clear(&mut self) {
|
||||
for tinyset in self.tinysets.iter_mut() {
|
||||
*tinyset = TinySet::empty();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of elements in the `BitSet`.
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
/// Inserts an element in the `BitSet`
|
||||
pub fn insert(&mut self, el: u32) {
|
||||
// we do not check saturated els.
|
||||
let higher = el / 64u32;
|
||||
let lower = el % 64u32;
|
||||
self.len += if self.tinysets[higher as usize].insert_mut(lower) {
|
||||
1
|
||||
} else {
|
||||
0
|
||||
};
|
||||
}
|
||||
|
||||
/// Returns true iff the elements is in the `BitSet`.
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
self.tinyset(el / 64u32).contains(el % 64)
|
||||
}
|
||||
|
||||
/// Returns the first non-empty `TinySet` associated to a bucket lower
|
||||
/// or greater than bucket.
|
||||
///
|
||||
/// Reminder: the tiny set with the bucket `bucket`, represents the
|
||||
/// elements from `bucket * 64` to `(bucket+1) * 64`.
|
||||
pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
|
||||
self.tinysets[bucket as usize..]
|
||||
.iter()
|
||||
.cloned()
|
||||
.position(|tinyset| !tinyset.is_empty())
|
||||
.map(|delta_bucket| bucket + delta_bucket as u32)
|
||||
}
|
||||
|
||||
pub fn max_value(&self) -> u32 {
|
||||
self.max_value
|
||||
}
|
||||
|
||||
/// Returns the tiny bitset representing the
|
||||
/// the set restricted to the number range from
|
||||
/// `bucket * 64` to `(bucket + 1) * 64`.
|
||||
pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
|
||||
self.tinysets[bucket as usize]
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::query::BitSetDocSet;
|
||||
use crate::tests;
|
||||
use crate::tests::generate_nonunique_unsorted;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashSet;
|
||||
|
||||
#[test]
|
||||
fn test_tiny_set() {
|
||||
assert!(TinySet::empty().is_empty());
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none())
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1u32).insert(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none())
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(2u32);
|
||||
assert_eq!(u.pop_lowest(), Some(2u32));
|
||||
u.insert_mut(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(63u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset() {
|
||||
let test_against_hashset = |els: &[u32], max_value: u32| {
|
||||
let mut hashset: HashSet<u32> = HashSet::new();
|
||||
let mut bitset = BitSet::with_max_value(max_value);
|
||||
for &el in els {
|
||||
assert!(el < max_value);
|
||||
hashset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for el in 0..max_value {
|
||||
assert_eq!(hashset.contains(&el), bitset.contains(el));
|
||||
}
|
||||
assert_eq!(bitset.max_value(), max_value);
|
||||
};
|
||||
|
||||
test_against_hashset(&[], 0);
|
||||
test_against_hashset(&[], 1);
|
||||
test_against_hashset(&[0u32], 1);
|
||||
test_against_hashset(&[0u32], 100);
|
||||
test_against_hashset(&[1u32, 2u32], 4);
|
||||
test_against_hashset(&[99u32], 100);
|
||||
test_against_hashset(&[63u32], 64);
|
||||
test_against_hashset(&[62u32, 63u32], 64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_large() {
|
||||
let arr = generate_nonunique_unsorted(100_000, 5_000);
|
||||
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
|
||||
let mut bitset = BitSet::with_max_value(100_000);
|
||||
for el in arr {
|
||||
btreeset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for i in 0..100_000 {
|
||||
assert_eq!(btreeset.contains(&i), bitset.contains(i));
|
||||
}
|
||||
assert_eq!(btreeset.len(), bitset.len());
|
||||
let mut bitset_docset = BitSetDocSet::from(bitset);
|
||||
let mut remaining = true;
|
||||
for el in btreeset.into_iter() {
|
||||
assert!(remaining);
|
||||
assert_eq!(bitset_docset.doc(), el);
|
||||
remaining = bitset_docset.advance() != TERMINATED;
|
||||
}
|
||||
assert!(!remaining);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_num_buckets() {
|
||||
use super::num_buckets;
|
||||
assert_eq!(num_buckets(0u32), 0);
|
||||
assert_eq!(num_buckets(1u32), 1);
|
||||
assert_eq!(num_buckets(64u32), 1);
|
||||
assert_eq!(num_buckets(65u32), 2);
|
||||
assert_eq!(num_buckets(128u32), 2);
|
||||
assert_eq!(num_buckets(129u32), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tinyset_range() {
|
||||
assert_eq!(
|
||||
TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
|
||||
[0, 1, 2]
|
||||
);
|
||||
assert!(TinySet::range_lower(0).is_empty());
|
||||
assert_eq!(
|
||||
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
|
||||
(0u32..63u32).collect::<Vec<_>>()
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
|
||||
[0]
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
|
||||
[0, 1]
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_greater_or_equal(3)
|
||||
.into_iter()
|
||||
.collect::<Vec<u32>>(),
|
||||
(3u32..64u32).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_len() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(104u32);
|
||||
assert_eq!(bitset.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_clear() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
let els = tests::sample(1_000, 0.01f64);
|
||||
for &el in &els {
|
||||
bitset.insert(el);
|
||||
}
|
||||
assert!(els.iter().all(|el| bitset.contains(*el)));
|
||||
bitset.clear();
|
||||
for el in 0u32..1000u32 {
|
||||
assert!(!bitset.contains(el));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use test;
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
b.iter(|| {
|
||||
let mut tinyset = TinySet::singleton(test::black_box(31u32));
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_sum(b: &mut test::Bencher) {
|
||||
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
||||
b.iter(|| {
|
||||
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyarr_sum(b: &mut test::Bencher) {
|
||||
let v = [10u32, 14u32, 21u32];
|
||||
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
||||
}
|
||||
}
|
||||
@@ -1,203 +0,0 @@
|
||||
mod bitset;
|
||||
mod composite_file;
|
||||
|
||||
pub use self::bitset::BitSet;
|
||||
pub(crate) use self::bitset::TinySet;
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
pub use common::CountingWriter;
|
||||
pub use common::{
|
||||
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
|
||||
};
|
||||
pub use common::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
|
||||
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
||||
///
|
||||
/// We do not allow segments with more than
|
||||
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
|
||||
|
||||
/// Has length trait
|
||||
pub trait HasLen {
|
||||
/// Return length
|
||||
fn len(&self) -> usize;
|
||||
|
||||
/// Returns true iff empty.
|
||||
fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
}
|
||||
|
||||
const HIGHEST_BIT: u64 = 1 << 63;
|
||||
|
||||
/// Maps a `i64` to `u64`
|
||||
///
|
||||
/// For simplicity, tantivy internally handles `i64` as `u64`.
|
||||
/// The mapping is defined by this function.
|
||||
///
|
||||
/// Maps `i64` to `u64` so that
|
||||
/// `-2^63 .. 2^63-1` is mapped
|
||||
/// to
|
||||
/// `0 .. 2^64-1`
|
||||
/// in that order.
|
||||
///
|
||||
/// This is more suited than simply casting (`val as u64`)
|
||||
/// because of bitpacking.
|
||||
///
|
||||
/// Imagine a list of `i64` ranging from -10 to 10.
|
||||
/// When casting negative values, the negative values are projected
|
||||
/// to values over 2^63, and all values end up requiring 64 bits.
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
|
||||
#[inline]
|
||||
pub fn i64_to_u64(val: i64) -> u64 {
|
||||
(val as u64) ^ HIGHEST_BIT
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
||||
#[inline]
|
||||
pub fn u64_to_i64(val: u64) -> i64 {
|
||||
(val ^ HIGHEST_BIT) as i64
|
||||
}
|
||||
|
||||
/// Maps a `f64` to `u64`
|
||||
///
|
||||
/// For simplicity, tantivy internally handles `f64` as `u64`.
|
||||
/// The mapping is defined by this function.
|
||||
///
|
||||
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
|
||||
///
|
||||
/// This is more suited than simply casting (`val as u64`)
|
||||
/// which would truncate the result
|
||||
///
|
||||
/// # Reference
|
||||
///
|
||||
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
|
||||
/// explains the mapping in a clear manner.
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
|
||||
#[inline]
|
||||
pub fn f64_to_u64(val: f64) -> u64 {
|
||||
let bits = val.to_bits();
|
||||
if val.is_sign_positive() {
|
||||
bits ^ HIGHEST_BIT
|
||||
} else {
|
||||
!bits
|
||||
}
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
||||
#[inline]
|
||||
pub fn u64_to_f64(val: u64) -> f64 {
|
||||
f64::from_bits(if val & HIGHEST_BIT != 0 {
|
||||
val ^ HIGHEST_BIT
|
||||
} else {
|
||||
!val
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
|
||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use proptest::prelude::*;
|
||||
use std::f64;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
pub use tantivy_bitpacker::minmax;
|
||||
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
}
|
||||
|
||||
fn test_f64_converter_helper(val: f64) {
|
||||
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
|
||||
}
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
O::default().serialize(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
|
||||
let left_u64 = f64_to_u64(left);
|
||||
let right_u64 = f64_to_u64(right);
|
||||
assert_eq!(left_u64 < right_u64, left < right);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_i64_converter() {
|
||||
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
|
||||
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
|
||||
test_i64_converter_helper(0i64);
|
||||
test_i64_converter_helper(i64::min_value());
|
||||
test_i64_converter_helper(i64::max_value());
|
||||
for i in -1000i64..1000i64 {
|
||||
test_i64_converter_helper(i);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_f64_converter() {
|
||||
test_f64_converter_helper(f64::INFINITY);
|
||||
test_f64_converter_helper(f64::NEG_INFINITY);
|
||||
test_f64_converter_helper(0.0);
|
||||
test_f64_converter_helper(-0.0);
|
||||
test_f64_converter_helper(1.0);
|
||||
test_f64_converter_helper(-1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_f64_order() {
|
||||
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
|
||||
.contains(&f64_to_u64(f64::NAN))); //nan is not a number
|
||||
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa
|
||||
assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
|
||||
assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
|
||||
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
|
||||
assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compute_num_bits() {
|
||||
assert_eq!(compute_num_bits(1), 1u8);
|
||||
assert_eq!(compute_num_bits(0), 0u8);
|
||||
assert_eq!(compute_num_bits(2), 2u8);
|
||||
assert_eq!(compute_num_bits(3), 2u8);
|
||||
assert_eq!(compute_num_bits(4), 3u8);
|
||||
assert_eq!(compute_num_bits(255), 8u8);
|
||||
assert_eq!(compute_num_bits(256), 9u8);
|
||||
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_max_doc() {
|
||||
// this is the first time I write a unit test for a constant.
|
||||
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
|
||||
assert!((super::MAX_DOC_LIMIT as i32) < 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minmax_empty() {
|
||||
let vals: Vec<u32> = vec![];
|
||||
assert_eq!(minmax(vals.into_iter()), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minmax_one() {
|
||||
assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minmax_two() {
|
||||
assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
|
||||
assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,7 @@
|
||||
use crossbeam::channel;
|
||||
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Search executor whether search request are single thread or multithread.
|
||||
///
|
||||
/// We don't expose Rayon thread pool directly here for several reasons.
|
||||
@@ -47,17 +48,24 @@ impl Executor {
|
||||
match self {
|
||||
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
|
||||
Executor::ThreadPool(pool) => {
|
||||
let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
|
||||
let num_fruits = args_with_indices.len();
|
||||
let args: Vec<A> = args.collect();
|
||||
let num_fruits = args.len();
|
||||
let fruit_receiver = {
|
||||
let (fruit_sender, fruit_receiver) = channel::unbounded();
|
||||
let (fruit_sender, fruit_receiver) = crossbeam_channel::unbounded();
|
||||
pool.scope(|scope| {
|
||||
for arg_with_idx in args_with_indices {
|
||||
scope.spawn(|_| {
|
||||
let (idx, arg) = arg_with_idx;
|
||||
let fruit = f(arg);
|
||||
if let Err(err) = fruit_sender.send((idx, fruit)) {
|
||||
error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
|
||||
for (idx, arg) in args.into_iter().enumerate() {
|
||||
// We name references for f and fruit_sender_ref because we do not
|
||||
// want these two to be moved into the closure.
|
||||
let f_ref = &f;
|
||||
let fruit_sender_ref = &fruit_sender;
|
||||
scope.spawn(move |_| {
|
||||
let fruit = f_ref(arg);
|
||||
if let Err(err) = fruit_sender_ref.send((idx, fruit)) {
|
||||
error!(
|
||||
"Failed to send search task. It probably means all search \
|
||||
threads have panicked. {:?}",
|
||||
err
|
||||
);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -67,18 +75,19 @@ impl Executor {
|
||||
// This is important as it makes it possible for the fruit_receiver iteration to
|
||||
// terminate.
|
||||
};
|
||||
// This is lame, but safe.
|
||||
let mut results_with_position = Vec::with_capacity(num_fruits);
|
||||
let mut result_placeholders: Vec<Option<R>> =
|
||||
std::iter::repeat_with(|| None).take(num_fruits).collect();
|
||||
for (pos, fruit_res) in fruit_receiver {
|
||||
let fruit = fruit_res?;
|
||||
results_with_position.push((pos, fruit));
|
||||
result_placeholders[pos] = Some(fruit);
|
||||
}
|
||||
results_with_position.sort_by_key(|(pos, _)| *pos);
|
||||
assert_eq!(results_with_position.len(), num_fruits);
|
||||
Ok(results_with_position
|
||||
.into_iter()
|
||||
.map(|(_, fruit)| fruit)
|
||||
.collect::<Vec<_>>())
|
||||
let results: Vec<R> = result_placeholders.into_iter().flatten().collect();
|
||||
if results.len() != num_fruits {
|
||||
return Err(TantivyError::InternalError(
|
||||
"One of the mapped execution failed.".to_string(),
|
||||
));
|
||||
}
|
||||
Ok(results)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||