Compare commits


113 Commits

Author SHA1 Message Date
Paul Masurel
bd083159ac Revert "Closes #1195 (#1222)"
This reverts commit dde49ac8e2.
2021-12-02 14:37:59 +09:00
Paul Masurel
dde49ac8e2 Closes #1195 (#1222)
Removes the indexed option for facets.
Facets are now always indexed.

Closes #1195
2021-12-02 14:37:19 +09:00
Paul Masurel
c3cc93406d Bugfix: adds missing fdatasync on atomic_write.
In addition this PR:
- removes unnecessary flushes and fsyncs on files.
- replaces all fsync calls with fdatasync. The latter only triggers
a metadata sync if metadata required to read the file
has changed, which is sufficient for our purposes.

Closes #1224
2021-12-02 13:42:44 +09:00
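For readers unfamiliar with the fsync/fdatasync distinction mentioned above, here is a minimal sketch of the atomic-write pattern described (illustrative only, not the actual tantivy code; the helper name and temp-file naming are assumptions). On Linux, Rust's `File::sync_data` maps to fdatasync(2) and `File::sync_all` to fsync(2).

```rust
use std::fs::{rename, File};
use std::io::{self, Write};
use std::path::Path;

/// Sketch: write `data` to `path` atomically by writing a temp file,
/// fdatasync-ing it, then renaming it over the destination.
fn atomic_write(path: &Path, data: &[u8]) -> io::Result<()> {
    let tmp_path = path.with_extension("tmp");
    let mut tmp_file = File::create(&tmp_path)?;
    tmp_file.write_all(data)?;
    // sync_data() issues fdatasync: it flushes the file contents plus any
    // metadata required to read them back, which is all we need here.
    tmp_file.sync_data()?;
    // Atomically replace the destination with the fully synced temp file.
    rename(&tmp_path, path)?;
    Ok(())
}
```

A fully durable version would also sync the parent directory after the rename; that detail is omitted from this sketch.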
Kanji Yomoda
bd0f9211da Remove unused sort for segment meta list (#1218)
* Remove unused sort for segment meta list
* Fix segment meta order dependent test
2021-12-01 11:18:17 +09:00
PSeitz
c503c6e4fa Switch to non-strict schema (#1216)
Fixes #1211
2021-11-29 10:38:59 +09:00
PSeitz
02174d26af Merge pull request #1209 from quickwit-inc/lz4_flex_version
fix lz4_flex version
2021-11-16 14:12:45 +08:00
PSeitz
cf92be3bd6 fix lz4_flex version 2021-11-16 06:03:04 +00:00
Shikhar Bhushan
72cef12db1 Add none compression (#1208) 2021-11-16 10:50:42 +09:00
Paul Masurel
bbc0a2e233 Fixing the build 2021-11-16 09:37:25 +09:00
François Massot
4fd1a6c84b Merge pull request #1207 from quickwit-inc/fix-chat-links
Remove patron link and change gitter links to discord links.
2021-11-15 19:23:21 +01:00
François Massot
c83d99c414 Remove patron link and change gitter links to discord links. 2021-11-15 19:17:35 +01:00
Paul Masurel
eacf510175 Exchange gitter link for discord 2021-11-15 16:44:13 +09:00
Paul Masurel
8802d125f8 Prepare commit is public again (#1202)
- Simplified some of the prepare commit & segment updater code using
async.
- Made PrepareCommit public again.
2021-11-12 23:25:39 +09:00
dependabot[bot]
33301a3eb4 Update fail requirement from 0.4 to 0.5 (#1197)
Updates the requirements on [fail](https://github.com/tikv/fail-rs) to permit the latest version.
- [Release notes](https://github.com/tikv/fail-rs/releases)
- [Changelog](https://github.com/tikv/fail-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/tikv/fail-rs/compare/v0.4.0...v0.5.0)

---
updated-dependencies:
- dependency-name: fail
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2021-11-12 23:21:16 +09:00
Paul Masurel
7234bef0eb Issue/1198 (#1201)
* Unit test reproducing #1198
* Fixing unit test to handle the error from add_document.
* Bump project version
2021-11-11 16:42:19 +09:00
azerowall
fcff91559b Fix the deserialization error of FieldEntry when the 'options' field appears before the 'type' field (#1199)
Co-authored-by: quel <azerowall>
2021-11-10 18:39:58 +09:00
Paul Masurel
b75d4e59d1 Remove the broken panic on drop unit test. (#1200) 2021-11-10 18:39:37 +09:00
Paul Masurel
c6b5ab1dbe Replacing the panic check in the RAM Directory on lack of flush. 2021-11-09 11:04:31 +09:00
PSeitz
c12e07f0ce Merge pull request #1196 from quickwit-inc/dependabot/cargo/measure_time-0.8.0
Update measure_time requirement from 0.7.0 to 0.8.0
2021-11-05 08:47:51 +08:00
dependabot[bot]
8b877a4c26 Update measure_time requirement from 0.7.0 to 0.8.0
Updates the requirements on [measure_time](https://github.com/PSeitz/rust_measure_time) to permit the latest version.
- [Release notes](https://github.com/PSeitz/rust_measure_time/releases)
- [Commits](https://github.com/PSeitz/rust_measure_time/commits)

---
updated-dependencies:
- dependency-name: measure_time
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2021-11-04 20:27:16 +00:00
PSeitz
7dc0dc1c9b extend proptests with adding case (#1191)
This extends the proptest to cover a case where up to 100 documents are added to an index.
2021-11-01 09:27:10 +09:00
François Massot
0462754673 Optimize block wand for one and several TermScorer. (#1190)
* Added optimisation using block wand for single TermScorer.

A proptest was also added.

* Fix the block wand algorithm by taking the last doc id of the scorers up to the pivot scorer (included).
* In block wand, when the block max score is lower than the threshold, advance the scorer with the best score.
* Fix wrong condition in block_wand_single_scorer and add debug_assert to have an equality check on doc to break the loop.
2021-11-01 09:18:05 +09:00
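For context, the core pruning decision in block-max WAND, sketched very loosely (the name and signature below are made up for illustration; this is not tantivy's implementation): a block can only contain a competitive document if the block-max scores of the scorers up to and including the pivot can sum above the current threshold, otherwise the algorithm skips ahead.

```rust
/// Illustrative block-max WAND check: `block_max_scores` holds each scorer's
/// maximum score within its current block, with scorers ordered by their
/// current doc id, and `pivot` is the pivot scorer's index. If the summed
/// block maxima cannot beat the threshold, the block is skipped by advancing
/// one of the scorers past it.
fn block_can_beat_threshold(block_max_scores: &[f32], pivot: usize, threshold: f32) -> bool {
    let upper_bound: f32 = block_max_scores[..=pivot].iter().sum();
    upper_bound > threshold
}
```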
PSeitz
5916ceda73 Merge pull request #1188 from PSeitz/sort_issue
fix incorrect padding in bitset for multiple of 64
2021-10-29 17:06:38 +08:00
Pascal Seitz
70283dc6c8 fix incorrect padding in bitset for multiple of 64 2021-10-29 16:49:22 +08:00
PSeitz
dbaf4f3623 Merge pull request #1187 from PSeitz/sort_issue
check searcher num docs in proptest
2021-10-29 16:19:24 +08:00
Pascal Seitz
4808648322 check searcher num docs in proptest 2021-10-29 14:38:30 +08:00
Paul Masurel
54afb9b34a Made PrepareCommit private 2021-10-29 14:13:14 +09:00
Paul Masurel
d336c8b938 Fixed logo 2021-10-27 08:54:16 +09:00
Paul Masurel
980d1b2796 Removing Patreon link 2021-10-27 08:53:45 +09:00
Dan Cecile
6317982876 Make indexer::prepared_commit public (#1184)
* Make indexer::prepared_commit public

* Add PreparedCommit to lib
2021-10-26 12:21:24 +09:00
PSeitz
e2fbbc08ca Merge pull request #1182 from PSeitz/remove_directory_generic
use Box<dyn Directory> as parameter to open/create an Index
2021-10-25 12:49:55 +08:00
Pascal Seitz
99cd25beae use <T: Into<Box<dyn Directory>>> as parameter to open/create an Index
This is done in order to support Box<dyn Directory> in addition to generic implementations of the trait Directory.
Remove boxing in ManagedDirectory.
2021-10-25 12:34:40 +08:00
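A small sketch of why an `Into<Box<dyn Directory>>` parameter accepts both an already-boxed directory and a concrete `Directory` implementation (trait, struct, and function names below are stand-ins, not tantivy's actual API):

```rust
trait Directory {
    fn describe(&self) -> String;
}

struct RamDirectory;

impl Directory for RamDirectory {
    fn describe(&self) -> String {
        "in-memory directory".to_string()
    }
}

// A conversion into the boxed trait object: with such an impl in place,
// the concrete type satisfies `Into<Box<dyn Directory>>` directly.
impl From<RamDirectory> for Box<dyn Directory> {
    fn from(dir: RamDirectory) -> Self {
        Box::new(dir)
    }
}

// Accepts an already-boxed `Box<dyn Directory>` (via the reflexive
// `impl<T> From<T> for T`) as well as anything convertible into one.
fn open_index<T: Into<Box<dyn Directory>>>(directory: T) -> Box<dyn Directory> {
    directory.into()
}

fn main() {
    let _from_concrete = open_index(RamDirectory);
    let boxed: Box<dyn Directory> = Box::new(RamDirectory);
    let _from_boxed = open_index(boxed);
}
```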
Kanji Yomoda
737ecc7015 Fix outdated comment for IndexWriter::new (#1183) 2021-10-25 10:59:18 +09:00
Kanji Yomoda
09668459c8 Update codecov-action to v2 and make it possible to keep it up-to-date with dependabot (#1181)
* Update codecov-action to v2

* Add github-actions to dependabot
2021-10-25 10:58:16 +09:00
Evance Soumaoro
e5fd30f438 Fixed links (#1177) 2021-10-25 10:56:04 +09:00
Tom Parker-Shemilt
c412a46105 Remove travis config (#1180) 2021-10-24 15:40:43 +09:00
PSeitz
3a78402496 update links (#1176) 2021-10-18 20:45:40 +09:00
Paul Masurel
d18ac136c0 Search simplified (#1175) 2021-10-18 12:52:43 +09:00
Paul Masurel
b5b1244857 More functionality in the ownedbytes crate (#1172) 2021-10-07 18:14:49 +09:00
Paul Masurel
27acfa4dea Removing dead file (#1170) 2021-10-07 14:15:21 +09:00
Paul Masurel
02cffa4dea Code simplification. (#1169)
Code simplification and Clippy
2021-10-07 14:11:44 +09:00
Paul Masurel
b52abbc771 Bugfix transposition_cost_one in FuzzyQuery (#1167) 2021-10-07 09:38:39 +09:00
Paul Masurel
894c61867f Fix test compilation (#1168) 2021-10-06 17:50:10 +09:00
PSeitz
352e0cc58d Add demux operation (#1150)
* add merge for DeleteBitSet, allow custom DeleteBitSet on merge
* forward delete bitsets on merge, add tests
* add demux operation and tests
2021-10-06 16:05:16 +09:00
Paul Masurel
ffe4446d90 Minor lint comments (#1166) 2021-10-06 11:27:48 +09:00
dependabot[bot]
4d05b26e7a Update lru requirement from 0.6.5 to 0.7.0 (#1165)
Updates the requirements on [lru](https://github.com/jeromefroe/lru-rs) to permit the latest version.
- [Release notes](https://github.com/jeromefroe/lru-rs/releases)
- [Changelog](https://github.com/jeromefroe/lru-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/jeromefroe/lru-rs/compare/0.6.5...0.7.0)

---
updated-dependencies:
- dependency-name: lru
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2021-10-06 05:50:24 +09:00
Paul Masurel
0855649986 Leaning more on the alive (vs delete) semantics. (#1164) 2021-10-05 18:53:29 +09:00
PSeitz
d828e58903 Merge pull request #1163 from PSeitz/reduce_mem_usage
reduce mem usage
2021-10-01 08:03:41 +02:00
Pascal Seitz
aa0396fe27 fix variable names 2021-10-01 13:48:51 +08:00
Pascal Seitz
8d8315f8d0 prealloc vec in postinglist 2021-09-29 09:02:38 +08:00
Pascal Seitz
078c0a2e2e reserve vec 2021-09-29 08:45:04 +08:00
Pascal Seitz
f21e8dd875 use only segment ordinal in docidmapping 2021-09-29 08:44:56 +08:00
Tomoko Uchida
74e36c7e97 Add unit tests for tokenizers and filters (#1156)
* add unit test for SimpleTokenizer
* add unit tests for tokenizers and filters.
2021-09-27 10:22:01 +09:00
PSeitz
f27ae04282 fix slope calculation in multilinear interpol (#1161)
add test to check for compression
2021-09-27 10:14:03 +09:00
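For readers unfamiliar with the codec behind this fix, the rough idea of a linear-interpolation fast field is to predict each value from a line fitted between the first and last values and to store only the offsets from that prediction; the slope is the quantity this commit fixes. A toy sketch (illustrative only, not tantivy's implementation):

```rust
/// Toy linear-interpolation encoding (assumes a non-empty slice): predict
/// value i on the line from first to last and keep only the offsets from
/// that prediction. Small offsets compress well when bitpacked.
fn linear_interpolation_offsets(values: &[i64]) -> (f64, Vec<i64>) {
    let first = values[0] as f64;
    let last = *values.last().unwrap() as f64;
    // Slope of the interpolation line; an incorrect slope skews every prediction.
    let slope = if values.len() > 1 {
        (last - first) / (values.len() as f64 - 1.0)
    } else {
        0.0
    };
    let offsets = values
        .iter()
        .enumerate()
        .map(|(i, &val)| {
            let predicted = first + slope * i as f64;
            val - predicted.round() as i64
        })
        .collect();
    (slope, offsets)
}
```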
PSeitz
0ce49c9dd4 use lz4_flex 0.9.0 (#1160) 2021-09-27 10:12:20 +09:00
PSeitz
fe8e58e078 Merge pull request #1154 from PSeitz/delete_bitset
add DeleteBitSet iterator
2021-09-24 09:37:39 +02:00
Pascal Seitz
efc0d8341b fix comment 2021-09-24 15:09:21 +08:00
Pascal Seitz
22bcc83d10 fix padding in initialization 2021-09-24 14:43:04 +08:00
Pascal Seitz
5ee5037934 create and use ReadSerializedBitSet 2021-09-24 12:53:33 +08:00
Pascal Seitz
c217bfed1e cargo fmt 2021-09-23 21:02:19 +08:00
Pascal Seitz
c27ccd3e24 improve naming 2021-09-23 21:02:09 +08:00
Paul Masurel
367f5da782 Fixed comment to the index accessor 2021-09-23 21:53:48 +09:00
Mestery
b256df6599 add index accessor for index writer (#1159)
* add index accessor for index writer

* Update src/indexer/index_writer.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>
2021-09-23 21:49:20 +09:00
Pascal Seitz
d7a6a409a1 renames 2021-09-23 20:33:11 +08:00
Pascal Seitz
a1f5cead96 AliveBitSet instead of DeleteBitSet 2021-09-23 20:03:57 +08:00
dependabot[bot]
37c5fe3c86 Update memmap2 requirement from 0.4 to 0.5 (#1157)
Updates the requirements on [memmap2](https://github.com/RazrFalcon/memmap2-rs) to permit the latest version.
- [Release notes](https://github.com/RazrFalcon/memmap2-rs/releases)
- [Changelog](https://github.com/RazrFalcon/memmap2-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/RazrFalcon/memmap2-rs/compare/v0.4.0...v0.5.0)

---
updated-dependencies:
- dependency-name: memmap2
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2021-09-23 20:18:27 +09:00
Pascal Seitz
4583fa270b fixes 2021-09-23 10:39:53 +08:00
Pascal Seitz
beb3a5bd73 fix len 2021-09-18 17:58:15 +08:00
Pascal Seitz
93cbd52bf0 move code to bitset, add inline, add benchmark 2021-09-18 17:35:22 +08:00
Pascal Seitz
c22177a005 add iterator 2021-09-17 15:29:27 +08:00
Pascal Seitz
4da71273e1 add de/serialization for bitset
remove len footgun
2021-09-17 10:28:12 +08:00
dependabot[bot]
2c78b31aab Update memmap2 requirement from 0.3 to 0.4 (#1155)
Updates the requirements on [memmap2](https://github.com/RazrFalcon/memmap2-rs) to permit the latest version.
- [Release notes](https://github.com/RazrFalcon/memmap2-rs/releases)
- [Changelog](https://github.com/RazrFalcon/memmap2-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/RazrFalcon/memmap2-rs/compare/v.0.3.0...v0.4.0)
2021-09-17 08:52:52 +09:00
Pascal Seitz
4ae1d87632 add DeleteBitSet iterator 2021-09-15 23:10:04 +08:00
Paul Masurel
46b86a7976 Bounced version and edited changelog 2021-09-10 23:05:09 +09:00
PSeitz
3bc177e69d fix #1151 (#1152)
* fix #1151

Fixes an off-by-one error in the stats of the index fast field used by the multivalued fast field.
When retrieving the data range for a docid, `get(docid)..get(docid + 1)` is requested. On creation,
the num_vals statistic was set to docid instead of docid + 1. In the multivaluelinearinterpol fast
field, the last value was therefore not serialized (and would return 0 instead in most cases).
So the last document's range get(lastdoc)..get(lastdoc + 1) would resolve to the invalid range `value..0`.

This PR adds a proptest to cover this scenario: it requires a combination of a large number of values
(multilinear interpolation is only active for more than 5_000 values) and a merge.
2021-09-10 23:00:37 +09:00
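To make the off-by-one described above concrete: a multivalued fast field stores a flat values array plus an index fast field, and the values of a document live in the range `get(docid)..get(docid + 1)`, so the index needs one more entry than there are documents. A toy sketch of the lookup (illustrative, not tantivy's API):

```rust
/// Toy multivalued fast field: `index[doc]..index[doc + 1]` addresses the
/// slice of `values` belonging to `doc`, so `index` must hold one entry more
/// than the number of documents (the final boundary the bug dropped).
struct MultiValuedFastField {
    index: Vec<u64>,
    values: Vec<u64>,
}

impl MultiValuedFastField {
    fn get_vals(&self, doc: u32) -> &[u64] {
        let start = self.index[doc as usize] as usize;
        let end = self.index[doc as usize + 1] as usize;
        &self.values[start..end]
    }
}
```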
PSeitz
319609e9c1 test cargo-llvm-cov (#1149) 2021-09-03 22:00:43 +09:00
Kanji Yomoda
9d87b89718 Fix incorrect comment for Index::create_in_dir (#1148)
* Fix incorrect comment for Index::create_in_dir
2021-09-03 10:37:16 +09:00
Tomoko Uchida
dd81e38e53 Add WhitespaceTokenizer (#1147)
* Add WhitespaceTokenizer.
2021-08-29 18:20:49 +09:00
Paul Masurel
9f32b22602 Preparing for release. 2021-08-26 09:07:08 +09:00
sigaloid
096ce7488e Resolve some clippys, format (#1144)
* cargo +nightly clippy --fix -Z unstable-options
2021-08-26 08:46:00 +09:00
PSeitz
a1782dd172 Update index_sorting.md 2021-08-25 07:55:50 +01:00
PSeitz
000d76b11a Update index_sorting.md 2021-08-24 19:28:06 +01:00
PSeitz
abd29f6646 Update index_sorting.md 2021-08-24 19:26:19 +01:00
PSeitz
b4ecf0ab2f Merge pull request #1146 from tantivy-search/sorting_doc
add sorting to book
2021-08-23 17:37:54 +01:00
Pascal Seitz
798f7dbf67 add sorting to book 2021-08-23 17:36:41 +01:00
PSeitz
06a2e47c8d Merge pull request #1145 from tantivy-search/blub2
cargo fmt
2021-08-21 18:52:50 +01:00
Pascal Seitz
e0b83eb291 cargo fmt 2021-08-21 18:52:10 +01:00
PSeitz
13401f46ea add wildcard mention 2021-08-21 18:10:33 +01:00
PSeitz
1a45b030dc Merge pull request #1141 from tantivy-search/tantivy_common
dissolve common module
2021-08-20 08:03:37 +01:00
Pascal Seitz
62052bcc2d add missing test function
closes #1139
2021-08-20 07:26:22 +01:00
Pascal Seitz
3265f7bec3 dissolve common module 2021-08-19 23:26:34 +01:00
Pascal Seitz
ee0881712a move bitset to common crate, move composite file to directory 2021-08-19 17:45:09 +01:00
PSeitz
483e0336b6 Merge pull request #1140 from tantivy-search/tantivy_common
rename common to tantivy-common
2021-08-19 13:02:54 +01:00
Pascal Seitz
3e8f267e33 rename common to tantivy-common 2021-08-19 10:27:20 +01:00
Paul Masurel
3b247fd968 Version bump 2021-08-19 10:12:30 +09:00
Paul Masurel
750f6e6479 Removed obsolete unit test (#1138) 2021-08-19 10:07:49 +09:00
Evance Soumaoro
5b475e6603 Checksum validation using active files (#1130)
* validate checksum now uses segment files, not managed files
2021-08-19 10:03:20 +09:00
PSeitz
0ca7f73dc5 add docs badge, fix build badge 2021-08-13 19:40:33 +01:00
PSeitz
47ed18845e Merge pull request #1136 from tantivy-search/minor_fixes
more docs detail
2021-08-13 18:11:47 +01:00
Pascal Seitz
dc141cdb29 more docs detail
remove code duplicate
2021-08-13 17:40:13 +01:00
PSeitz
f6cf6e889b Merge pull request #1133 from tantivy-search/merge_overflow
test doc_freq and term_freq in sorted index
2021-08-05 07:53:46 +01:00
Pascal Seitz
f379a80233 test doc_freq and term_freq in sorted index 2021-08-03 11:38:05 +01:00
PSeitz
4a320fd1ff fix delta position in merge and index sorting (#1132)
fixes #1125
2021-08-03 18:06:36 +09:00
PSeitz
85d23e8e3b Merge pull request #1129 from tantivy-search/merge_overflow
add long running test in ci
2021-08-02 15:54:31 +01:00
Pascal Seitz
022ab9d298 don't run as pr 2021-08-02 15:44:00 +01:00
Pascal Seitz
605e8603dc add positions to long running test 2021-08-02 15:29:49 +01:00
Pascal Seitz
70f160b329 add long running test in ci 2021-08-02 11:35:39 +01:00
PSeitz
6d265e6bed fix gh action name 2021-08-02 10:38:01 +01:00
PSeitz
fdc512391b Merge pull request #1128 from tantivy-search/merge_overflow
add sort to functional test, add env for iterations
2021-08-02 10:29:16 +01:00
Pascal Seitz
108714c934 add sort to functional test, add env for iterations 2021-08-02 10:11:17 +01:00
Paul Masurel
44e8cf98a5 Cargo fmt 2021-07-30 15:30:01 +09:00
Paul Masurel
f0ee69d9e9 Remove the complicated block search logic for a simpler branchless binary search (#1124)

The code is simpler and faster.

Before
test postings::bench::bench_segment_intersection                                                                         ... bench:   2,093,697 ns/iter (+/- 115,509)
test postings::bench::bench_skip_next_p01                                                                                ... bench:      58,585 ns/iter (+/- 796)
test postings::bench::bench_skip_next_p1                                                                                 ... bench:     160,872 ns/iter (+/- 5,164)
test postings::bench::bench_skip_next_p10                                                                                ... bench:     615,229 ns/iter (+/- 25,108)
test postings::bench::bench_skip_next_p90                                                                                ... bench:   1,120,509 ns/iter (+/- 22,271)

After
test postings::bench::bench_segment_intersection                                                                         ... bench:   1,747,726 ns/iter (+/- 52,867)
test postings::bench::bench_skip_next_p01                                                                                ... bench:      55,205 ns/iter (+/- 714)
test postings::bench::bench_skip_next_p1                                                                                 ... bench:     131,433 ns/iter (+/- 2,814)
test postings::bench::bench_skip_next_p10                                                                                ... bench:     478,830 ns/iter (+/- 12,794)
test postings::bench::bench_skip_next_p90                                                                                ... bench:     931,082 ns/iter (+/- 31,468)
2021-07-30 14:38:42 +09:00
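The commit body does not show the new code, but a typical branchless binary search keeps a loop that halves the remaining size and advances the base with an arithmetic step instead of a branch, so it compiles down to conditional moves. A generic sketch of the technique (not necessarily the exact code merged in #1124):

```rust
/// Branchless lower-bound search: index of the first element >= target.
fn branchless_lower_bound(sorted: &[u32], target: u32) -> usize {
    let mut base = 0usize;
    let mut size = sorted.len();
    while size > 1 {
        let half = size / 2;
        let mid = base + half;
        // Advance past the left half iff the probed element is < target.
        // The comparison is folded into arithmetic, avoiding a branch.
        base += (sorted[mid] < target) as usize * half;
        size -= half;
    }
    // Fix-up for the last remaining candidate.
    base + (base < sorted.len() && sorted[base] < target) as usize
}
```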
Evance Soumaoro
b8a10c8406 switched to memmap2-rs (#1120) 2021-07-27 18:40:41 +09:00
163 changed files with 5087 additions and 3298 deletions

View File

@@ -6,3 +6,10 @@ updates:
interval: daily
time: "20:00"
open-pull-requests-limit: 10
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: daily
time: "20:00"
open-pull-requests-limit: 10

View File

@@ -1,27 +1,25 @@
name: coverage
name: Coverage
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
test:
name: coverage
runs-on: ubuntu-latest
container:
image: xd009642/tarpaulin:develop-nightly
options: --security-opt seccomp=unconfined
coverage:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Generate code coverage
run: |
cargo +nightly tarpaulin --verbose --all-features --workspace --timeout 120 --out Xml
- name: Upload to codecov.io
uses: codecov/codecov-action@v1
- uses: actions/checkout@v2
- name: Install Rust
run: rustup toolchain install nightly --component llvm-tools-preview
- name: Install cargo-llvm-cov
run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
- name: Generate code coverage
run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v2
with:
# token: ${{secrets.CODECOV_TOKEN}} # not required for public repos
fail_ci_if_error: true
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
files: lcov.info
fail_ci_if_error: true

.github/workflows/long_running.yml vendored Normal file
View File

@@ -0,0 +1,24 @@
name: Rust
on:
push:
branches: [ main ]
env:
CARGO_TERM_COLOR: always
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000
jobs:
functional_test_unsorted:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run indexing_unsorted
run: cargo test indexing_unsorted -- --ignored
functional_test_sorted:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Run indexing_sorted
run: cargo test indexing_sorted -- --ignored

View File

@@ -10,7 +10,7 @@ env:
CARGO_TERM_COLOR: always
jobs:
build:
test:
runs-on: ubuntu-latest

.gitignore vendored
View File

@@ -1,4 +1,5 @@
tantivy.iml
.cargo
proptest-regressions
*.swp
target

View File

@@ -1,92 +0,0 @@
# Based on the "trust" template v0.1.2
# https://github.com/japaric/trust/tree/v0.1.2
dist: trusty
language: rust
services: docker
sudo: required
env:
global:
- CRATE_NAME=tantivy
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
# - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
addons:
apt:
sources:
- ubuntu-toolchain-r-test
- kalakris-cmake
packages:
- gcc-4.8
- g++-4.8
- libcurl4-openssl-dev
- libelf-dev
- libdw-dev
- binutils-dev
- cmake
matrix:
include:
# Android
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
#- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
#- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
#- env: TARGET=i686-linux-android DISABLE_TESTS=1
#- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
# Linux
#- env: TARGET=aarch64-unknown-linux-gnu
#- env: TARGET=i686-unknown-linux-gnu
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 #UPLOAD_DOCS=1
# - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
# OSX
#- env: TARGET=x86_64-apple-darwin
# os: osx
before_install:
- set -e
- rustup self update
- rustup component add rustfmt
install:
- sh ci/install.sh
- source ~/.cargo/env || true
- env | grep "TRAVIS"
before_script:
- export PATH=$HOME/.cargo/bin:$PATH
- cargo install cargo-update || echo "cargo-update already installed"
- cargo install cargo-travis || echo "cargo-travis already installed"
script:
- bash ci/script.sh
- cargo fmt --all -- --check
before_deploy:
- sh ci/before_deploy.sh
after_success:
# Needs GH_TOKEN env var to be set in travis settings
- if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
- if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
#cache: cargo
#before_cache:
# # Travis can't cache files that are not readable by "others"
# - chmod -R a+r $HOME/.cargo
# - find ./target/debug -type f -maxdepth 1 -delete
# - rm -f ./target/.rustc_info.json
# - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
# - rm -r target/debug/examples/
# - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
#branches:
# only:
# # release tags
# - /^v\d+\.\d+\.\d+.*$/
# - master
notifications:
email:
on_success: never

View File

@@ -1,3 +1,21 @@
Tantivy 0.17
================================
- Change to non-strict schema. Ignore fields in data which are not defined in schema. Previously this returned an error. #1211
Tantivy 0.16.2
================================
- Bugfix in FuzzyTermQuery. (transposition_cost_one was not doing anything)
Tantivy 0.16.1
========================
- Major Bugfix on multivalued fastfield. #1151
- Demux operation (@PSeitz)
Tantivy 0.16.0
=========================
- Bugfix in the filesum check. (@evanxg852000) #1127
- Bugfix in positions when the index is sorted by a field. (@appaquet) #1125
Tantivy 0.15.3
=========================
- Major bugfix. Deleting documents was broken when the index was sorted by a field. (@appaquet, @fulmicoton) #1101
@@ -104,7 +122,7 @@ Tantivy 0.12.0
## How to update?
Crates relying on custom tokenizer, or registering tokenizer in the manager will require some
minor changes. Check https://github.com/tantivy-search/tantivy/blob/main/examples/custom_tokenizer.rs
minor changes. Check https://github.com/quickwit-inc/tantivy/blob/main/examples/custom_tokenizer.rs
to check for some code sample.
Tantivy 0.11.3

View File

@@ -1,13 +1,13 @@
[package]
name = "tantivy"
version = "0.16.0-dev"
version = "0.17.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Search engine library"""
documentation = "https://docs.rs/tantivy/"
homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/tantivy-search/tantivy"
homepage = "https://github.com/quickwit-inc/tantivy"
repository = "https://github.com/quickwit-inc/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2018"
@@ -19,8 +19,8 @@ crc32fast = "1.2.1"
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
tantivy-fst = "0.3"
memmap2 = {version = "0.3", optional=true}
lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
memmap2 = {version = "0.5", optional=true}
lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3", optional = true }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.2", optional = true }
@@ -31,13 +31,13 @@ num_cpus = "1.13"
fs2={ version = "0.4.3", optional = true }
levenshtein_automata = "0.2"
uuid = { version = "0.8.2", features = ["v4", "serde"] }
crossbeam = "0.8"
crossbeam = "0.8.1"
futures = { version = "0.3.15", features = ["thread-pool"] }
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
common = { version="0.1", path="./common" }
common = { version = "0.1", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
ownedbytes = { version="0.1", path="./ownedbytes" }
ownedbytes = { version="0.2", path="./ownedbytes" }
stable_deref_trait = "1.2"
rust-stemmers = "1.2"
downcast-rs = "1.2"
@@ -46,15 +46,15 @@ census = "0.4"
fnv = "1.0.7"
thiserror = "1.0.24"
htmlescape = "0.3.1"
fail = "0.4"
fail = "0.5"
murmurhash32 = "0.2"
chrono = "0.4.19"
smallvec = "1.6.1"
rayon = "1.5"
lru = "0.6.5"
lru = "0.7.0"
fastdivide = "0.3"
itertools = "0.10.0"
measure_time = "0.7.0"
measure_time = "0.8.0"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
@@ -64,10 +64,12 @@ rand = "0.8.3"
maplit = "1.0.2"
matches = "0.1.8"
proptest = "1.0"
criterion = "0.3.4"
criterion = "0.3.5"
test-env-log = "0.2.7"
env_logger = "0.9.0"
[dev-dependencies.fail]
version = "0.4"
version = "0.5"
features = ["failpoints"]
[profile.release]

View File

@@ -1,9 +1,9 @@
[![Build Status](https://travis-ci.org/tantivy-search/tantivy.svg?branch=main)](https://travis-ci.org/tantivy-search/tantivy)
[![codecov](https://codecov.io/gh/tantivy-search/tantivy/branch/main/graph/badge.svg)](https://codecov.io/gh/tantivy-search/tantivy)
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![Docs](https://docs.rs/tantivy/badge.svg)](https://docs.rs/crate/tantivy/)
[![Build Status](https://github.com/quickwit-inc/tantivy/actions/workflows/test.yml/badge.svg)](https://github.com/quickwit-inc/tantivy/actions/workflows/test.yml)
[![codecov](https://codecov.io/gh/quickwit-inc/tantivy/branch/main/graph/badge.svg)](https://codecov.io/gh/quickwit-inc/tantivy)
[![Join the chat at https://discord.gg/MT27AG5EVE](https://shields.io/discord/908281611840282624?label=chat%20on%20discord)](https://discord.gg/MT27AG5EVE)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/main?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/main)
[![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)
@@ -17,9 +17,6 @@
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/6)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/7)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
[![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
**Tantivy** is a **full text search engine library** written in Rust.
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
@@ -78,13 +75,12 @@ It walks you through getting a wikipedia search engine up and running in a few m
There are many ways to support this project.
- Use Tantivy and tell us about your experience on [Gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
- Use Tantivy and tell us about your experience on [Discord](https://discord.gg/MT27AG5EVE) or by email (paul.masurel@gmail.com)
- Report bugs
- Write a blog post
- Help with documentation by asking questions or submitting PRs
- Contribute code (you can join [our Gitter](https://gitter.im/tantivy-search/tantivy))
- Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE))
- Talk about Tantivy around you
- [![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
# Contributing code
@@ -96,7 +92,7 @@ Tantivy compiles on stable Rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run:
```bash
git clone https://github.com/tantivy-search/tantivy.git
git clone https://github.com/quickwit-inc/tantivy.git
cd tantivy
cargo build
```

View File

@@ -1,12 +1,12 @@
[package]
name = "tantivy-bitpacker"
version = "0.1.0"
version = "0.1.1"
edition = "2018"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = []
description = """Tantivy-sub crate: bitpacking"""
repository = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/quickwit-inc/tantivy"
keywords = []

View File

@@ -50,3 +50,32 @@ where
}
None
}
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
assert_eq!(compute_num_bits(0), 0u8);
assert_eq!(compute_num_bits(2), 2u8);
assert_eq!(compute_num_bits(3), 2u8);
assert_eq!(compute_num_bits(4), 3u8);
assert_eq!(compute_num_bits(255), 8u8);
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
#[test]
fn test_minmax_empty() {
let vals: Vec<u32> = vec![];
assert_eq!(minmax(vals.into_iter()), None);
}
#[test]
fn test_minmax_one() {
assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
}
#[test]
fn test_minmax_two() {
assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
}

View File

@@ -1,5 +1,5 @@
[package]
name = "common"
name = "tantivy-common"
version = "0.1.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
@@ -10,3 +10,8 @@ description = "common traits and utility functions used by multiple tantivy subc
[dependencies]
byteorder = "1.4.3"
ownedbytes = { version="0.2", path="../ownedbytes" }
[dev-dependencies]
proptest = "1.0.0"
rand = "0.8.4"

common/src/bitset.rs Normal file
View File

@@ -0,0 +1,744 @@
use ownedbytes::OwnedBytes;
use std::convert::TryInto;
use std::io::Write;
use std::u64;
use std::{fmt, io};
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct TinySet(u64);
impl fmt::Debug for TinySet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.into_iter().collect::<Vec<u32>>().fmt(f)
}
}
pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator {
type Item = u32;
#[inline]
fn next(&mut self) -> Option<Self::Item> {
self.0.pop_lowest()
}
}
impl IntoIterator for TinySet {
type Item = u32;
type IntoIter = TinySetIterator;
fn into_iter(self) -> Self::IntoIter {
TinySetIterator(self)
}
}
impl TinySet {
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
writer.write_all(self.0.to_le_bytes().as_ref())
}
pub fn into_bytes(self) -> [u8; 8] {
self.0.to_le_bytes()
}
#[inline]
pub fn deserialize(data: [u8; 8]) -> Self {
let val: u64 = u64::from_le_bytes(data);
TinySet(val)
}
/// Returns an empty `TinySet`.
#[inline]
pub fn empty() -> TinySet {
TinySet(0u64)
}
/// Returns a full `TinySet`.
#[inline]
pub fn full() -> TinySet {
TinySet::empty().complement()
}
pub fn clear(&mut self) {
self.0 = 0u64;
}
#[inline]
/// Returns the complement of the set in `[0, 64[`.
///
/// Careful on making this function public, as it will break the padding handling in the last
/// bucket.
fn complement(self) -> TinySet {
TinySet(!self.0)
}
#[inline]
/// Returns true iff the `TinySet` contains the element `el`.
pub fn contains(self, el: u32) -> bool {
!self.intersect(TinySet::singleton(el)).is_empty()
}
#[inline]
/// Returns the number of elements in the TinySet.
pub fn len(self) -> u32 {
self.0.count_ones()
}
#[inline]
/// Returns the intersection of `self` and `other`
pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0)
}
/// Creates a new `TinySet` containing only one element
/// within `[0; 64[`
#[inline]
pub fn singleton(el: u32) -> TinySet {
TinySet(1u64 << u64::from(el))
}
/// Insert a new element within [0..64)
#[inline]
pub fn insert(self, el: u32) -> TinySet {
self.union(TinySet::singleton(el))
}
/// Removes an element within [0..64)
#[inline]
pub fn remove(self, el: u32) -> TinySet {
self.intersect(TinySet::singleton(el).complement())
}
/// Insert a new element within [0..64)
///
/// returns true if the set changed
#[inline]
pub fn insert_mut(&mut self, el: u32) -> bool {
let old = *self;
*self = old.insert(el);
old != *self
}
/// Remove an element within [0..64)
///
/// returns true if the set changed
#[inline]
pub fn remove_mut(&mut self, el: u32) -> bool {
let old = *self;
*self = old.remove(el);
old != *self
}
/// Returns the union of two tinysets
#[inline]
pub fn union(self, other: TinySet) -> TinySet {
TinySet(self.0 | other.0)
}
/// Returns true iff the `TinySet` is empty.
#[inline]
pub fn is_empty(self) -> bool {
self.0 == 0u64
}
/// Returns the lowest element in the `TinySet`
/// and removes it.
#[inline]
pub fn pop_lowest(&mut self) -> Option<u32> {
if self.is_empty() {
None
} else {
let lowest = self.0.trailing_zeros() as u32;
self.0 ^= TinySet::singleton(lowest).0;
Some(lowest)
}
}
/// Returns a `TinySet` that contains all values up
/// to limit excluded.
///
/// The limit is assumed to be strictly lower than 64.
pub fn range_lower(upper_bound: u32) -> TinySet {
TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
}
/// Returns a `TinySet` that contains all values greater
/// or equal to the given limit, included. (and up to 63)
///
/// The limit is assumed to be strictly lower than 64.
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
TinySet::range_lower(from_included).complement()
}
}
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
len: u64,
max_value: u32,
}
fn num_buckets(max_val: u32) -> u32 {
(max_val + 63u32) / 64u32
}
impl BitSet {
/// serialize a `BitSet`.
///
pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
writer.write_all(self.max_value.to_le_bytes().as_ref())?;
for tinyset in self.tinysets.iter().cloned() {
writer.write_all(&tinyset.into_bytes())?;
}
writer.flush()?;
Ok(())
}
/// Create a new `BitSet` that may contain elements
/// within `[0, max_val)`.
pub fn with_max_value(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let tinybitsets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
BitSet {
tinysets: tinybitsets,
len: 0,
max_value,
}
}
/// Create a new `BitSet` that may contain elements within `[0, max_val)`.
/// Initially all values will be set.
pub fn with_max_value_and_full(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let mut tinybitsets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
// Fix padding
let lower = max_value % 64u32;
if lower != 0 {
tinybitsets[tinybitsets.len() - 1] = TinySet::range_lower(lower);
}
BitSet {
tinysets: tinybitsets,
len: max_value as u64,
max_value,
}
}
/// Removes all elements from the `BitSet`.
pub fn clear(&mut self) {
for tinyset in self.tinysets.iter_mut() {
*tinyset = TinySet::empty();
}
}
/// Intersect with serialized bitset
pub fn intersect_update(&mut self, other: &ReadOnlyBitSet) {
self.intersect_update_with_iter(other.iter_tinysets());
}
/// Intersect with tinysets
fn intersect_update_with_iter(&mut self, other: impl Iterator<Item = TinySet>) {
self.len = 0;
for (left, right) in self.tinysets.iter_mut().zip(other) {
*left = left.intersect(right);
self.len += left.len() as u64;
}
}
/// Returns the number of elements in the `BitSet`.
#[inline]
pub fn len(&self) -> usize {
self.len as usize
}
/// Inserts an element in the `BitSet`
#[inline]
pub fn insert(&mut self, el: u32) {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
self.len += if self.tinysets[higher as usize].insert_mut(lower) {
1
} else {
0
};
}
/// Removes an element from the `BitSet`
#[inline]
pub fn remove(&mut self, el: u32) {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
self.len -= if self.tinysets[higher as usize].remove_mut(lower) {
1
} else {
0
};
}
/// Returns true iff the element is in the `BitSet`.
#[inline]
pub fn contains(&self, el: u32) -> bool {
self.tinyset(el / 64u32).contains(el % 64)
}
/// Returns the first non-empty `TinySet` associated to a bucket lower
/// or greater than bucket.
///
/// Reminder: the tiny set with the bucket `bucket`, represents the
/// elements from `bucket * 64` to `(bucket+1) * 64`.
pub fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
self.tinysets[bucket as usize..]
.iter()
.cloned()
.position(|tinyset| !tinyset.is_empty())
.map(|delta_bucket| bucket + delta_bucket as u32)
}
#[inline]
pub fn max_value(&self) -> u32 {
self.max_value
}
/// Returns the tiny bitset representing the
/// set restricted to the number range from
/// `bucket * 64` to `(bucket + 1) * 64`.
pub fn tinyset(&self, bucket: u32) -> TinySet {
self.tinysets[bucket as usize]
}
}
/// Serialized BitSet.
#[derive(Clone)]
pub struct ReadOnlyBitSet {
data: OwnedBytes,
max_value: u32,
}
pub fn intersect_bitsets(left: &ReadOnlyBitSet, other: &ReadOnlyBitSet) -> ReadOnlyBitSet {
assert_eq!(left.max_value(), other.max_value());
assert_eq!(left.data.len(), other.data.len());
let union_tinyset_it = left
.iter_tinysets()
.zip(other.iter_tinysets())
.map(|(left_tinyset, right_tinyset)| left_tinyset.intersect(right_tinyset));
let mut output_dataset: Vec<u8> = Vec::with_capacity(left.data.len());
for tinyset in union_tinyset_it {
output_dataset.extend_from_slice(&tinyset.into_bytes());
}
ReadOnlyBitSet {
data: OwnedBytes::new(output_dataset),
max_value: left.max_value(),
}
}
impl ReadOnlyBitSet {
pub fn open(data: OwnedBytes) -> Self {
let (max_value_data, data) = data.split(4);
assert_eq!(data.len() % 8, 0);
let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap());
ReadOnlyBitSet { data, max_value }
}
/// Number of elements in the bitset.
#[inline]
pub fn len(&self) -> usize {
self.iter_tinysets()
.map(|tinyset| tinyset.len() as usize)
.sum()
}
/// Iterate the tinyset on the fly from serialized data.
///
#[inline]
fn iter_tinysets(&self) -> impl Iterator<Item = TinySet> + '_ {
self.data.chunks_exact(8).map(move |chunk| {
let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap());
tinyset
})
}
/// Iterate over the positions of the elements.
///
#[inline]
pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
self.iter_tinysets()
.enumerate()
.flat_map(move |(chunk_num, tinyset)| {
let chunk_base_val = chunk_num as u32 * 64;
tinyset
.into_iter()
.map(move |val| val + chunk_base_val)
.take_while(move |doc| *doc < self.max_value)
})
}
/// Returns true iff the element is in the `BitSet`.
#[inline]
pub fn contains(&self, el: u32) -> bool {
let byte_offset = el / 8u32;
let b: u8 = self.data[byte_offset as usize];
let shift = (el % 8) as u8;
b & (1u8 << shift) != 0
}
/// Maximum value the bitset may contain.
/// (Note this is not the maximum value contained in the set.)
///
/// A bitset has an intrinsic capacity.
/// It only stores elements within [0..max_value).
#[inline]
pub fn max_value(&self) -> u32 {
self.max_value
}
/// Number of bytes used in the bitset representation.
pub fn num_bytes(&self) -> usize {
self.data.len()
}
}
impl<'a> From<&'a BitSet> for ReadOnlyBitSet {
fn from(bitset: &'a BitSet) -> ReadOnlyBitSet {
let mut buffer = Vec::with_capacity(bitset.tinysets.len() * 8 + 4);
bitset
.serialize(&mut buffer)
.expect("serializing into a buffer should never fail");
ReadOnlyBitSet::open(OwnedBytes::new(buffer))
}
}
#[cfg(test)]
mod tests {
use super::BitSet;
use super::ReadOnlyBitSet;
use super::TinySet;
use ownedbytes::OwnedBytes;
use rand::distributions::Bernoulli;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use std::collections::HashSet;
#[test]
fn test_read_serialized_bitset_full_multi() {
for i in 0..1000 {
let bitset = BitSet::with_max_value_and_full(i);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len() as usize, i as usize);
}
}
#[test]
fn test_read_serialized_bitset_full_block() {
let bitset = BitSet::with_max_value_and_full(64);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len() as usize, 64 as usize);
}
#[test]
fn test_read_serialized_bitset_full() {
let mut bitset = BitSet::with_max_value_and_full(5);
bitset.remove(3);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 4);
}
#[test]
fn test_bitset_intersect() {
let bitset_serialized = {
let mut bitset = BitSet::with_max_value_and_full(5);
bitset.remove(1);
bitset.remove(3);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
ReadOnlyBitSet::open(OwnedBytes::new(out))
};
let mut bitset = BitSet::with_max_value_and_full(5);
bitset.remove(1);
bitset.intersect_update(&bitset_serialized);
assert!(bitset.contains(0));
assert!(!bitset.contains(1));
assert!(bitset.contains(2));
assert!(!bitset.contains(3));
assert!(bitset.contains(4));
bitset.intersect_update_with_iter(vec![TinySet::singleton(0)].into_iter());
assert!(bitset.contains(0));
assert!(!bitset.contains(1));
assert!(!bitset.contains(2));
assert!(!bitset.contains(3));
assert!(!bitset.contains(4));
assert_eq!(bitset.len(), 1);
bitset.intersect_update_with_iter(vec![TinySet::singleton(1)].into_iter());
assert!(!bitset.contains(0));
assert!(!bitset.contains(1));
assert!(!bitset.contains(2));
assert!(!bitset.contains(3));
assert!(!bitset.contains(4));
assert_eq!(bitset.len(), 0);
}
#[test]
fn test_read_serialized_bitset_empty() {
let mut bitset = BitSet::with_max_value(5);
bitset.insert(3);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 1);
{
let bitset = BitSet::with_max_value(5);
let mut out = vec![];
bitset.serialize(&mut out).unwrap();
let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
assert_eq!(bitset.len(), 0);
}
}
#[test]
fn test_tiny_set_remove() {
{
let mut u = TinySet::empty().insert(63u32).insert(5).remove(63u32);
assert_eq!(u.pop_lowest(), Some(5u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty()
.insert(63u32)
.insert(1)
.insert(5)
.remove(63u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert_eq!(u.pop_lowest(), Some(5u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(1).remove(63u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(1).remove(1u32);
assert!(u.pop_lowest().is_none());
}
}
#[test]
fn test_tiny_set() {
assert!(TinySet::empty().is_empty());
{
let mut u = TinySet::empty().insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
{
let mut u = TinySet::empty().insert(1u32).insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
{
let mut u = TinySet::empty().insert(2u32);
assert_eq!(u.pop_lowest(), Some(2u32));
u.insert_mut(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(63u32);
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(63u32).insert(5);
assert_eq!(u.pop_lowest(), Some(5u32));
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
}
{
let original = TinySet::empty().insert(63u32).insert(5);
let after_serialize_deserialize = TinySet::deserialize(original.into_bytes());
assert_eq!(original, after_serialize_deserialize);
}
}
#[test]
fn test_bitset() {
let test_against_hashset = |els: &[u32], max_value: u32| {
let mut hashset: HashSet<u32> = HashSet::new();
let mut bitset = BitSet::with_max_value(max_value);
for &el in els {
assert!(el < max_value);
hashset.insert(el);
bitset.insert(el);
}
for el in 0..max_value {
assert_eq!(hashset.contains(&el), bitset.contains(el));
}
assert_eq!(bitset.max_value(), max_value);
// test deser
let mut data = vec![];
bitset.serialize(&mut data).unwrap();
let ro_bitset = ReadOnlyBitSet::open(OwnedBytes::new(data));
for el in 0..max_value {
assert_eq!(hashset.contains(&el), ro_bitset.contains(el));
}
assert_eq!(ro_bitset.max_value(), max_value);
assert_eq!(ro_bitset.len(), els.len());
};
test_against_hashset(&[], 0);
test_against_hashset(&[], 1);
test_against_hashset(&[0u32], 1);
test_against_hashset(&[0u32], 100);
test_against_hashset(&[1u32, 2u32], 4);
test_against_hashset(&[99u32], 100);
test_against_hashset(&[63u32], 64);
test_against_hashset(&[62u32, 63u32], 64);
}
#[test]
fn test_bitset_num_buckets() {
use super::num_buckets;
assert_eq!(num_buckets(0u32), 0);
assert_eq!(num_buckets(1u32), 1);
assert_eq!(num_buckets(64u32), 1);
assert_eq!(num_buckets(65u32), 2);
assert_eq!(num_buckets(128u32), 2);
assert_eq!(num_buckets(129u32), 3);
}
#[test]
fn test_tinyset_range() {
assert_eq!(
TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
[0, 1, 2]
);
assert!(TinySet::range_lower(0).is_empty());
assert_eq!(
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
(0u32..63u32).collect::<Vec<_>>()
);
assert_eq!(
TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
[0]
);
assert_eq!(
TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
[0, 1]
);
assert_eq!(
TinySet::range_greater_or_equal(3)
.into_iter()
.collect::<Vec<u32>>(),
(3u32..64u32).collect::<Vec<_>>()
);
}
#[test]
fn test_bitset_len() {
let mut bitset = BitSet::with_max_value(1_000);
assert_eq!(bitset.len(), 0);
bitset.insert(3u32);
assert_eq!(bitset.len(), 1);
bitset.insert(103u32);
assert_eq!(bitset.len(), 2);
bitset.insert(3u32);
assert_eq!(bitset.len(), 2);
bitset.insert(103u32);
assert_eq!(bitset.len(), 2);
bitset.insert(104u32);
assert_eq!(bitset.len(), 3);
bitset.remove(105u32);
assert_eq!(bitset.len(), 3);
bitset.remove(104u32);
assert_eq!(bitset.len(), 2);
bitset.remove(3u32);
assert_eq!(bitset.len(), 1);
bitset.remove(103u32);
assert_eq!(bitset.len(), 0);
}
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
StdRng::from_seed([seed_val; 32])
.sample_iter(&Bernoulli::new(ratio).unwrap())
.take(n as usize)
.enumerate()
.filter_map(|(val, keep)| if keep { Some(val as u32) } else { None })
.collect()
}
pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
sample_with_seed(n, ratio, 4)
}
#[test]
fn test_bitset_clear() {
let mut bitset = BitSet::with_max_value(1_000);
let els = sample(1_000, 0.01f64);
for &el in &els {
bitset.insert(el);
}
assert!(els.iter().all(|el| bitset.contains(*el)));
bitset.clear();
for el in 0u32..1000u32 {
assert!(!bitset.contains(el));
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::BitSet;
use super::TinySet;
use test;
#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
b.iter(|| {
let mut tinyset = TinySet::singleton(test::black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
});
}
#[bench]
fn bench_tinyset_sum(b: &mut test::Bencher) {
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
b.iter(|| {
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
}
#[bench]
fn bench_tinyarr_sum(b: &mut test::Bencher) {
let v = [10u32, 14u32, 21u32];
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
}
#[bench]
fn bench_bitset_initialize(b: &mut test::Bencher) {
b.iter(|| BitSet::with_max_value(1_000_000));
}
}

View File

@@ -1,9 +1,169 @@
#![allow(clippy::len_without_is_empty)]
use std::ops::Deref;
pub use byteorder::LittleEndian as Endianness;
mod bitset;
mod serialize;
mod vint;
mod writer;
pub use bitset::*;
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
/// Has length trait
pub trait HasLen {
/// Return length
fn len(&self) -> usize;
/// Returns true iff empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
}
impl<T: Deref<Target = [u8]>> HasLen for T {
fn len(&self) -> usize {
self.deref().len()
}
}
const HIGHEST_BIT: u64 = 1 << 63;
/// Maps a `i64` to `u64`
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline]
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
/// Maps a `f64` to `u64`
///
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
///
/// This is more suited than simply casting (`val as u64`)
/// which would truncate the result
///
/// # Reference
///
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
/// explains the mapping in a clear manner.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline]
pub fn f64_to_u64(val: f64) -> u64 {
let bits = val.to_bits();
if val.is_sign_positive() {
bits ^ HIGHEST_BIT
} else {
!bits
}
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_f64(val: u64) -> f64 {
f64::from_bits(if val & HIGHEST_BIT != 0 {
val ^ HIGHEST_BIT
} else {
!val
})
}
#[cfg(test)]
pub mod test {
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use super::{BinarySerializable, FixedSize};
use proptest::prelude::*;
use std::f64;
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
}
fn test_f64_converter_helper(val: f64) {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
let left_u64 = f64_to_u64(left);
let right_u64 = f64_to_u64(right);
assert_eq!(left_u64 < right_u64, left < right);
}
}
#[test]
fn test_i64_converter() {
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
test_i64_converter_helper(0i64);
test_i64_converter_helper(i64::min_value());
test_i64_converter_helper(i64::max_value());
for i in -1000i64..1000i64 {
test_i64_converter_helper(i);
}
}
#[test]
fn test_f64_converter() {
test_f64_converter_helper(f64::INFINITY);
test_f64_converter_helper(f64::NEG_INFINITY);
test_f64_converter_helper(0.0);
test_f64_converter_helper(-0.0);
test_f64_converter_helper(1.0);
test_f64_converter_helper(-1.0);
}
#[test]
fn test_f64_order() {
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
.contains(&f64_to_u64(f64::NAN))); //nan is not a number
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa
assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
}
}

View File

@@ -7,6 +7,7 @@
- [Segments](./basis.md)
- [Defining your schema](./schema.md)
- [Facetting](./facetting.md)
- [Index Sorting](./index_sorting.md)
- [Innerworkings](./innerworkings.md)
- [Inverted index](./inverted_index.md)
- [Best practise](./inverted_index.md)

doc/src/index_sorting.md Normal file
View File

@@ -0,0 +1,61 @@
- [Index Sorting](#index-sorting)
+ [Why Sorting](#why-sorting)
* [Compression](#compression)
* [Top-N Optimization](#top-n-optimization)
* [Pruning](#pruning)
* [Other](#other)
+ [Usage](#usage)
# Index Sorting
Tantivy allows you to sort the index according to a property.
## Why Sorting
Presorting an index has several advantages:
###### Compression
When data is sorted, it is easier to compress. E.g. the number sequence [5, 2, 3, 1, 4] would be sorted to [1, 2, 3, 4, 5].
With delta encoding, the unsorted list becomes [5, -3, 1, -2, 3], while the sorted list becomes [1, 1, 1, 1, 1].
The compression ratio mainly improves for the fast field of the sorted property; everything else is likely unaffected.
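A tiny illustration of that delta-encoding effect (a sketch, not part of the tantivy codebase):

```rust
fn delta_encode(values: &[i64]) -> Vec<i64> {
    let mut prev = 0i64;
    values
        .iter()
        .map(|&val| {
            let delta = val - prev;
            prev = val;
            delta
        })
        .collect()
}

// delta_encode(&[5, 2, 3, 1, 4]) == [5, -3, 1, -2, 3]  (unsorted: large, mixed-sign deltas)
// delta_encode(&[1, 2, 3, 4, 5]) == [1, 1, 1, 1, 1]    (sorted: small, uniform deltas)
```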
###### Top-N Optimization
When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
E.g. if the data is sorted by timestamp and we want the top-n newest docs containing a term, we can simply leverage the order of the docids.
Note: Tantivy 0.16 does not do this optimization yet.
###### Pruning
Let's say we want all documents that match the filter `>= 2010-08-11`. When the data is sorted, we could do a lookup in the fast field to find the docid range and use it as the filter.
Note: Tantivy 0.16 does not do this optimization yet.
###### Other?
In principle there are many algorithms possible that exploit the monotonically increasing nature. (aggregations maybe?)
## Usage
Index sorting can be configured by setting [`sort_by_field`](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to an `IndexBuilder`. As of tantivy 0.16, only fast fields are allowed to be used as the sort field.
```
let settings = IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
};
let mut index_builder = Index::builder().schema(schema);
index_builder = index_builder.settings(settings);
let index = index_builder.create_in_ram().unwrap();
```
## Implementation details
Sorting an index is applied in the serialization step. In general there are two serialization steps: [Finishing a single segment](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).
In both cases we generate a docid mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fastfields, posting list, normfield, facets).
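A rough sketch of what such a docid mapping can look like when sorting by a single fast field value per document (illustrative only; the real logic lives in the files linked above and also handles descending order):

```rust
/// Given one sort-key value per old docid, return `new_to_old`, where
/// `new_to_old[new_docid] == old_docid` and new docids follow ascending key order.
fn compute_doc_id_mapping(sort_values: &[u64]) -> Vec<u32> {
    let mut new_to_old: Vec<u32> = (0..sort_values.len() as u32).collect();
    new_to_old.sort_by_key(|&old_doc| sort_values[old_doc as usize]);
    new_to_old
}
```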

View File

@@ -96,7 +96,7 @@ fn main() -> tantivy::Result<()> {
);
// ... and add it to the `IndexWriter`.
index_writer.add_document(old_man_doc);
index_writer.add_document(old_man_doc)?;
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
@@ -110,7 +110,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
))?;
// Multivalued field just need to be repeated.
index_writer.add_document(doc!(
@@ -120,7 +120,7 @@ fn main() -> tantivy::Result<()> {
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));
))?;
// This is an example, so we will only index 3 documents
// here. You can check out tantivy's tutorial to index

View File

@@ -86,12 +86,10 @@ impl Collector for StatsCollector {
fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
let mut stats = Stats::default();
for segment_stats_opt in segment_stats {
if let Some(segment_stats) = segment_stats_opt {
stats.count += segment_stats.count;
stats.sum += segment_stats.sum;
stats.squared_sum += segment_stats.squared_sum;
}
for segment_stats in segment_stats.into_iter().flatten() {
stats.count += segment_stats.count;
stats.sum += segment_stats.sum;
stats.squared_sum += segment_stats.squared_sum;
}
Ok(stats.non_zero_count())
}
@@ -147,23 +145,23 @@ fn main() -> tantivy::Result<()> {
product_description => "While it is ok for short distance travel, this broom \
was designed quiditch. It will up your game.",
price => 30_200u64
));
))?;
index_writer.add_document(doc!(
product_name => "Turbulobroom",
product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
You'll enjoy its sharp turns, and rapid acceleration",
price => 29_240u64
));
))?;
index_writer.add_document(doc!(
product_name => "Broomio",
product_description => "Great value for the price. This broom is a market favorite",
price => 21_240u64
));
))?;
index_writer.add_document(doc!(
product_name => "Whack a Mole",
product_description => "Prime quality bat.",
price => 5_200u64
));
))?;
index_writer.commit()?;
let reader = index.reader()?;

View File

@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
))?;
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
@@ -79,14 +79,14 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the
debris of the winters flooding; and sycamores with mottled, white, recumbent
limbs and branches that arch over the pool"#
));
))?;
index_writer.add_document(doc!(
title => "Frankenstein",
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
enterprise which you have regarded with such evil forebodings. I arrived here
yesterday, and my first task is to assure my dear sister of my welfare and
increasing confidence in the success of my undertaking."#
));
))?;
index_writer.commit()?;
let reader = index.reader()?;

View File

@@ -76,15 +76,15 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!(
isbn => "978-0099908401",
title => "The old Man and the see"
));
))?;
index_writer.add_document(doc!(
isbn => "978-0140177398",
title => "Of Mice and Men",
));
))?;
index_writer.add_document(doc!(
title => "Frankentein", //< Oops there is a typo here.
isbn => "978-9176370711",
));
))?;
index_writer.commit()?;
let reader = index.reader()?;
@@ -122,7 +122,7 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!(
title => "Frankenstein",
isbn => "978-9176370711",
));
))?;
// You are guaranteed that your clients will only observe your index in
// the state it was in after a commit.

View File

@@ -35,35 +35,35 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!(
name => "Cat",
classification => Facet::from("/Felidae/Felinae/Felis")
));
))?;
index_writer.add_document(doc!(
name => "Canada lynx",
classification => Facet::from("/Felidae/Felinae/Lynx")
));
))?;
index_writer.add_document(doc!(
name => "Cheetah",
classification => Facet::from("/Felidae/Felinae/Acinonyx")
));
))?;
index_writer.add_document(doc!(
name => "Tiger",
classification => Facet::from("/Felidae/Pantherinae/Panthera")
));
))?;
index_writer.add_document(doc!(
name => "Lion",
classification => Facet::from("/Felidae/Pantherinae/Panthera")
));
))?;
index_writer.add_document(doc!(
name => "Jaguar",
classification => Facet::from("/Felidae/Pantherinae/Panthera")
));
))?;
index_writer.add_document(doc!(
name => "Sunda clouded leopard",
classification => Facet::from("/Felidae/Pantherinae/Neofelis")
));
))?;
index_writer.add_document(doc!(
name => "Fossa",
classification => Facet::from("/Eupleridae/Cryptoprocta")
));
))?;
index_writer.commit()?;
let reader = index.reader()?;

View File

@@ -20,14 +20,14 @@ fn main() -> tantivy::Result<()> {
title => "Fried egg",
ingredient => Facet::from("/ingredient/egg"),
ingredient => Facet::from("/ingredient/oil"),
));
))?;
index_writer.add_document(doc!(
title => "Scrambled egg",
ingredient => Facet::from("/ingredient/egg"),
ingredient => Facet::from("/ingredient/butter"),
ingredient => Facet::from("/ingredient/milk"),
ingredient => Facet::from("/ingredient/salt"),
));
))?;
index_writer.add_document(doc!(
title => "Egg rolls",
ingredient => Facet::from("/ingredient/egg"),
@@ -36,7 +36,7 @@ fn main() -> tantivy::Result<()> {
ingredient => Facet::from("/ingredient/oil"),
ingredient => Facet::from("/ingredient/tortilla-wrap"),
ingredient => Facet::from("/ingredient/mushroom"),
));
))?;
index_writer.commit()?;
let reader = index.reader()?;

View File

@@ -7,7 +7,7 @@ use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, Result};
fn run() -> Result<()> {
fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
let mut schema_builder = Schema::builder();
@@ -19,7 +19,7 @@ fn run() -> Result<()> {
{
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
for year in 1950u64..2019u64 {
index_writer.add_document(doc!(year_field => year));
index_writer.add_document(doc!(year_field => year))?;
}
index_writer.commit()?;
// The index will be a range of years
@@ -33,7 +33,3 @@ fn run() -> Result<()> {
assert_eq!(num_60s_books, 10);
Ok(())
}
fn main() {
run().unwrap()
}

View File

@@ -25,9 +25,9 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"));
index_writer.add_document(doc!(title => "Of Mice and Men"));
index_writer.add_document(doc!(title => "The modern Promotheus"));
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
index_writer.add_document(doc!(title => "The modern Promotheus"))?;
index_writer.commit()?;
let reader = index.reader()?;

View File

@@ -29,7 +29,7 @@ use std::sync::{Arc, RwLock};
use std::thread;
use std::time::Duration;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter, Opstamp};
use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError};
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -59,10 +59,11 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
))?;
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(20));
}
Result::<(), TantivyError>::Ok(())
});
// # Second indexing thread.
@@ -78,11 +79,12 @@ fn main() -> tantivy::Result<()> {
index_writer_rlock.add_document(doc!(
title => "Manufacturing consent",
body => "Some great book description..."
))
))?
};
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(10));
}
Result::<(), TantivyError>::Ok(())
});
// # In the main thread, we commit 10 times, once every 500ms.
@@ -90,7 +92,7 @@ fn main() -> tantivy::Result<()> {
let opstamp: Opstamp = {
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
let mut index_writer_wlock = index_writer.write().unwrap();
index_writer_wlock.commit().unwrap()
index_writer_wlock.commit()?
};
println!("committed with opstamp {}", opstamp);
thread::sleep(Duration::from_millis(500));

View File

@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
let old_man_doc = doc!(title => title_tok, body => body_tok);
// ... now let's just add it to the IndexWriter
index_writer.add_document(old_man_doc);
index_writer.add_document(old_man_doc)?;
// Pretokenized text can also be fed as JSON
let short_man_json = r#"{
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc);
index_writer.add_document(short_man_doc)?;
// Let's commit changes
index_writer.commit()?;
@@ -106,9 +106,7 @@ fn main() -> tantivy::Result<()> {
IndexRecordOption::Basic,
);
let (top_docs, count) = searcher
.search(&query, &(TopDocs::with_limit(2), Count))
.unwrap();
let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
assert_eq!(count, 2);
@@ -129,9 +127,7 @@ fn main() -> tantivy::Result<()> {
IndexRecordOption::Basic,
);
let (_top_docs, count) = searcher
.search(&query, &(TopDocs::with_limit(2), Count))
.unwrap();
let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
assert_eq!(count, 0);

View File

@@ -40,7 +40,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
))?;
// ...
index_writer.commit()?;

View File

@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
))?;
index_writer.add_document(doc!(
title => "Of Mice and Men",
@@ -80,7 +80,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
))?;
index_writer.add_document(doc!(
title => "Frankenstein",
@@ -88,7 +88,7 @@ fn main() -> tantivy::Result<()> {
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));
))?;
index_writer.commit()?;

View File

@@ -9,8 +9,8 @@ description = "Fast field codecs used by tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
common = { path = "../common/" }
tantivy-bitpacker = { path = "../bitpacker/" }
common = { version = "0.1", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
prettytable-rs = {version="0.8.0", optional= true}
rand = {version="0.8.3", optional= true}

View File

@@ -118,7 +118,7 @@ mod tests {
);
}
}
let actual_compression = data.len() as f32 / out.len() as f32;
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
(estimation, actual_compression)
}
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {

View File

@@ -239,11 +239,21 @@ mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) {
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<
LinearInterpolFastFieldSerializer,
LinearInterpolFastFieldReader,
>(data, name);
>(data, name)
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
assert!(actual_compression < 0.01);
assert!(estimate < 0.01);
}
#[test]

View File

@@ -57,7 +57,7 @@ struct Function {
impl Function {
fn calc_slope(&mut self) {
let num_vals = self.end_pos - self.start_pos;
get_slope(self.value_start_pos, self.value_end_pos, num_vals);
self.slope = get_slope(self.value_start_pos, self.value_end_pos, num_vals);
}
// split the interpolation into two function, change self and return the second split
fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function {
@@ -378,11 +378,22 @@ mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) {
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<
MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldReader,
>(data, name);
>(data, name)
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
assert!(actual_compression > 0.01);
}
#[test]
@@ -414,9 +425,11 @@ mod tests {
fn rand() {
for _ in 0..10 {
let mut data = (5_000..20_000)
.map(|_| rand::random::<u64>() as u64)
.map(|_| rand::random::<u32>() as u64)
.collect::<Vec<_>>();
create_and_validate(&data, "random");
let (estimate, actual_compression) = create_and_validate(&data, "random");
dbg!(estimate);
dbg!(actual_compression);
data.reverse();
create_and_validate(&data, "random");

View File

@@ -1,9 +1,10 @@
[package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.1.0"
version = "0.2.0"
edition = "2018"
description = "Expose data as static slice"
license = "MIT"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

View File

@@ -76,6 +76,19 @@ impl OwnedBytes {
(left, right)
}
/// Splits the right part of the `OwnedBytes` at the given offset.
///
/// `self` is truncated to `split_len`, left with the remaining bytes.
pub fn split_off(&mut self, split_len: usize) -> OwnedBytes {
let right_box_stable_deref = self.box_stable_deref.clone();
let right_piece = OwnedBytes {
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
self.data = &self.data[..split_len];
right_piece
}
/// Returns true iff this `OwnedBytes` is empty.
#[inline]
pub fn is_empty(&self) -> bool {
@@ -124,6 +137,35 @@ impl fmt::Debug for OwnedBytes {
}
}
impl PartialEq for OwnedBytes {
fn eq(&self, other: &OwnedBytes) -> bool {
self.as_slice() == other.as_slice()
}
}
impl Eq for OwnedBytes {}
impl PartialEq<[u8]> for OwnedBytes {
fn eq(&self, other: &[u8]) -> bool {
self.as_slice() == other
}
}
impl PartialEq<str> for OwnedBytes {
fn eq(&self, other: &str) -> bool {
self.as_slice() == other.as_bytes()
}
}
impl<'a, T: ?Sized> PartialEq<&'a T> for OwnedBytes
where
OwnedBytes: PartialEq<T>,
{
fn eq(&self, other: &&'a T) -> bool {
*self == **other
}
}
impl Deref for OwnedBytes {
type Target = [u8];
@@ -287,4 +329,14 @@ mod tests {
assert_eq!(right.as_slice(), b"");
}
}
#[test]
fn test_split_off() {
let mut data = OwnedBytes::new(b"abcdef".as_ref());
assert_eq!(data, "abcdef");
assert_eq!(data.split_off(2), "cdef");
assert_eq!(data, "ab");
assert_eq!(data.split_off(1), "b");
assert_eq!(data, "a");
}
}

View File

@@ -5,9 +5,9 @@ authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Search engine library"""
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/tantivy-search/tantivy"
documentation = "https://quickwit-inc.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/quickwit-inc/tantivy"
repository = "https://github.com/quickwit-inc/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2018"

View File

@@ -20,10 +20,10 @@ use crate::SegmentReader;
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer(3_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap();
/// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap();
/// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap();
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl")).unwrap();
/// assert!(index_writer.commit().is_ok());
///
/// let reader = index.reader().unwrap();

View File

@@ -103,23 +103,23 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// title => "The Name of the Wind",
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/fiction/fantasy")
/// ));
/// ))?;
/// index_writer.add_document(doc!(
/// title => "Dune",
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/fiction/sci-fi")
/// ));
/// ))?;
/// index_writer.add_document(doc!(
/// title => "La Vénus d'Ille",
/// facet => Facet::from("/lang/fr"),
/// facet => Facet::from("/category/fiction/fantasy"),
/// facet => Facet::from("/category/fiction/horror")
/// ));
/// ))?;
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/biography")
/// ));
/// ))?;
/// index_writer.commit()?;
/// }
/// let reader = index.reader()?;
@@ -470,13 +470,13 @@ mod tests {
use std::iter;
#[test]
fn test_facet_collector_drilldown() {
fn test_facet_collector_drilldown() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -491,14 +491,14 @@ mod tests {
for i in 0..num_facets * 10 {
let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());
index_writer.add_document(doc);
index_writer.add_document(doc)?;
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top1"));
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
let counts = searcher.search(&AllQuery, &facet_collector)?;
{
let facets: Vec<(String, u64)> = counts
@@ -518,6 +518,7 @@ mod tests {
.collect::<Vec<_>>()
);
}
Ok(())
}
#[test]
@@ -530,27 +531,28 @@ mod tests {
}
#[test]
fn test_doc_unsorted_multifacet() {
fn test_doc_unsorted_multifacet() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/subjects");
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
let counts = searcher.search(&AllQuery, &facet_collector)?;
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
assert_eq!(facets[0].1, 1);
Ok(())
}
#[test]
@@ -562,16 +564,16 @@ mod tests {
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/A").unwrap(),
));
))?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/B").unwrap(),
));
))?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/C/A").unwrap(),
));
))?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/D/C/A").unwrap(),
));
))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -637,7 +639,7 @@ mod tests {
let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
@@ -677,7 +679,7 @@ mod tests {
let mut index_writer = index.writer_for_tests()?;
for doc in docs {
index_writer.add_document(doc);
index_writer.add_document(doc)?;
}
index_writer.commit()?;

View File

@@ -25,34 +25,37 @@ use crate::{Score, SegmentReader, TantivyError};
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64));
/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64));
/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64));
/// assert!(index_writer.commit().is_ok());
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64))?;
/// index_writer.commit()?;
///
/// let reader = index.reader().unwrap();
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap();
/// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
///
/// assert_eq!(filtered_top_docs.len(), 0);
/// # Ok(())
/// # }
/// ```
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
where

View File

@@ -226,10 +226,10 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64));
writer.add_document(doc!(val_field=>-30i64));
writer.add_document(doc!(val_field=>-12i64));
writer.add_document(doc!(val_field=>-10i64));
writer.add_document(doc!(val_field=>12i64))?;
writer.add_document(doc!(val_field=>-30i64))?;
writer.add_document(doc!(val_field=>-12i64))?;
writer.add_document(doc!(val_field=>-10i64))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -247,13 +247,13 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64));
writer.add_document(doc!(val_field=>12i64))?;
writer.commit()?;
writer.add_document(doc!(val_field=>-30i64));
writer.add_document(doc!(val_field=>-30i64))?;
writer.commit()?;
writer.add_document(doc!(val_field=>-12i64));
writer.add_document(doc!(val_field=>-12i64))?;
writer.commit()?;
writer.add_document(doc!(val_field=>-10i64));
writer.add_document(doc!(val_field=>-10i64))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -271,9 +271,9 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)));
writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)));
writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)));
writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)))?;
writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)))?;
writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();

View File

@@ -48,10 +48,10 @@ use tantivy::collector::{Count, TopDocs};
# let mut index_writer = index.writer(3_000_000)?;
# index_writer.add_document(doc!(
# title => "The Name of the Wind",
# ));
# ))?;
# index_writer.add_document(doc!(
# title => "The Diary of Muadib",
# ));
# ))?;
# index_writer.commit()?;
# let reader = index.reader()?;
# let searcher = reader.searcher();
@@ -178,9 +178,9 @@ pub trait Collector: Sync + Send {
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let mut segment_collector = self.for_segment(segment_ord as u32, reader)?;
if let Some(delete_bitset) = reader.delete_bitset() {
if let Some(alive_bitset) = reader.alive_bitset() {
weight.for_each(reader, &mut |doc, score| {
if delete_bitset.is_alive(doc) {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
})?;

View File

@@ -112,19 +112,19 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
/// index_writer.commit()?;
///
/// let mut index_writer = index.writer(3_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// assert!(index_writer.commit().is_ok());
///
/// let reader = index.reader().unwrap();
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let mut collectors = MultiCollector::new();
@@ -139,6 +139,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
///
/// assert_eq!(count, 2);
/// assert_eq!(top_docs.len(), 2);
/// # Ok(())
/// # }
/// ```
#[allow(clippy::type_complexity)]
#[derive(Default)]
@@ -252,24 +254,24 @@ mod tests {
use crate::Term;
#[test]
fn test_multi_collector() {
fn test_multi_collector() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text=>"abc"));
index_writer.add_document(doc!(text=>"abc abc abc"));
index_writer.add_document(doc!(text=>"abc abc"));
index_writer.commit().unwrap();
index_writer.add_document(doc!(text=>""));
index_writer.add_document(doc!(text=>"abc abc abc abc"));
index_writer.add_document(doc!(text=>"abc"));
index_writer.commit().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text=>"abc"))?;
index_writer.add_document(doc!(text=>"abc abc abc"))?;
index_writer.add_document(doc!(text=>"abc abc"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text=>""))?;
index_writer.add_document(doc!(text=>"abc abc abc abc"))?;
index_writer.add_document(doc!(text=>"abc"))?;
index_writer.commit()?;
}
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic);
@@ -280,5 +282,6 @@ mod tests {
assert_eq!(count_handler.extract(&mut multifruits), 5);
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
Ok(())
}
}

View File

@@ -25,7 +25,7 @@ pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
};
#[test]
pub fn test_filter_collector() {
pub fn test_filter_collector() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let price = schema_builder.add_u64_field("price", FAST);
@@ -33,25 +33,25 @@ pub fn test_filter_collector() {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()));
assert!(index_writer.commit().is_ok());
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()))?;
index_writer.commit()?;
let reader = index.reader().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title]);
let query = query_parser.parse_query("diary").unwrap();
let query = query_parser.parse_query("diary")?;
let filter_some_collector = FilterCollector::new(
price,
&|value: u64| value > 20_120u64,
TopDocs::with_limit(2),
);
let top_docs = searcher.search(&query, &filter_some_collector).unwrap();
let top_docs = searcher.search(&query, &filter_some_collector)?;
assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
@@ -67,9 +67,10 @@ pub fn test_filter_collector() {
}
let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
let filtered_date_docs = searcher.search(&query, &filter_dates_collector).unwrap();
let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;
assert_eq!(filtered_date_docs.len(), 2);
Ok(())
}
/// Stores all of the doc ids.
@@ -274,8 +275,8 @@ fn make_test_searcher() -> crate::Result<crate::LeasedItem<Searcher>> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::default());
index_writer.add_document(Document::default());
index_writer.add_document(Document::default())?;
index_writer.add_document(Document::default())?;
index_writer.commit()?;
Ok(index.reader()?.searcher())
}

View File

@@ -70,9 +70,7 @@ where
/// # Panics
/// The method panics if limit is 0
pub fn with_limit(limit: usize) -> TopCollector<T> {
if limit < 1 {
panic!("Limit must be strictly greater than 0.");
}
assert!(limit >= 1, "Limit must be strictly greater than 0.");
Self {
limit,
offset: 0,

View File

@@ -94,27 +94,30 @@ where
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// assert!(index_writer.commit().is_ok());
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
/// index_writer.commit()?;
///
/// let reader = index.reader().unwrap();
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
/// let query = query_parser.parse_query("diary")?;
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
///
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
/// # Ok(())
/// # }
/// ```
pub struct TopDocs(TopCollector<Score>);
@@ -180,29 +183,32 @@ impl TopDocs {
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"));
/// assert!(index_writer.commit().is_ok());
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"))?;
/// index_writer.commit()?;
///
/// let reader = index.reader().unwrap();
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
/// let query = query_parser.parse_query("diary")?;
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1))?;
///
/// assert_eq!(top_docs.len(), 2);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 4));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
/// Ok(())
/// # }
/// ```
pub fn and_offset(self, offset: usize) -> TopDocs {
TopDocs(self.0.and_offset(offset))
@@ -234,11 +240,11 @@ impl TopDocs {
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
/// # assert!(index_writer.commit().is_ok());
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64))?;
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64))?;
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64))?;
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64))?;
/// # index_writer.commit()?;
/// # let reader = index.reader()?;
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
@@ -316,9 +322,9 @@ impl TopDocs {
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64));
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64));
/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64));
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64))?;
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64))?;
/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64))?;
/// # assert!(index_writer.commit().is_ok());
/// # let reader = index.reader()?;
/// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?;
@@ -417,9 +423,9 @@ impl TopDocs {
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// let product_name = index.schema().get_field("product_name").unwrap();
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64));
/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64));
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64))?;
/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64))?;
/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64))?;
/// index_writer.commit()?;
/// Ok(index)
/// }
@@ -527,9 +533,9 @@ impl TopDocs {
/// #
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// let boosted: Field = index.schema().get_field("boosted").unwrap();
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64));
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64));
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64))?;
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64))?;
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64))?;
/// # index_writer.commit()?;
/// // ...
/// # let user_query = "diary";
@@ -629,10 +635,10 @@ impl Collector for TopDocs {
let heap_len = self.0.limit + self.0.offset;
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);
if let Some(delete_bitset) = reader.delete_bitset() {
if let Some(alive_bitset) = reader.alive_bitset() {
let mut threshold = Score::MIN;
weight.for_each_pruning(threshold, reader, &mut |doc, score| {
if delete_bitset.is_deleted(doc) {
if alive_bitset.is_deleted(doc) {
return threshold;
}
let heap_item = ComparableDoc {
@@ -713,20 +719,18 @@ mod tests {
use crate::Score;
use crate::{DocAddress, DocId, SegmentReader};
fn make_index() -> Index {
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
index_writer.add_document(doc!(text_field=>"I like Droopy"));
assert!(index_writer.commit().is_ok());
}
index
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."))?;
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"))?;
index_writer.add_document(doc!(text_field=>"I like Droopy"))?;
index_writer.commit()?;
Ok(index)
}
fn assert_results_equals(results: &[(Score, DocAddress)], expected: &[(Score, DocAddress)]) {
@@ -737,17 +741,15 @@ mod tests {
}
#[test]
fn test_top_collector_not_at_capacity_without_offset() {
let index = make_index();
fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> {
let index = make_index()?;
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let text_query = query_parser.parse_query("droopy tax")?;
let score_docs: Vec<(Score, DocAddress)> = index
.reader()
.unwrap()
.reader()?
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
.search(&text_query, &TopDocs::with_limit(4))?;
assert_results_equals(
&score_docs,
&[
@@ -756,11 +758,12 @@ mod tests {
(0.48527452, DocAddress::new(0, 0)),
],
);
Ok(())
}
#[test]
fn test_top_collector_not_at_capacity_with_offset() {
let index = make_index();
let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -775,7 +778,7 @@ mod tests {
#[test]
fn test_top_collector_at_capacity() {
let index = make_index();
let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -796,7 +799,7 @@ mod tests {
#[test]
fn test_top_collector_at_capacity_with_offset() {
let index = make_index();
let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -817,7 +820,7 @@ mod tests {
#[test]
fn test_top_collector_stable_sorting() {
let index = make_index();
let index = make_index().unwrap();
// using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher();
@@ -848,29 +851,35 @@ mod tests {
const SIZE: &str = "size";
#[test]
fn test_top_field_collector_not_at_capacity() {
fn test_top_field_collector_not_at_capacity() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field(TITLE, TEXT);
let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build();
let (index, query) = index("beer", title, schema, |index_writer| {
index_writer.add_document(doc!(
title => "bottle of beer",
size => 12u64,
));
index_writer.add_document(doc!(
title => "growler of beer",
size => 64u64,
));
index_writer.add_document(doc!(
title => "pint of beer",
size => 16u64,
));
index_writer
.add_document(doc!(
title => "bottle of beer",
size => 12u64,
))
.unwrap();
index_writer
.add_document(doc!(
title => "growler of beer",
size => 64u64,
))
.unwrap();
index_writer
.add_document(doc!(
title => "pint of beer",
size => 16u64,
))
.unwrap();
});
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
assert_eq!(
&top_docs[..],
&[
@@ -879,6 +888,7 @@ mod tests {
(12, DocAddress::new(0, 0))
]
);
Ok(())
}
#[test]
@@ -894,12 +904,12 @@ mod tests {
index_writer.add_document(doc!(
name => "Paul Robeson",
birthday => pr_birthday
));
))?;
let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?;
index_writer.add_document(doc!(
name => "Minnie Riperton",
birthday => mr_birthday
));
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
@@ -926,11 +936,11 @@ mod tests {
index_writer.add_document(doc!(
city => "georgetown",
altitude => -1i64,
));
))?;
index_writer.add_document(doc!(
city => "tokyo",
altitude => 40i64,
));
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
@@ -956,11 +966,11 @@ mod tests {
index_writer.add_document(doc!(
city => "georgetown",
altitude => -1.0f64,
));
))?;
index_writer.add_document(doc!(
city => "tokyo",
altitude => 40f64,
));
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
@@ -983,10 +993,12 @@ mod tests {
let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build();
let (index, _) = index("beer", title, schema, |index_writer| {
index_writer.add_document(doc!(
title => "bottle of beer",
size => 12u64,
));
index_writer
.add_document(doc!(
title => "bottle of beer",
size => 12u64,
))
.unwrap();
});
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
@@ -1003,7 +1015,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(size=>1u64));
index_writer.add_document(doc!(size=>1u64))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
@@ -1020,7 +1032,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(size=>1u64));
index_writer.add_document(doc!(size=>1u64))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
@@ -1033,30 +1045,26 @@ mod tests {
}
#[test]
fn test_tweak_score_top_collector_with_offset() {
let index = make_index();
fn test_tweak_score_top_collector_with_offset() -> crate::Result<()> {
let index = make_index()?;
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let text_query = query_parser.parse_query("droopy tax")?;
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
);
let score_docs: Vec<(u32, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &collector)
.unwrap();
let score_docs: Vec<(u32, DocAddress)> =
index.reader()?.searcher().search(&text_query, &collector)?;
assert_eq!(
score_docs,
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
);
Ok(())
}
#[test]
fn test_custom_score_top_collector_with_offset() {
let index = make_index();
let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();

View File

@@ -1,396 +0,0 @@
use std::fmt;
use std::u64;
#[derive(Clone, Copy, Eq, PartialEq)]
pub(crate) struct TinySet(u64);
impl fmt::Debug for TinySet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.into_iter().collect::<Vec<u32>>().fmt(f)
}
}
pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator {
type Item = u32;
fn next(&mut self) -> Option<Self::Item> {
self.0.pop_lowest()
}
}
impl IntoIterator for TinySet {
type Item = u32;
type IntoIter = TinySetIterator;
fn into_iter(self) -> Self::IntoIter {
TinySetIterator(self)
}
}
impl TinySet {
/// Returns an empty `TinySet`.
pub fn empty() -> TinySet {
TinySet(0u64)
}
pub fn clear(&mut self) {
self.0 = 0u64;
}
/// Returns the complement of the set in `[0, 64[`.
fn complement(self) -> TinySet {
TinySet(!self.0)
}
/// Returns true iff the `TinySet` contains the element `el`.
pub fn contains(self, el: u32) -> bool {
!self.intersect(TinySet::singleton(el)).is_empty()
}
/// Returns the number of elements in the TinySet.
pub fn len(self) -> u32 {
self.0.count_ones()
}
/// Returns the intersection of `self` and `other`
pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0)
}
/// Creates a new `TinySet` containing only one element
/// within `[0; 64[`
#[inline]
pub fn singleton(el: u32) -> TinySet {
TinySet(1u64 << u64::from(el))
}
/// Insert a new element within [0..64[
#[inline]
pub fn insert(self, el: u32) -> TinySet {
self.union(TinySet::singleton(el))
}
/// Insert a new element within [0..64[
#[inline]
pub fn insert_mut(&mut self, el: u32) -> bool {
let old = *self;
*self = old.insert(el);
old != *self
}
/// Returns the union of two tinysets
#[inline]
pub fn union(self, other: TinySet) -> TinySet {
TinySet(self.0 | other.0)
}
/// Returns true iff the `TinySet` is empty.
#[inline]
pub fn is_empty(self) -> bool {
self.0 == 0u64
}
/// Returns the lowest element in the `TinySet`
/// and removes it.
#[inline]
pub fn pop_lowest(&mut self) -> Option<u32> {
if self.is_empty() {
None
} else {
let lowest = self.0.trailing_zeros() as u32;
self.0 ^= TinySet::singleton(lowest).0;
Some(lowest)
}
}
/// Returns a `TinySet` than contains all values up
/// to limit excluded.
///
/// The limit is assumed to be strictly lower than 64.
pub fn range_lower(upper_bound: u32) -> TinySet {
TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
}
/// Returns a `TinySet` that contains all values greater
/// or equal to the given limit, included. (and up to 63)
///
/// The limit is assumed to be strictly lower than 64.
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
TinySet::range_lower(from_included).complement()
}
}
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
len: usize,
max_value: u32,
}
fn num_buckets(max_val: u32) -> u32 {
(max_val + 63u32) / 64u32
}
impl BitSet {
/// Create a new `BitSet` that may contain elements
/// within `[0, max_val[`.
pub fn with_max_value(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
BitSet {
tinysets: tinybisets,
len: 0,
max_value,
}
}
/// Removes all elements from the `BitSet`.
pub fn clear(&mut self) {
for tinyset in self.tinysets.iter_mut() {
*tinyset = TinySet::empty();
}
}
/// Returns the number of elements in the `BitSet`.
pub fn len(&self) -> usize {
self.len
}
/// Inserts an element in the `BitSet`
pub fn insert(&mut self, el: u32) {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
self.len += if self.tinysets[higher as usize].insert_mut(lower) {
1
} else {
0
};
}
/// Returns true iff the elements is in the `BitSet`.
pub fn contains(&self, el: u32) -> bool {
self.tinyset(el / 64u32).contains(el % 64)
}
/// Returns the first non-empty `TinySet` associated to a bucket lower
/// or greater than bucket.
///
/// Reminder: the tiny set with the bucket `bucket`, represents the
/// elements from `bucket * 64` to `(bucket+1) * 64`.
pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
self.tinysets[bucket as usize..]
.iter()
.cloned()
.position(|tinyset| !tinyset.is_empty())
.map(|delta_bucket| bucket + delta_bucket as u32)
}
pub fn max_value(&self) -> u32 {
self.max_value
}
/// Returns the tiny bitset representing the
/// the set restricted to the number range from
/// `bucket * 64` to `(bucket + 1) * 64`.
pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
self.tinysets[bucket as usize]
}
}
#[cfg(test)]
mod tests {
use super::BitSet;
use super::TinySet;
use crate::docset::{DocSet, TERMINATED};
use crate::query::BitSetDocSet;
use crate::tests;
use crate::tests::generate_nonunique_unsorted;
use std::collections::BTreeSet;
use std::collections::HashSet;
#[test]
fn test_tiny_set() {
assert!(TinySet::empty().is_empty());
{
let mut u = TinySet::empty().insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
{
let mut u = TinySet::empty().insert(1u32).insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
{
let mut u = TinySet::empty().insert(2u32);
assert_eq!(u.pop_lowest(), Some(2u32));
u.insert_mut(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(63u32);
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
}
}
#[test]
fn test_bitset() {
let test_against_hashset = |els: &[u32], max_value: u32| {
let mut hashset: HashSet<u32> = HashSet::new();
let mut bitset = BitSet::with_max_value(max_value);
for &el in els {
assert!(el < max_value);
hashset.insert(el);
bitset.insert(el);
}
for el in 0..max_value {
assert_eq!(hashset.contains(&el), bitset.contains(el));
}
assert_eq!(bitset.max_value(), max_value);
};
test_against_hashset(&[], 0);
test_against_hashset(&[], 1);
test_against_hashset(&[0u32], 1);
test_against_hashset(&[0u32], 100);
test_against_hashset(&[1u32, 2u32], 4);
test_against_hashset(&[99u32], 100);
test_against_hashset(&[63u32], 64);
test_against_hashset(&[62u32, 63u32], 64);
}
#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(100_000, 5_000);
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
let mut bitset = BitSet::with_max_value(100_000);
for el in arr {
btreeset.insert(el);
bitset.insert(el);
}
for i in 0..100_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset);
let mut remaining = true;
for el in btreeset.into_iter() {
assert!(remaining);
assert_eq!(bitset_docset.doc(), el);
remaining = bitset_docset.advance() != TERMINATED;
}
assert!(!remaining);
}
#[test]
fn test_bitset_num_buckets() {
use super::num_buckets;
assert_eq!(num_buckets(0u32), 0);
assert_eq!(num_buckets(1u32), 1);
assert_eq!(num_buckets(64u32), 1);
assert_eq!(num_buckets(65u32), 2);
assert_eq!(num_buckets(128u32), 2);
assert_eq!(num_buckets(129u32), 3);
}
#[test]
fn test_tinyset_range() {
assert_eq!(
TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
[0, 1, 2]
);
assert!(TinySet::range_lower(0).is_empty());
assert_eq!(
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
(0u32..63u32).collect::<Vec<_>>()
);
assert_eq!(
TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
[0]
);
assert_eq!(
TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
[0, 1]
);
assert_eq!(
TinySet::range_greater_or_equal(3)
.into_iter()
.collect::<Vec<u32>>(),
(3u32..64u32).collect::<Vec<_>>()
);
}
#[test]
fn test_bitset_len() {
let mut bitset = BitSet::with_max_value(1_000);
assert_eq!(bitset.len(), 0);
bitset.insert(3u32);
assert_eq!(bitset.len(), 1);
bitset.insert(103u32);
assert_eq!(bitset.len(), 2);
bitset.insert(3u32);
assert_eq!(bitset.len(), 2);
bitset.insert(103u32);
assert_eq!(bitset.len(), 2);
bitset.insert(104u32);
assert_eq!(bitset.len(), 3);
}
#[test]
fn test_bitset_clear() {
let mut bitset = BitSet::with_max_value(1_000);
let els = tests::sample(1_000, 0.01f64);
for &el in &els {
bitset.insert(el);
}
assert!(els.iter().all(|el| bitset.contains(*el)));
bitset.clear();
for el in 0u32..1000u32 {
assert!(!bitset.contains(el));
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::BitSet;
use super::TinySet;
use test;
#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
b.iter(|| {
let mut tinyset = TinySet::singleton(test::black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
});
}
#[bench]
fn bench_tinyset_sum(b: &mut test::Bencher) {
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
b.iter(|| {
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
}
#[bench]
fn bench_tinyarr_sum(b: &mut test::Bencher) {
let v = [10u32, 14u32, 21u32];
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
}
#[bench]
fn bench_bitset_initialize(b: &mut test::Bencher) {
b.iter(|| BitSet::with_max_value(1_000_000));
}
}

View File

@@ -1,203 +0,0 @@
mod bitset;
mod composite_file;
pub use self::bitset::BitSet;
pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use byteorder::LittleEndian as Endianness;
pub use common::CountingWriter;
pub use common::{
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
};
pub use common::{BinarySerializable, DeserializeFrom, FixedSize};
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than `2^31 - 1` documents.
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
/// Has length trait
pub trait HasLen {
/// Return length
fn len(&self) -> usize;
/// Returns true iff empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
}
const HIGHEST_BIT: u64 = 1 << 63;
/// Maps a `i64` to `u64`
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline]
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
/// Maps a `f64` to `u64`
///
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
///
/// This is more suited than simply casting (`val as u64`)
/// which would truncate the result
///
/// # Reference
///
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
/// explains the mapping in a clear manner.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline]
pub fn f64_to_u64(val: f64) -> u64 {
let bits = val.to_bits();
if val.is_sign_positive() {
bits ^ HIGHEST_BIT
} else {
!bits
}
}
/// Reverse the mapping given by [`f64_to_u64`](./fn.f64_to_u64.html).
#[inline]
pub fn u64_to_f64(val: u64) -> f64 {
f64::from_bits(if val & HIGHEST_BIT != 0 {
val ^ HIGHEST_BIT
} else {
!val
})
}
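A short illustrative check, not part of the original file: it shows why the two mappings above preserve order. `i64_to_u64` flips the sign bit, and `f64_to_u64` flips the sign bit for positive floats and all bits for negative ones, so comparing the mapped `u64`s agrees with the original ordering.

#[cfg(test)]
mod mapping_order_sketch {
    use super::{f64_to_u64, i64_to_u64};

    #[test]
    fn sketch_order_is_preserved() {
        // -1 < 0 < 1 stays ordered after the i64 -> u64 mapping.
        assert!(i64_to_u64(-1) < i64_to_u64(0));
        assert!(i64_to_u64(0) < i64_to_u64(1));
        // The f64 mapping keeps ordering across the sign boundary too.
        assert!(f64_to_u64(-0.5) < f64_to_u64(0.5));
        assert!(f64_to_u64(0.5) < f64_to_u64(1.5));
    }
}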
#[cfg(test)]
pub(crate) mod test {
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use common::{BinarySerializable, FixedSize};
use proptest::prelude::*;
use std::f64;
use tantivy_bitpacker::compute_num_bits;
pub use tantivy_bitpacker::minmax;
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
}
fn test_f64_converter_helper(val: f64) {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
let left_u64 = f64_to_u64(left);
let right_u64 = f64_to_u64(right);
assert_eq!(left_u64 < right_u64, left < right);
}
}
#[test]
fn test_i64_converter() {
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
test_i64_converter_helper(0i64);
test_i64_converter_helper(i64::min_value());
test_i64_converter_helper(i64::max_value());
for i in -1000i64..1000i64 {
test_i64_converter_helper(i);
}
}
#[test]
fn test_f64_converter() {
test_f64_converter_helper(f64::INFINITY);
test_f64_converter_helper(f64::NEG_INFINITY);
test_f64_converter_helper(0.0);
test_f64_converter_helper(-0.0);
test_f64_converter_helper(1.0);
test_f64_converter_helper(-1.0);
}
#[test]
fn test_f64_order() {
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
.contains(&f64_to_u64(f64::NAN))); //nan is not a number
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa
assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
}
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
assert_eq!(compute_num_bits(0), 0u8);
assert_eq!(compute_num_bits(2), 2u8);
assert_eq!(compute_num_bits(3), 2u8);
assert_eq!(compute_num_bits(4), 3u8);
assert_eq!(compute_num_bits(255), 8u8);
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
#[test]
fn test_max_doc() {
// this is the first time I write a unit test for a constant.
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0);
}
#[test]
fn test_minmax_empty() {
let vals: Vec<u32> = vec![];
assert_eq!(minmax(vals.into_iter()), None);
}
#[test]
fn test_minmax_one() {
assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
}
#[test]
fn test_minmax_two() {
assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
}
}

View File

@@ -120,11 +120,11 @@ impl IndexBuilder {
/// Creates a new index in a given filepath.
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
/// If a previous index was in this directory, it returns an `IndexAlreadyExists` error.
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
if Index::exists(&mmap_directory)? {
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::open(directory_path)?);
if Index::exists(&*mmap_directory)? {
return Err(TantivyError::IndexAlreadyExists);
}
self.create(mmap_directory)
@@ -139,7 +139,7 @@ impl IndexBuilder {
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
#[cfg(feature = "mmap")]
pub fn create_from_tempdir(self) -> crate::Result<Index> {
let mmap_directory = MmapDirectory::create_from_tempdir()?;
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::create_from_tempdir()?);
self.create(mmap_directory)
}
fn get_expect_schema(&self) -> crate::Result<Schema> {
@@ -149,8 +149,9 @@ impl IndexBuilder {
.ok_or(TantivyError::IndexBuilderMissingArgument("schema"))
}
/// Opens or creates a new index in the provided directory
pub fn open_or_create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
if !Index::exists(&dir)? {
pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
let dir = dir.into();
if !Index::exists(&*dir)? {
return self.create(dir);
}
let index = Index::open(dir)?;
@@ -165,7 +166,8 @@ impl IndexBuilder {
/// Creates a new index given an implementation of the trait `Directory`.
///
/// If a directory previously existed, it will be erased.
fn create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
fn create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
let dir = dir.into();
let directory = ManagedDirectory::wrap(dir)?;
save_new_metas(
self.get_expect_schema()?,
@@ -198,7 +200,7 @@ impl Index {
/// Examines the directory to see if it contains an index.
///
/// Effectively, it only checks for the presence of the `meta.json` file.
pub fn exists<Dir: Directory>(dir: &Dir) -> Result<bool, OpenReadError> {
pub fn exists(dir: &dyn Directory) -> Result<bool, OpenReadError> {
dir.exists(&META_FILEPATH)
}
@@ -229,7 +231,8 @@ impl Index {
/// Creates a new index using the `RamDirectory`.
///
/// The index will be allocated in anonymous memory.
/// This should only be used for unit tests.
/// This is useful for indexing a small set of documents,
/// for instance in unit tests or for a temporary in-memory index.
pub fn create_in_ram(schema: Schema) -> Index {
IndexBuilder::new().schema(schema).create_in_ram().unwrap()
}
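A hedged usage sketch, not part of this diff: it illustrates the reworded doc comment above by creating a throwaway in-memory index, adding one document, and committing. The field name `num_likes` is only illustrative, the writer heap size is an arbitrary valid value, and `add_document(...)?` reflects the Result-returning signature used throughout this compare.

use crate::schema::{Schema, INDEXED};
use crate::Index;

fn tiny_in_memory_index() -> crate::Result<Index> {
    let mut schema_builder = Schema::builder();
    let num_likes = schema_builder.add_u64_field("num_likes", INDEXED);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 20_000_000)?;
    writer.add_document(doc!(num_likes => 1u64))?;
    writer.commit()?;
    Ok(index)
}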
@@ -237,7 +240,7 @@ impl Index {
/// Creates a new index in a given filepath.
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
/// If a previous index was in this directory, then it returns an `IndexAlreadyExists` error.
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(
directory_path: P,
@@ -249,7 +252,11 @@ impl Index {
}
/// Opens or creates a new index in the provided directory
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
pub fn open_or_create<T: Into<Box<dyn Directory>>>(
dir: T,
schema: Schema,
) -> crate::Result<Index> {
let dir = dir.into();
IndexBuilder::new().schema(schema).open_or_create(dir)
}
@@ -269,11 +276,12 @@ impl Index {
/// Creates a new index given an implementation of the trait `Directory`.
///
/// If a directory previously existed, it will be erased.
pub fn create<Dir: Directory>(
dir: Dir,
pub fn create<T: Into<Box<dyn Directory>>>(
dir: T,
schema: Schema,
settings: IndexSettings,
) -> crate::Result<Index> {
let dir: Box<dyn Directory> = dir.into();
let mut builder = IndexBuilder::new().schema(schema);
builder = builder.settings(settings);
builder.create(dir)
@@ -364,7 +372,8 @@ impl Index {
}
/// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> crate::Result<Index> {
pub fn open<T: Into<Box<dyn Directory>>>(directory: T) -> crate::Result<Index> {
let directory = directory.into();
let directory = ManagedDirectory::wrap(directory)?;
let inventory = SegmentMetaInventory::default();
let metas = load_metas(&directory, &inventory)?;
@@ -394,9 +403,7 @@ impl Index {
///
/// # Errors
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
///
/// # Panics
/// If the heap size per thread is too small, panics.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`
pub fn writer_with_num_threads(
&self,
num_threads: usize,
@@ -438,14 +445,13 @@ impl Index {
/// Creates a multithreaded writer
///
/// Tantivy will automatically define the number of threads to use, but
/// no more than [`MAX_NUM_THREAD`] threads.
/// no more than 8 threads.
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split
/// between a given number of threads.
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> crate::Result<IndexWriter> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
@@ -523,7 +529,22 @@ impl Index {
/// Returns the set of corrupted files
pub fn validate_checksum(&self) -> crate::Result<HashSet<PathBuf>> {
self.directory.list_damaged().map_err(Into::into)
let managed_files = self.directory.list_managed_files();
let active_segments_files: HashSet<PathBuf> = self
.searchable_segment_metas()?
.iter()
.flat_map(|segment_meta| segment_meta.list_files())
.collect();
let active_existing_files: HashSet<&PathBuf> =
active_segments_files.intersection(&managed_files).collect();
let mut damaged_files = HashSet::new();
for path in active_existing_files {
if !self.directory.validate_checksum(path)? {
damaged_files.insert((*path).clone());
}
}
Ok(damaged_files)
}
}
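A hedged usage sketch, not part of the diff: with the new implementation above, `validate_checksum` only checks managed files that are referenced by a searchable segment and returns the paths whose CRC footer no longer matches their content. The helper name is made up for illustration.

use crate::Index;
use std::collections::HashSet;
use std::path::PathBuf;

fn report_corrupted_files(index: &Index) -> crate::Result<()> {
    let damaged: HashSet<PathBuf> = index.validate_checksum()?;
    if damaged.is_empty() {
        println!("no corrupted files detected");
    } else {
        for path in &damaged {
            eprintln!("corrupted file: {:?}", path);
        }
    }
    Ok(())
}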
@@ -561,15 +582,15 @@ mod tests {
#[test]
fn test_index_exists() {
let directory = RamDirectory::create();
assert!(!Index::exists(&directory).unwrap());
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
assert!(!Index::exists(directory.as_ref()).unwrap());
assert!(Index::create(
directory.clone(),
throw_away_schema(),
IndexSettings::default()
)
.is_ok());
assert!(Index::exists(&directory).unwrap());
assert!(Index::exists(directory.as_ref()).unwrap());
}
#[test]
@@ -582,27 +603,27 @@ mod tests {
#[test]
fn open_or_create_should_open() {
let directory = RamDirectory::create();
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
assert!(Index::create(
directory.clone(),
throw_away_schema(),
IndexSettings::default()
)
.is_ok());
assert!(Index::exists(&directory).unwrap());
assert!(Index::exists(directory.as_ref()).unwrap());
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
}
#[test]
fn create_should_wipeoff_existing() {
let directory = RamDirectory::create();
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
assert!(Index::create(
directory.clone(),
throw_away_schema(),
IndexSettings::default()
)
.is_ok());
assert!(Index::exists(&directory).unwrap());
assert!(Index::exists(directory.as_ref()).unwrap());
assert!(Index::create(
directory,
Schema::builder().build(),
@@ -636,7 +657,7 @@ mod tests {
}
#[test]
fn test_index_on_commit_reload_policy() {
fn test_index_on_commit_reload_policy() -> crate::Result<()> {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_in_ram(schema);
@@ -646,7 +667,7 @@ mod tests {
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &index, &reader);
test_index_on_commit_reload_policy_aux(field, &index, &reader)
}
#[cfg(feature = "mmap")]
@@ -658,7 +679,7 @@ mod tests {
use tempfile::TempDir;
#[test]
fn test_index_on_commit_reload_policy_mmap() {
fn test_index_on_commit_reload_policy_mmap() -> crate::Result<()> {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new().unwrap();
@@ -670,7 +691,7 @@ mod tests {
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &index, &reader);
test_index_on_commit_reload_policy_aux(field, &index, &reader)
}
#[test]
@@ -685,7 +706,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual)
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.add_document(doc!(field=>1u64))?;
let (sender, receiver) = crossbeam::channel::unbounded();
let _handle = index.directory_mut().watch(WatchCallback::new(move || {
let _ = sender.send(());
@@ -699,7 +720,7 @@ mod tests {
}
#[test]
fn test_index_on_commit_reload_policy_different_directories() {
fn test_index_on_commit_reload_policy_different_directories() -> crate::Result<()> {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new().unwrap();
@@ -712,10 +733,14 @@ mod tests {
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
test_index_on_commit_reload_policy_aux(field, &write_index, &reader)
}
}
fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
fn test_index_on_commit_reload_policy_aux(
field: Field,
index: &Index,
reader: &IndexReader,
) -> crate::Result<()> {
let mut reader_index = reader.index();
let (sender, receiver) = crossbeam::channel::unbounded();
let _watch_handle = reader_index
@@ -723,9 +748,9 @@ mod tests {
.watch(WatchCallback::new(move || {
let _ = sender.send(());
}));
let mut writer = index.writer_for_tests().unwrap();
let mut writer = index.writer_for_tests()?;
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.add_document(doc!(field=>1u64))?;
writer.commit().unwrap();
// We need a loop here because it is possible for notify to send more than
// one modify event. It was observed on CI on MacOS.
@@ -735,7 +760,7 @@ mod tests {
break;
}
}
writer.add_document(doc!(field=>2u64));
writer.add_document(doc!(field=>2u64))?;
writer.commit().unwrap();
// ... Same as above
loop {
@@ -744,37 +769,37 @@ mod tests {
break;
}
}
Ok(())
}
// This test will not pass on windows, because windows
// prevent deleting files that are MMapped.
#[cfg(not(target_os = "windows"))]
#[test]
fn garbage_collect_works_as_intended() {
fn garbage_collect_works_as_intended() -> crate::Result<()> {
let directory = RamDirectory::create();
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create(directory.clone(), schema, IndexSettings::default()).unwrap();
let index = Index::create(directory.clone(), schema, IndexSettings::default())?;
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
for i in 0u64..8_000u64 {
writer.add_document(doc!(field => i));
writer.add_document(doc!(field => i))?;
}
let (sender, receiver) = crossbeam::channel::unbounded();
let _handle = directory.watch(WatchCallback::new(move || {
let _ = sender.send(());
}));
writer.commit().unwrap();
writer.commit()?;
let mem_right_after_commit = directory.total_mem_usage();
assert!(receiver.recv().is_ok());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 8_000);
writer.wait_merging_threads().unwrap();
writer.wait_merging_threads()?;
let mem_right_after_merge_finished = directory.total_mem_usage();
reader.reload().unwrap();
@@ -786,5 +811,6 @@ mod tests {
mem_right_after_merge_finished,
mem_right_after_commit
);
Ok(())
}
}

View File

@@ -101,6 +101,7 @@ impl SegmentMeta {
/// Returns the list of files that
/// are required for the segment meta.
/// Note: Some of the returned files may not exist depending on the state of the segment.
///
/// This is useful as the way tantivy removes files
/// is by removing all files that have been created by tantivy

View File

@@ -1,6 +1,5 @@
use std::io;
use crate::common::BinarySerializable;
use crate::directory::FileSlice;
use crate::positions::PositionReader;
use crate::postings::TermInfo;
@@ -8,6 +7,7 @@ use crate::postings::{BlockSegmentPostings, SegmentPostings};
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::termdict::TermDictionary;
use common::BinarySerializable;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.

View File

@@ -88,7 +88,7 @@ impl Searcher {
&self.segment_readers
}
/// Returns the segment_reader associated with the given segment_ordinal
/// Returns the segment_reader associated with the given segment_ord
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
&self.segment_readers[segment_ord as usize]
}

View File

@@ -2,8 +2,11 @@ use crate::core::InvertedIndexReader;
use crate::core::Segment;
use crate::core::SegmentComponent;
use crate::core::SegmentId;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::fastfield::DeleteBitSet;
use crate::error::DataCorruption;
use crate::fastfield::intersect_alive_bitsets;
use crate::fastfield::AliveBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
@@ -14,7 +17,6 @@ use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermDictionary;
use crate::DocId;
use crate::{common::CompositeFile, error::DataCorruption};
use fail::fail_point;
use std::fmt;
use std::sync::Arc;
@@ -46,7 +48,7 @@ pub struct SegmentReader {
fieldnorm_readers: FieldNormReaders,
store_file: FileSlice,
delete_bitset_opt: Option<DeleteBitSet>,
alive_bitset_opt: Option<AliveBitSet>,
schema: Schema,
}
@@ -71,14 +73,12 @@ impl SegmentReader {
/// Return the number of documents that have been
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.delete_bitset()
.map(|delete_set| delete_set.num_deleted() as DocId)
.unwrap_or(0u32)
self.max_doc - self.num_docs
}
/// Returns true iff some of the documents of the segment have been deleted.
pub fn has_deletes(&self) -> bool {
self.delete_bitset().is_some()
self.num_deleted_docs() > 0
}
/// Accessor to a segment's fast field reader given a field.
@@ -141,6 +141,14 @@ impl SegmentReader {
/// Open a new segment for reading.
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
Self::open_with_custom_alive_set(segment, None)
}
/// Open a new segment for reading.
pub fn open_with_custom_alive_set(
segment: &Segment,
custom_bitset: Option<AliveBitSet>,
) -> crate::Result<SegmentReader> {
let termdict_file = segment.open_read(SegmentComponent::Terms)?;
let termdict_composite = CompositeFile::open(&termdict_file)?;
@@ -165,29 +173,36 @@ impl SegmentReader {
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_field_readers =
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::Delete)?;
let delete_bitset = DeleteBitSet::open(delete_data)?;
Some(delete_bitset)
let original_bitset = if segment.meta().has_deletes() {
let delete_file_slice = segment.open_read(SegmentComponent::Delete)?;
let delete_data = delete_file_slice.read_bytes()?;
Some(AliveBitSet::open(delete_data))
} else {
None
};
let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset);
let max_doc = segment.meta().max_doc();
let num_docs = alive_bitset_opt
.as_ref()
.map(|alive_bitset| alive_bitset.num_alive_docs() as u32)
.unwrap_or(max_doc);
Ok(SegmentReader {
inv_idx_reader_cache: Default::default(),
max_doc: segment.meta().max_doc(),
num_docs: segment.meta().num_docs(),
num_docs,
max_doc,
termdict_composite,
postings_composite,
fast_fields_readers: fast_field_readers,
fieldnorm_readers,
segment_id: segment.id(),
store_file,
delete_bitset_opt,
alive_bitset_opt,
positions_composite,
schema,
})
@@ -273,21 +288,25 @@ impl SegmentReader {
/// Returns the bitset representing
/// the documents that have been deleted.
pub fn delete_bitset(&self) -> Option<&DeleteBitSet> {
self.delete_bitset_opt.as_ref()
pub fn alive_bitset(&self) -> Option<&AliveBitSet> {
self.alive_bitset_opt.as_ref()
}
/// Returns true iff the `doc` is marked
/// as deleted.
pub fn is_deleted(&self, doc: DocId) -> bool {
self.delete_bitset()
self.alive_bitset()
.map(|delete_set| delete_set.is_deleted(doc))
.unwrap_or(false)
}
/// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
if let Some(alive_bitset) = &self.alive_bitset_opt {
Box::new(alive_bitset.iter_alive())
} else {
Box::new(0u32..self.max_doc)
}
}
/// Summarize total space usage of this segment.
@@ -300,14 +319,29 @@ impl SegmentReader {
self.fast_fields_readers.space_usage(),
self.fieldnorm_readers.space_usage(),
self.get_store_reader()?.space_usage(),
self.delete_bitset_opt
self.alive_bitset_opt
.as_ref()
.map(DeleteBitSet::space_usage)
.map(AliveBitSet::space_usage)
.unwrap_or(0),
))
}
}
fn intersect_alive_bitset(
left_opt: Option<AliveBitSet>,
right_opt: Option<AliveBitSet>,
) -> Option<AliveBitSet> {
match (left_opt, right_opt) {
(Some(left), Some(right)) => {
assert_eq!(left.bitset().max_value(), right.bitset().max_value());
Some(intersect_alive_bitsets(left, right))
}
(Some(left), None) => Some(left),
(None, Some(right)) => Some(right),
(None, None) => None,
}
}
impl fmt::Debug for SegmentReader {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "SegmentReader({:?})", self.segment_id)
@@ -330,10 +364,10 @@ mod test {
{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"));
index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?;
index_writer.add_document(doc!(name => "cap"))?;
// we should now have one segment with two docs
index_writer.delete_term(Term::from_field_text(name, "horse"));
index_writer.delete_term(Term::from_field_text(name, "cap"));
@@ -356,10 +390,10 @@ mod test {
{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"));
index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?;
index_writer.add_document(doc!(name => "cap"))?;
// we should now have one segment with two docs
index_writer.commit()?;
}

View File

@@ -1,18 +1,17 @@
use crate::common::BinarySerializable;
use crate::common::CountingWriter;
use crate::common::VInt;
use crate::directory::FileSlice;
use crate::directory::{TerminatingWrite, WritePtr};
use crate::schema::Field;
use crate::space_usage::FieldUsage;
use crate::space_usage::PerFieldSpaceUsage;
use common::BinarySerializable;
use common::CountingWriter;
use common::HasLen;
use common::VInt;
use std::collections::HashMap;
use std::io::{self, Read, Write};
use std::iter::ExactSizeIterator;
use std::ops::Range;
use super::HasLen;
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
pub struct FileAddr {
field: Field,
@@ -188,10 +187,10 @@ impl CompositeFile {
mod test {
use super::{CompositeFile, CompositeWrite};
use crate::common::BinarySerializable;
use crate::common::VInt;
use crate::directory::{Directory, RamDirectory};
use crate::schema::Field;
use common::BinarySerializable;
use common::VInt;
use std::io::Write;
use std::path::Path;

View File

@@ -230,3 +230,15 @@ where
Box::new(self.clone())
}
}
impl Clone for Box<dyn Directory> {
fn clone(&self) -> Self {
self.box_clone()
}
}
impl<T: Directory + 'static> From<T> for Box<dyn Directory> {
fn from(t: T) -> Self {
Box::new(t)
}
}
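A hedged sketch, not part of the diff: this blanket impl is what lets the `Index` constructors above accept `T: Into<Box<dyn Directory>>`. A concrete directory such as `RamDirectory` is boxed through it, while an already-boxed `Box<dyn Directory>` is accepted via the standard reflexive `From<T> for T` conversion.

use crate::directory::{Directory, RamDirectory};
use crate::schema::Schema;
use crate::Index;

fn open_with_either_form(schema: Schema) -> crate::Result<()> {
    // Concrete directory: converted by the blanket impl above.
    let _index_a = Index::open_or_create(RamDirectory::create(), schema.clone())?;
    // Already-boxed trait object: goes through the reflexive conversion.
    let dir: Box<dyn Directory> = Box::new(RamDirectory::create());
    let _index_b = Index::open_or_create(dir, schema)?;
    Ok(())
}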

View File

@@ -1,7 +1,7 @@
use stable_deref_trait::StableDeref;
use crate::common::HasLen;
use crate::directory::OwnedBytes;
use common::HasLen;
use std::fmt;
use std::ops::Range;
use std::sync::{Arc, Weak};
@@ -32,12 +32,6 @@ impl FileHandle for &'static [u8] {
}
}
impl<T: Deref<Target = [u8]>> HasLen for T {
fn len(&self) -> usize {
self.deref().len()
}
}
impl<B> From<B> for FileSlice
where
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync,
@@ -178,7 +172,7 @@ impl HasLen for FileSlice {
#[cfg(test)]
mod tests {
use super::{FileHandle, FileSlice};
use crate::common::HasLen;
use common::HasLen;
use std::io;
#[test]

View File

@@ -1,10 +1,10 @@
use crate::directory::error::Incompatibility;
use crate::directory::FileSlice;
use crate::{
common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen},
directory::{AntiCallToken, TerminatingWrite},
Version, INDEX_FORMAT_VERSION,
};
use common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen};
use crc32fast::Hasher;
use serde::{Deserialize, Serialize};
use std::io;
@@ -156,10 +156,8 @@ mod tests {
use crate::directory::footer::Footer;
use crate::directory::OwnedBytes;
use crate::{
common::BinarySerializable,
directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice},
};
use crate::directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice};
use common::BinarySerializable;
use std::io;
#[test]

View File

@@ -1,4 +1,4 @@
use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
use crate::core::MANAGED_FILEPATH;
use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::GarbageCollectionResult;
@@ -64,7 +64,7 @@ fn save_managed_paths(
impl ManagedDirectory {
/// Wraps a directory as managed directory.
pub fn wrap<Dir: Directory>(directory: Dir) -> crate::Result<ManagedDirectory> {
pub fn wrap(directory: Box<dyn Directory>) -> crate::Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
@@ -76,14 +76,14 @@ impl ManagedDirectory {
)
})?;
Ok(ManagedDirectory {
directory: Box::new(directory),
directory,
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
})),
})
}
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
directory: Box::new(directory),
directory,
meta_informations: Arc::default(),
}),
io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()),
@@ -248,24 +248,15 @@ impl ManagedDirectory {
Ok(footer.crc() == crc)
}
/// List files for which checksum does not match content
pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
let mut managed_paths = self
/// List all managed files
pub fn list_managed_files(&self) -> HashSet<PathBuf> {
let managed_paths = self
.meta_informations
.read()
.expect("Managed directory rlock poisoned in list damaged.")
.managed_paths
.clone();
managed_paths.remove(*META_FILEPATH);
let mut damaged_files = HashSet::new();
for path in managed_paths {
if !self.validate_checksum(&path)? {
damaged_files.insert(path);
}
}
Ok(damaged_files)
managed_paths
}
}
@@ -336,7 +327,6 @@ mod tests_mmap_specific {
use crate::directory::{Directory, ManagedDirectory, MmapDirectory, TerminatingWrite};
use std::collections::HashSet;
use std::fs::OpenOptions;
use std::io::Write;
use std::path::{Path, PathBuf};
use tempfile::TempDir;
@@ -350,7 +340,7 @@ mod tests_mmap_specific {
let test_path2: &'static Path = Path::new("some_path_for_test_2");
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
let write_file = managed_directory.open_write(test_path1).unwrap();
write_file.terminate().unwrap();
managed_directory
@@ -365,7 +355,7 @@ mod tests_mmap_specific {
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
assert!(managed_directory.exists(test_path1).unwrap());
assert!(!managed_directory.exists(test_path2).unwrap());
let living_files: HashSet<PathBuf> = HashSet::new();
@@ -384,7 +374,7 @@ mod tests_mmap_specific {
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
let mut write = managed_directory.open_write(test_path1).unwrap();
write.write_all(&[0u8, 1u8]).unwrap();
write.terminate().unwrap();
@@ -405,39 +395,4 @@ mod tests_mmap_specific {
}
assert!(!managed_directory.exists(test_path1).unwrap());
}
#[test]
fn test_checksum() -> crate::Result<()> {
let test_path1: &'static Path = Path::new("some_path_for_test");
let test_path2: &'static Path = Path::new("other_test_path");
let tempdir = TempDir::new().unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let mmap_directory = MmapDirectory::open(&tempdir_path)?;
let managed_directory = ManagedDirectory::wrap(mmap_directory)?;
let mut write = managed_directory.open_write(test_path1)?;
write.write_all(&[0u8, 1u8])?;
write.terminate()?;
let mut write = managed_directory.open_write(test_path2)?;
write.write_all(&[3u8, 4u8, 5u8])?;
write.terminate()?;
let read_file = managed_directory.open_read(test_path2)?.read_bytes()?;
assert_eq!(read_file.as_slice(), &[3u8, 4u8, 5u8]);
assert!(managed_directory.list_damaged().unwrap().is_empty());
let mut corrupted_path = tempdir_path;
corrupted_path.push(test_path2);
let mut file = OpenOptions::new().write(true).open(&corrupted_path)?;
file.write_all(&[255u8])?;
file.flush()?;
drop(file);
let damaged = managed_directory.list_damaged()?;
assert_eq!(damaged.len(), 1);
assert!(damaged.contains(test_path2));
Ok(())
}
}

View File

@@ -74,20 +74,12 @@ pub struct CacheInfo {
pub mmapped: Vec<PathBuf>,
}
#[derive(Default)]
struct MmapCache {
counters: CacheCounters,
cache: HashMap<PathBuf, WeakArcBytes>,
}
impl Default for MmapCache {
fn default() -> MmapCache {
MmapCache {
counters: CacheCounters::default(),
cache: HashMap::new(),
}
}
}
impl MmapCache {
fn get_info(&self) -> CacheInfo {
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
@@ -242,7 +234,7 @@ impl MmapDirectory {
}
let fd = open_opts.open(&self.inner.root_path)?;
fd.sync_all()?;
fd.sync_data()?;
Ok(())
}
@@ -296,8 +288,7 @@ impl Write for SafeFileWriter {
}
fn flush(&mut self) -> io::Result<()> {
self.0.flush()?;
self.0.sync_all()
Ok(())
}
}
@@ -309,7 +300,9 @@ impl Seek for SafeFileWriter {
impl TerminatingWrite for SafeFileWriter {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
self.flush()
self.0.flush()?;
self.0.sync_data()?;
Ok(())
}
}
@@ -339,6 +332,7 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
let mut tempfile = tempfile::Builder::new().tempfile_in(&parent_path)?;
tempfile.write_all(content)?;
tempfile.flush()?;
tempfile.as_file_mut().sync_data()?;
tempfile.into_temp_path().persist(path)?;
Ok(())
}
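A hedged standalone sketch of the same pattern, not part of the diff: write to a temp file in the target's directory, flush, `sync_data`, then atomically rename over the destination. The `sync_data` call before `persist` is exactly what this compare adds; without it, a crash shortly after the rename could expose a truncated file under the target path.

use std::io::{self, Write};
use std::path::Path;

fn atomic_write_sketch(path: &Path, content: &[u8]) -> io::Result<()> {
    let parent = path.parent().expect("target path must have a parent directory");
    let mut tmp = tempfile::Builder::new().tempfile_in(parent)?;
    tmp.write_all(content)?;
    tmp.flush()?;                        // drain userspace buffers
    tmp.as_file_mut().sync_data()?;      // force the bytes to disk (the added fdatasync)
    tmp.into_temp_path().persist(path)?; // atomic rename over the destination
    Ok(())
}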
@@ -485,13 +479,14 @@ mod tests {
// The following tests are specific to the MmapDirectory
use super::*;
use crate::indexer::LogMergePolicy;
use crate::Index;
use crate::ReloadPolicy;
use crate::{common::HasLen, indexer::LogMergePolicy};
use crate::{
schema::{Schema, SchemaBuilder, TEXT},
IndexSettings,
};
use common::HasLen;
#[test]
fn test_open_non_existent_path() {
@@ -581,8 +576,8 @@ mod tests {
}
#[test]
fn test_mmap_released() {
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
fn test_mmap_released() -> crate::Result<()> {
let mmap_directory = MmapDirectory::create_from_tempdir()?;
let mut schema_builder: SchemaBuilder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -591,31 +586,30 @@ mod tests {
let index =
Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_num_segments(3);
index_writer.set_merge_policy(Box::new(log_merge_policy));
for _num_commits in 0..10 {
for _ in 0..10 {
index_writer.add_document(doc!(text_field=>"abc"));
index_writer.add_document(doc!(text_field=>"abc"))?;
}
index_writer.commit().unwrap();
index_writer.commit()?;
}
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
.try_into()?;
for _ in 0..4 {
index_writer.add_document(doc!(text_field=>"abc"));
index_writer.commit().unwrap();
reader.reload().unwrap();
index_writer.add_document(doc!(text_field=>"abc"))?;
index_writer.commit()?;
reader.reload()?;
}
index_writer.wait_merging_threads().unwrap();
index_writer.wait_merging_threads()?;
reader.reload().unwrap();
reader.reload()?;
let num_segments = reader.searcher().segment_readers().len();
assert!(num_segments <= 4);
let num_components_except_deletes_and_tempstore =
@@ -626,5 +620,6 @@ mod tests {
);
}
assert!(mmap_directory.get_cache_info().mmapped.is_empty());
Ok(())
}
}

View File

@@ -20,6 +20,9 @@ mod watch_event_router;
/// Errors specific to the directory module.
pub mod error;
mod composite_file;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::directory::DirectoryLock;
pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};

View File

@@ -1,9 +1,10 @@
use crate::core::META_FILEPATH;
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
use crate::directory::AntiCallToken;
use crate::directory::WatchCallbackList;
use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle};
use crate::directory::{TerminatingWrite, WritePtr};
use crate::{common::HasLen, core::META_FILEPATH};
use common::HasLen;
use fail::fail_point;
use std::collections::HashMap;
use std::fmt;
@@ -17,13 +18,6 @@ use super::FileHandle;
/// Writer associated with the `RamDirectory`
///
/// The Writer just writes a buffer.
///
/// # Panics
///
/// On drop, if the writer was left in a *dirty* state.
/// That is, if flush was not called after the last call
/// to write.
///
struct VecWriter {
path: PathBuf,
shared_directory: RamDirectory,
@@ -45,7 +39,7 @@ impl VecWriter {
impl Drop for VecWriter {
fn drop(&mut self) {
if !self.is_flushed {
panic!(
warn!(
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
self.path
)
@@ -220,14 +214,8 @@ impl Directory for RamDirectory {
}
fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> {
fail_point!("RamDirectory::atomic_write", |msg| Err(io::Error::new(
io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
)));
let path_buf = PathBuf::from(path);
self.fs.write().unwrap().write(path_buf, data);
if path == *META_FILEPATH {
let _ = self.fs.write().unwrap().watch_router.broadcast();
}

View File

@@ -118,15 +118,6 @@ mod ram_directory_tests {
}
}
#[test]
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
let test_path: &'static Path = Path::new("some_path_for_test");
let ram_directory = RamDirectory::create();
let mut write_file = ram_directory.open_write(test_path).unwrap();
assert!(write_file.write_all(&[4]).is_ok());
}
fn test_simple(directory: &dyn Directory) -> crate::Result<()> {
let test_path: &'static Path = Path::new("some_path_for_test");
let mut write_file = directory.open_write(test_path)?;

View File

@@ -1,4 +1,4 @@
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::DocId;
use std::borrow::Borrow;
use std::borrow::BorrowMut;
@@ -85,11 +85,11 @@ pub trait DocSet: Send {
/// Returns the number documents matching.
/// Calling this method consumes the `DocSet`.
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
let mut count = 0u32;
let mut doc = self.doc();
while doc != TERMINATED {
if !delete_bitset.is_deleted(doc) {
if alive_bitset.is_alive(doc) {
count += 1u32;
}
doc = self.advance();
@@ -130,8 +130,8 @@ impl<'a> DocSet for &'a mut dyn DocSet {
(**self).size_hint()
}
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
(**self).count(delete_bitset)
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
(**self).count(alive_bitset)
}
fn count_including_deleted(&mut self) -> u32 {
@@ -160,9 +160,9 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.size_hint()
}
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count(delete_bitset)
unboxed.count(alive_bitset)
}
fn count_including_deleted(&mut self) -> u32 {

View File

@@ -0,0 +1,224 @@
use crate::space_usage::ByteCount;
use crate::DocId;
use common::intersect_bitsets;
use common::BitSet;
use common::ReadOnlyBitSet;
use ownedbytes::OwnedBytes;
use std::io;
use std::io::Write;
/// Write an alive `BitSet`,
///
/// where `alive_bitset` is the set of alive `DocId`s.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io::Result<()> {
alive_bitset.serialize(writer)?;
Ok(())
}
/// Set of alive `DocId`s.
#[derive(Clone)]
pub struct AliveBitSet {
num_alive_docs: usize,
bitset: ReadOnlyBitSet,
}
/// Intersects two AliveBitSets in a new one.
/// The two bitsets need to have the same max_value.
pub fn intersect_alive_bitsets(left: AliveBitSet, right: AliveBitSet) -> AliveBitSet {
assert_eq!(left.bitset().max_value(), right.bitset().max_value());
let bitset = intersect_bitsets(left.bitset(), right.bitset());
let num_alive_docs = bitset.len();
AliveBitSet {
num_alive_docs,
bitset,
}
}
impl AliveBitSet {
#[cfg(test)]
pub(crate) fn for_test_from_deleted_docs(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
assert!(deleted_docs.iter().all(|&doc| doc < max_doc));
let mut bitset = BitSet::with_max_value_and_full(max_doc);
for &doc in deleted_docs {
bitset.remove(doc);
}
let mut alive_bitset_buffer = Vec::new();
write_alive_bitset(&bitset, &mut alive_bitset_buffer).unwrap();
let alive_bitset_bytes = OwnedBytes::new(alive_bitset_buffer);
Self::open(alive_bitset_bytes)
}
pub(crate) fn from_bitset(bitset: &BitSet) -> AliveBitSet {
let readonly_bitset = ReadOnlyBitSet::from(bitset);
AliveBitSet::from(readonly_bitset)
}
/// Opens an alive bitset from its serialized bytes.
pub fn open(bytes: OwnedBytes) -> AliveBitSet {
let bitset = ReadOnlyBitSet::open(bytes);
AliveBitSet::from(bitset)
}
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
#[inline]
pub fn is_alive(&self, doc: DocId) -> bool {
self.bitset.contains(doc)
}
/// Returns true iff the document has been marked as deleted.
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
!self.is_alive(doc)
}
/// Iterate over the alive doc_ids.
#[inline]
pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ {
self.bitset.iter()
}
/// Get underlying bitset
#[inline]
pub fn bitset(&self) -> &ReadOnlyBitSet {
&self.bitset
}
/// The number of alive docs.
pub fn num_alive_docs(&self) -> usize {
self.num_alive_docs
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.bitset().num_bytes()
}
}
impl From<ReadOnlyBitSet> for AliveBitSet {
fn from(bitset: ReadOnlyBitSet) -> AliveBitSet {
let num_alive_docs = bitset.len();
AliveBitSet {
num_alive_docs,
bitset,
}
}
}
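A small sketch, not part of the original file: it shows the alive/deleted duality and the intersection helper, using only the test constructor defined above.

#[cfg(test)]
mod intersection_sketch {
    use super::{intersect_alive_bitsets, AliveBitSet};

    #[test]
    fn sketch_intersection() {
        // Six docs; the left set deletes doc 1, the right set deletes doc 4.
        let left = AliveBitSet::for_test_from_deleted_docs(&[1], 6);
        let right = AliveBitSet::for_test_from_deleted_docs(&[4], 6);
        let both = intersect_alive_bitsets(left, right);
        // A doc stays alive only if it is alive in both sets.
        assert_eq!(both.iter_alive().collect::<Vec<_>>(), vec![0, 2, 3, 5]);
        assert_eq!(both.num_alive_docs(), 4);
        assert!(both.is_deleted(1) && both.is_deleted(4));
    }
}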
#[cfg(test)]
mod tests {
use super::AliveBitSet;
#[test]
fn test_alive_bitset_empty() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[], 10);
for doc in 0..10 {
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
assert!(!alive_bitset.is_deleted(doc));
}
assert_eq!(alive_bitset.num_alive_docs(), 10);
}
#[test]
fn test_alive_bitset() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[1, 9], 10);
assert!(alive_bitset.is_alive(0));
assert!(alive_bitset.is_deleted(1));
assert!(alive_bitset.is_alive(2));
assert!(alive_bitset.is_alive(3));
assert!(alive_bitset.is_alive(4));
assert!(alive_bitset.is_alive(5));
assert!(alive_bitset.is_alive(6));
assert!(alive_bitset.is_alive(6));
assert!(alive_bitset.is_alive(7));
assert!(alive_bitset.is_alive(8));
assert!(alive_bitset.is_deleted(9));
for doc in 0..10 {
assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
}
assert_eq!(alive_bitset.num_alive_docs(), 8);
}
#[test]
fn test_alive_bitset_iter_minimal() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[7], 8);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
}
#[test]
fn test_alive_bitset_iter_small() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 2, 3, 6], 7);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, vec![1, 4, 5]);
}
#[test]
fn test_alive_bitset_iter() {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000], 1001);
let data: Vec<_> = alive_bitset.iter_alive().collect();
assert_eq!(data, (2..=999).collect::<Vec<_>>());
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::AliveBitSet;
use rand::prelude::IteratorRandom;
use rand::thread_rng;
use test::Bencher;
fn get_alive() -> Vec<u32> {
let mut data = (0..1_000_000_u32).collect::<Vec<u32>>();
for _ in 0..1_000_000 / 8 {
remove_rand(&mut data);
}
data
}
fn remove_rand(raw: &mut Vec<u32>) {
let i = (0..raw.len()).choose(&mut thread_rng()).unwrap();
raw.remove(i);
}
#[bench]
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_deletebitset_access(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| {
(0..1_000_000_u32)
.filter(|doc| alive_bitset.is_alive(*doc))
.collect::<Vec<_>>()
});
}
#[bench]
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| {
(0..1_000_000_u32)
.filter(|doc| alive_bitset.is_alive(*doc))
.collect::<Vec<_>>()
});
}
}

View File

@@ -18,11 +18,11 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]));
index_writer.add_document(doc!(bytes_field=>vec![]));
index_writer.add_document(doc!(bytes_field=>vec![255u8]));
index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]));
index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]));
index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]))?;
index_writer.add_document(doc!(bytes_field=>vec![]))?;
index_writer.add_document(doc!(bytes_field=>vec![255u8]))?;
index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]))?;
index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
@@ -47,7 +47,7 @@ mod tests {
index_writer.add_document(doc!(
field => b"tantivy".as_ref(),
field => b"lucene".as_ref()
));
))?;
index_writer.commit()?;
Ok(index.reader()?.searcher())
}

View File

@@ -1,143 +0,0 @@
use crate::common::{BitSet, HasLen};
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::WritePtr;
use crate::space_usage::ByteCount;
use crate::DocId;
use std::io;
use std::io::Write;
/// Write a delete `BitSet`
///
/// where `delete_bitset` is the set of deleted `DocId`.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_delete_bitset(
delete_bitset: &BitSet,
max_doc: u32,
writer: &mut WritePtr,
) -> io::Result<()> {
let mut byte = 0u8;
let mut shift = 0u8;
for doc in 0..max_doc {
if delete_bitset.contains(doc) {
byte |= 1 << shift;
}
if shift == 7 {
writer.write_all(&[byte])?;
shift = 0;
byte = 0;
} else {
shift += 1;
}
}
if max_doc % 8 > 0 {
writer.write_all(&[byte])?;
}
Ok(())
}
/// Set of deleted `DocId`s.
#[derive(Clone)]
pub struct DeleteBitSet {
data: OwnedBytes,
num_deleted: usize,
}
impl DeleteBitSet {
#[cfg(test)]
pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet {
use crate::directory::{Directory, RamDirectory, TerminatingWrite};
use std::path::Path;
assert!(docs.iter().all(|&doc| doc < max_doc));
let mut bitset = BitSet::with_max_value(max_doc);
for &doc in docs {
bitset.insert(doc);
}
let directory = RamDirectory::create();
let path = Path::new("dummydeletebitset");
let mut wrt = directory.open_write(path).unwrap();
write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
wrt.terminate().unwrap();
let file = directory.open_read(path).unwrap();
Self::open(file).unwrap()
}
/// Opens a delete bitset given its file.
pub fn open(file: FileSlice) -> crate::Result<DeleteBitSet> {
let bytes = file.read_bytes()?;
let num_deleted: usize = bytes
.as_slice()
.iter()
.map(|b| b.count_ones() as usize)
.sum();
Ok(DeleteBitSet {
data: bytes,
num_deleted,
})
}
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
pub fn is_alive(&self, doc: DocId) -> bool {
!self.is_deleted(doc)
}
/// Returns true iff the document has been marked as deleted.
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
let byte_offset = doc / 8u32;
let b: u8 = self.data.as_slice()[byte_offset as usize];
let shift = (doc & 7u32) as u8;
b & (1u8 << shift) != 0
}
/// The number of deleted docs
pub fn num_deleted(&self) -> usize {
self.num_deleted
}
/// Summarize total space usage of this bitset.
pub fn space_usage(&self) -> ByteCount {
self.data.len()
}
}
impl HasLen for DeleteBitSet {
fn len(&self) -> usize {
self.num_deleted
}
}
#[cfg(test)]
mod tests {
use super::DeleteBitSet;
use crate::common::HasLen;
#[test]
fn test_delete_bitset_empty() {
let delete_bitset = DeleteBitSet::for_test(&[], 10);
for doc in 0..10 {
assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
}
assert_eq!(delete_bitset.len(), 0);
}
#[test]
fn test_delete_bitset() {
let delete_bitset = DeleteBitSet::for_test(&[1, 9], 10);
assert!(delete_bitset.is_alive(0));
assert!(delete_bitset.is_deleted(1));
assert!(delete_bitset.is_alive(2));
assert!(delete_bitset.is_alive(3));
assert!(delete_bitset.is_alive(4));
assert!(delete_bitset.is_alive(5));
assert!(delete_bitset.is_alive(6));
assert!(delete_bitset.is_alive(6));
assert!(delete_bitset.is_alive(7));
assert!(delete_bitset.is_alive(8));
assert!(delete_bitset.is_deleted(9));
for doc in 0..10 {
assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
}
assert_eq!(delete_bitset.len(), 2);
}
}

View File

@@ -95,7 +95,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
@@ -118,7 +118,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
@@ -141,7 +141,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
@@ -164,7 +164,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
@@ -187,8 +187,8 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.add_document(Document::default());
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
index_writer.add_document(Document::default())?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
@@ -210,8 +210,8 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::default());
index_writer.add_document(Document::default());
index_writer.add_document(Document::default())?;
index_writer.add_document(Document::default())?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher

View File

@@ -23,9 +23,10 @@ values stored.
Read access performance is comparable to that of an array lookup.
*/
pub use self::alive_bitset::intersect_alive_bitsets;
pub use self::alive_bitset::write_alive_bitset;
pub use self::alive_bitset::AliveBitSet;
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
@@ -40,14 +41,14 @@ pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::Cardinality;
use crate::schema::FieldType;
use crate::schema::Value;
use crate::DocId;
use crate::{
chrono::{NaiveDateTime, Utc},
schema::Type,
};
use crate::{common, DocId};
mod alive_bitset;
mod bytes;
mod delete;
mod error;
mod facet_reader;
mod multivalued;
@@ -213,8 +214,7 @@ fn value_to_u64(value: &Value) -> u64 {
mod tests {
use super::*;
use crate::common::CompositeFile;
use crate::common::HasLen;
use crate::directory::CompositeFile;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::Field;
@@ -222,6 +222,7 @@ mod tests {
use crate::schema::FAST;
use crate::schema::{Document, IntOptions};
use crate::{Index, SegmentId, SegmentReader};
use common::HasLen;
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
@@ -496,18 +497,18 @@ mod tests {
}
#[test]
fn test_merge_missing_date_fast_field() {
fn test_merge_missing_date_fast_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
index_writer.commit().unwrap();
index_writer.add_document(doc!());
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()))?;
index_writer.commit()?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
let reader = index.reader()?;
let segment_ids: Vec<SegmentId> = reader
.searcher()
.segment_readers()
@@ -516,10 +517,10 @@ mod tests {
.collect();
assert_eq!(segment_ids.len(), 2);
let merge_future = index_writer.merge(&segment_ids[..]);
let merge_res = futures::executor::block_on(merge_future);
assert!(merge_res.is_ok());
assert!(reader.reload().is_ok());
futures::executor::block_on(merge_future)?;
reader.reload()?;
assert_eq!(reader.searcher().segment_readers().len(), 1);
Ok(())
}
#[test]
@@ -528,7 +529,7 @@ mod tests {
}
#[test]
fn test_datefastfield() {
fn test_datefastfield() -> crate::Result<()> {
use crate::fastfield::FastValue;
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", FAST);
@@ -538,22 +539,22 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(1i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(2i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(3i64.to_u64())
));
))?;
index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(4i64.to_u64())
));
))?;
index_writer.add_document(doc!(
multi_date_field => crate::DateTime::from_u64(5i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(6i64.to_u64())
));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
@@ -580,6 +581,7 @@ mod tests {
assert_eq!(dates[0].timestamp(), 5i64);
assert_eq!(dates[1].timestamp(), 6i64);
}
Ok(())
}
}
@@ -588,7 +590,7 @@ mod bench {
use super::tests::FIELD;
use super::tests::{generate_permutation, SCHEMA};
use super::*;
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::FastFieldReader;
use std::collections::HashMap;

View File

@@ -8,17 +8,25 @@ pub use self::writer::MultiValuedFastFieldWriter;
mod tests {
use crate::collector::TopDocs;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::Cardinality;
use crate::schema::Facet;
use crate::schema::IntOptions;
use crate::schema::Schema;
use crate::schema::INDEXED;
use crate::Document;
use crate::Index;
use crate::Term;
use chrono::Duration;
use futures::executor::block_on;
use proptest::prop_oneof;
use proptest::proptest;
use proptest::strategy::Strategy;
use test_env_log::test;
#[test]
fn test_multivalued_u64() {
fn test_multivalued_u64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field(
"multifield",
@@ -26,17 +34,17 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=>1u64, field=>3u64));
index_writer.add_document(doc!());
index_writer.add_document(doc!(field=>4u64));
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
assert!(index_writer.commit().is_ok());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64, field=>3u64))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(field=>4u64))?;
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64))?;
index_writer.commit()?;
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let mut vals = Vec::new();
let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
let multi_value_reader = segment_reader.fast_fields().u64s(field)?;
{
multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4u64]);
@@ -49,10 +57,11 @@ mod tests {
multi_value_reader.get_vals(1, &mut vals);
assert!(vals.is_empty());
}
Ok(())
}
#[test]
fn test_multivalued_date() {
fn test_multivalued_date() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field(
"multi_date_field",
@@ -65,40 +74,37 @@ mod tests {
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
let first_time_stamp = chrono::Utc::now();
index_writer.add_document(
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
);
index_writer.add_document(doc!(time_i=>0i64));
)?;
index_writer.add_document(doc!(time_i=>0i64))?;
// add one second
index_writer
.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
index_writer.add_document(
doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64),
)?;
// add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64))?;
// add three seconds
index_writer
.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(
doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64),
)?;
index_writer.commit()?;
let reader = index.reader().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let reader = searcher.segment_reader(0);
assert_eq!(reader.num_docs(), 5);
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
let query = parser.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))?;
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
assert_eq!(results.len(), 1);
for (_score, doc_address) in results {
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
let retrieved_doc = searcher.doc(doc_address)?;
assert_eq!(
retrieved_doc
.get_first(date_field)
@@ -120,12 +126,8 @@ mod tests {
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
let query = parser.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))?;
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
assert_eq!(results.len(), 1);
@@ -157,10 +159,8 @@ mod tests {
(first_time_stamp + Duration::seconds(1)).to_rfc3339(),
(first_time_stamp + Duration::seconds(3)).to_rfc3339()
);
let query = parser.parse_query(&range_q).expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
let query = parser.parse_query(&range_q)?;
let results = searcher.search(&query, &TopDocs::with_limit(5))?;
assert_eq!(results.len(), 2);
for (i, doc_pair) in results.iter().enumerate() {
@@ -188,16 +188,16 @@ mod tests {
retrieved_doc
.get_first(time_i)
.expect("cannot find value")
.i64_value()
.expect("value not of i64 type"),
time_i_val
.i64_value(),
Some(time_i_val)
);
}
}
Ok(())
}
#[test]
fn test_multivalued_i64() {
fn test_multivalued_i64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_i64_field(
"multifield",
@@ -205,14 +205,14 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(field=> 1i64, field => 3i64));
index_writer.add_document(doc!());
index_writer.add_document(doc!(field=> -4i64));
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
assert!(index_writer.commit().is_ok());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=> 1i64, field => 3i64))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(field=> -4i64))?;
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64))?;
index_writer.commit()?;
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let mut vals = Vec::new();
let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
@@ -224,18 +224,125 @@ mod tests {
assert!(vals.is_empty());
multi_value_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
Ok(())
}
fn test_multivalued_no_panic(ops: &[IndexingOp]) -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field(
"multifield",
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
match id % 3 {
0 => {
index_writer.add_document(doc!())?;
}
1 => {
let mut doc = Document::new();
for _ in 0..5001 {
doc.add_u64(field, id as u64);
}
index_writer.add_document(doc)?;
}
_ => {
let mut doc = Document::new();
doc.add_u64(field, id as u64);
index_writer.add_document(doc)?;
}
};
}
IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(field, id as u64));
}
IndexingOp::Commit => {
index_writer.commit().unwrap();
}
IndexingOp::Merge => {
let segment_ids = index.searchable_segment_ids()?;
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids))?;
index_writer.segment_updater().wait_merging_thread()?;
}
}
}
}
index_writer.commit()?;
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if !segment_ids.is_empty() {
block_on(index_writer.merge(&segment_ids)).unwrap();
assert!(index_writer.wait_merging_threads().is_ok());
}
}
Ok(())
}
#[derive(Debug, Clone, Copy)]
enum IndexingOp {
AddDoc { id: u32 },
DeleteDoc { id: u32 },
Commit,
Merge,
}
fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u32..10u32).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u32..10u32).prop_map(|id| IndexingOp::AddDoc { id }),
(0u32..2u32).prop_map(|_| IndexingOp::Commit),
(0u32..1u32).prop_map(|_| IndexingOp::Merge),
]
}
proptest! {
#[test]
fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
}
}
#[test]
fn test_multivalued_proptest_off_by_one_bug_1151() {
use IndexingOp::*;
let ops = [
AddDoc { id: 3 },
AddDoc { id: 1 },
AddDoc { id: 3 },
Commit,
Merge,
];
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
}
#[test]
#[ignore]
fn test_many_facets() {
fn test_many_facets() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_facet_field("facetfield", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
for i in 0..100_000 {
index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
index_writer
.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())))?;
}
assert!(index_writer.commit().is_ok());
index_writer.commit()?;
Ok(())
}
}
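For orientation, a minimal sketch of how the multi-valued fast field readers exercised above are read outside of the test harness. It only relies on calls that appear in this diff (`fast_fields().u64s(...)`, `get_vals(...)`); `searcher` and the multi-valued u64 `field` are assumed to be built as in the tests.

// Sketch only: assumptions noted above.
let mut vals: Vec<u64> = Vec::new();
for segment_reader in searcher.segment_readers() {
    let multi_value_reader = segment_reader.fast_fields().u64s(field)?;
    for doc_id in 0..segment_reader.max_doc() {
        multi_value_reader.get_vals(doc_id, &mut vals);
        // `vals` now holds every value stored for `doc_id` in this segment.
    }
}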

View File

@@ -94,24 +94,22 @@ mod tests {
use crate::schema::{Cardinality, Facet, IntOptions, Schema, INDEXED};
#[test]
fn test_multifastfield_reader() {
fn test_multifastfield_reader() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index
.writer_for_tests()
.expect("Failed to create index writer.");
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from("/category/cat2"),
facet_field => Facet::from("/category/cat1"),
));
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
index_writer.commit().expect("Commit failed");
let searcher = index.reader().unwrap().searcher();
))?;
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")))?;
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
let mut facet_reader = segment_reader.facet_reader(facet_field)?;
let mut facet = Facet::root();
{
@@ -145,10 +143,11 @@ mod tests {
facet_reader.facet_ords(2, &mut vals);
assert_eq!(&vals[..], &[4]);
}
Ok(())
}
#[test]
fn test_multifastfield_reader_min_max() {
fn test_multifastfield_reader_min_max() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field_options = IntOptions::default()
.set_indexed()
@@ -163,15 +162,16 @@ mod tests {
item_field => 2i64,
item_field => 3i64,
item_field => -2i64,
));
index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64));
index_writer.add_document(doc!(item_field => 4i64));
index_writer.commit().expect("Commit failed");
let searcher = index.reader().unwrap().searcher();
))?;
index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64))?;
index_writer.add_document(doc!(item_field => 4i64))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let field_reader = segment_reader.fast_fields().i64s(item_field).unwrap();
let field_reader = segment_reader.fast_fields().i64s(item_field)?;
assert_eq!(field_reader.min_value(), -2);
assert_eq!(field_reader.max_value(), 6);
Ok(())
}
}

View File

@@ -1,6 +1,5 @@
use super::FastValue;
use crate::common::BinarySerializable;
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::{Directory, RamDirectory, WritePtr};
@@ -8,6 +7,7 @@ use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
use crate::schema::FAST;
use crate::DocId;
use common::BinarySerializable;
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;

View File

@@ -1,4 +1,4 @@
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};

View File

@@ -1,8 +1,8 @@
use crate::common::BinarySerializable;
use crate::common::CompositeWrite;
use crate::common::CountingWriter;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::schema::Field;
use common::BinarySerializable;
use common::CountingWriter;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
@@ -105,9 +105,7 @@ impl CompositeFastFieldSerializer {
&fastfield_accessor,
&mut estimations,
);
if let Some(broken_estimation) = estimations
.iter()
.find(|estimation| estimation.0 == f32::NAN)
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{
warn!(
"broken estimation for fast field codec {}",

View File

@@ -1,12 +1,12 @@
use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats;
use super::FastFieldDataAccess;
use crate::common;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
use crate::termdict::TermOrdinal;
use common;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;

View File

@@ -1,5 +1,5 @@
use super::{fieldnorm_to_id, id_to_fieldnorm};
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::schema::Field;

View File

@@ -1,4 +1,4 @@
use crate::common::CompositeWrite;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::schema::Field;
use std::io;

View File

@@ -1,4 +1,8 @@
use crate::schema;
use crate::Index;
use crate::IndexSettings;
use crate::IndexSortByField;
use crate::Order;
use crate::Searcher;
use crate::{doc, schema::*};
use rand::thread_rng;
@@ -35,7 +39,7 @@ fn test_functional_store() -> crate::Result<()> {
let mut doc_set: Vec<u64> = Vec::new();
let mut doc_id = 0u64;
for iteration in 0..500 {
for iteration in 0..get_num_iterations() {
dbg!(iteration);
let num_docs: usize = rng.gen_range(0..4);
if !doc_set.is_empty() {
@@ -45,7 +49,7 @@ fn test_functional_store() -> crate::Result<()> {
}
for _ in 0..num_docs {
doc_set.push(doc_id);
index_writer.add_document(doc!(id_field=>doc_id));
index_writer.add_document(doc!(id_field=>doc_id))?;
doc_id += 1;
}
index_writer.commit()?;
@@ -56,16 +60,37 @@ fn test_functional_store() -> crate::Result<()> {
Ok(())
}
fn get_num_iterations() -> usize {
std::env::var("NUM_FUNCTIONAL_TEST_ITERATIONS")
.map(|str| str.parse().unwrap())
.unwrap_or(2000)
}
#[test]
#[ignore]
fn test_functional_indexing() -> crate::Result<()> {
fn test_functional_indexing_sorted() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED);
let id_field = schema_builder.add_u64_field("id", INDEXED | FAST);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?;
let mut index_builder = Index::builder().schema(schema);
index_builder = index_builder.settings(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Desc,
}),
..Default::default()
});
let index = index_builder.create_from_tempdir().unwrap();
let reader = index.reader()?;
let mut rng = thread_rng();
@@ -75,7 +100,7 @@ fn test_functional_indexing() -> crate::Result<()> {
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..200 {
for _ in 0..get_num_iterations() {
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit()?;
@@ -98,7 +123,85 @@ fn test_functional_indexing() -> crate::Result<()> {
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
index_writer.add_document(doc);
doc.add_text(text_field, get_text());
index_writer.add_document(doc)?;
}
}
Ok(())
}
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
proident, sunt in culpa qui officia deserunt mollit anim id est \
laborum.";
fn get_text() -> String {
use rand::seq::SliceRandom;
let mut rng = thread_rng();
let tokens: Vec<_> = LOREM.split(' ').collect();
let random_val = rng.gen_range(0..20);
(0..random_val)
.map(|_| tokens.choose(&mut rng).unwrap())
.cloned()
.collect::<Vec<_>>()
.join(" ")
}
#[test]
#[ignore]
fn test_functional_indexing_unsorted() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema)?;
let reader = index.reader()?;
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..get_num_iterations() {
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit()?;
committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear();
reader.reload()?;
let searcher = reader.searcher();
// check that everything is correct.
check_index_content(
&searcher,
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
)?;
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
doc.add_text(text_field, get_text());
index_writer.add_document(doc)?;
}
}
Ok(())

324
src/indexer/demuxer.rs Normal file
View File

@@ -0,0 +1,324 @@
use common::BitSet;
use itertools::Itertools;
use crate::fastfield::AliveBitSet;
use crate::{merge_filtered_segments, Directory, Index, IndexSettings, Segment, SegmentOrdinal};
/// DemuxMapping can be used to reorganize data from multiple segments.
///
/// DemuxMapping is useful in multitenant settings, in which each document may actually belong to a different tenant.
/// It allows reorganizing documents as follows:
///
/// e.g. if you have two tenant ids TENANT_A and TENANT_B and two segments with
/// the documents (simplified)
/// Seg 1 [TENANT_A, TENANT_B]
/// Seg 2 [TENANT_A, TENANT_B]
///
/// You may want to group your documents to
/// Seg 1 [TENANT_A, TENANT_A]
/// Seg 2 [TENANT_B, TENANT_B]
///
/// Demuxing is the tool for that.
/// Semantically you can define a mapping from [old segment ordinal, old doc_id] -> [new segment ordinal].
#[derive(Debug, Default)]
pub struct DemuxMapping {
/// mapping[old segment ordinal][doc_id] = new segment ordinal
mapping: Vec<DocIdToSegmentOrdinal>,
}
/// DocIdToSegmentOrdinal maps from doc_id within a segment to the new segment ordinal for demuxing.
///
/// For every source segment there is a `DocIdToSegmentOrdinal` to distribute its doc_ids.
#[derive(Debug, Default)]
pub struct DocIdToSegmentOrdinal {
doc_id_index_to_segment_ord: Vec<SegmentOrdinal>,
}
impl DocIdToSegmentOrdinal {
/// Creates a new DocIdToSegmentOrdinal with room for `max_doc` doc_ids.
/// Initially all doc_ids point to segment ordinal 0 and need to be set
/// via the `set` method.
pub fn with_max_doc(max_doc: usize) -> Self {
DocIdToSegmentOrdinal {
doc_id_index_to_segment_ord: vec![0; max_doc],
}
}
/// Returns the number of documents in this mapping.
/// It should be equal to the `max_doc` of the segment it targets.
pub fn max_doc(&self) -> u32 {
self.doc_id_index_to_segment_ord.len() as u32
}
/// Associates a doc_id with an output `SegmentOrdinal`.
pub fn set(&mut self, doc_id: u32, segment_ord: SegmentOrdinal) {
self.doc_id_index_to_segment_ord[doc_id as usize] = segment_ord;
}
/// Iterates over the new SegmentOrdinal in the order of the doc_id.
pub fn iter(&self) -> impl Iterator<Item = SegmentOrdinal> + '_ {
self.doc_id_index_to_segment_ord.iter().cloned()
}
}
impl DemuxMapping {
/// Adds a DocIdToSegmentOrdinal. The order of the add calls
/// defines the old segment ordinal, e.g. the first add corresponds to ordinal 0.
pub fn add(&mut self, segment_mapping: DocIdToSegmentOrdinal) {
self.mapping.push(segment_mapping);
}
/// Returns the old number of segments.
pub fn get_old_num_segments(&self) -> usize {
self.mapping.len()
}
}
fn docs_for_segment_ord(
doc_id_to_segment_ord: &DocIdToSegmentOrdinal,
target_segment_ord: SegmentOrdinal,
) -> AliveBitSet {
let mut bitset = BitSet::with_max_value(doc_id_to_segment_ord.max_doc());
for doc_id in doc_id_to_segment_ord
.iter()
.enumerate()
.filter(|(_doc_id, new_segment_ord)| *new_segment_ord == target_segment_ord)
.map(|(doc_id, _)| doc_id)
{
// add document if segment ordinal = target segment ordinal
bitset.insert(doc_id as u32);
}
AliveBitSet::from_bitset(&bitset)
}
fn get_alive_bitsets(
demux_mapping: &DemuxMapping,
target_segment_ord: SegmentOrdinal,
) -> Vec<AliveBitSet> {
demux_mapping
.mapping
.iter()
.map(|doc_id_to_segment_ord| {
docs_for_segment_ord(doc_id_to_segment_ord, target_segment_ord)
})
.collect_vec()
}
/// Demux the segments according to `demux_mapping`. See `DemuxMapping`.
/// The number of `output_directories` needs to match the maximum new segment ordinal in `demux_mapping`.
///
/// The ordinals of `segments` need to match the ordinals provided in `demux_mapping`.
pub fn demux(
segments: &[Segment],
demux_mapping: &DemuxMapping,
target_settings: IndexSettings,
output_directories: Vec<Box<dyn Directory>>,
) -> crate::Result<Vec<Index>> {
let mut indices = vec![];
for (target_segment_ord, output_directory) in output_directories.into_iter().enumerate() {
let delete_bitsets = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
.into_iter()
.map(Some)
.collect_vec();
let index = merge_filtered_segments(
segments,
target_settings.clone(),
delete_bitsets,
output_directory,
)?;
indices.push(index);
}
Ok(indices)
}
#[cfg(test)]
mod tests {
use crate::{
collector::TopDocs,
directory::RamDirectory,
query::QueryParser,
schema::{Schema, TEXT},
DocAddress, Term,
};
use super::*;
#[test]
fn test_demux_map_to_deletebitset() {
let max_value = 2;
let mut demux_mapping = DemuxMapping::default();
//segment ordinal 0 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 0);
demux_mapping.add(doc_id_to_segment);
//segment ordinal 1 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 1);
demux_mapping.add(doc_id_to_segment);
{
let bit_sets_for_demuxing_to_segment_ord_0 = get_alive_bitsets(&demux_mapping, 0);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[0].is_deleted(0),
true
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[0].is_deleted(1),
false
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[1].is_deleted(0),
true
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_0[1].is_deleted(1),
true
);
}
{
let bit_sets_for_demuxing_to_segment_ord_1 = get_alive_bitsets(&demux_mapping, 1);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[0].is_deleted(0),
false
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[0].is_deleted(1),
true
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[1].is_deleted(0),
false
);
assert_eq!(
bit_sets_for_demuxing_to_segment_ord_1[1].is_deleted(1),
false
);
}
}
#[test]
fn test_demux_segments() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"texto1"))?;
index_writer.add_document(doc!(text_field=>"texto2"))?;
index_writer.commit()?;
index
};
let second_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"texto3"))?;
index_writer.add_document(doc!(text_field=>"texto4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
segments.extend(second_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
let mut demux_mapping = DemuxMapping::default();
{
let max_value = 2;
//segment ordinal 0 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 0);
demux_mapping.add(doc_id_to_segment);
//segment ordinal 1 mapping
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 1);
demux_mapping.add(doc_id_to_segment);
}
assert_eq!(demux_mapping.get_old_num_segments(), 2);
let demuxed_indices = demux(
&segments,
&demux_mapping,
target_settings,
vec![
Box::new(RamDirectory::default()),
Box::new(RamDirectory::default()),
],
)?;
{
let index = &demuxed_indices[0];
let segments = index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 1);
let searcher = index.reader().unwrap().searcher();
{
let text_field = index.schema().get_field("text").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("texto1"), vec![] as Vec<u32>);
assert_eq!(do_search("texto2"), vec![0]);
}
}
{
let index = &demuxed_indices[1];
let segments = index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 3);
let searcher = index.reader().unwrap().searcher();
{
let text_field = index.schema().get_field("text").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("texto1"), vec![0]);
assert_eq!(do_search("texto2"), vec![] as Vec<u32>);
assert_eq!(do_search("texto3"), vec![1]);
assert_eq!(do_search("texto4"), vec![2]);
}
}
Ok(())
}
}
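To make the demux API above easier to follow, here is a minimal usage sketch built only from the items introduced in this file (`DocIdToSegmentOrdinal`, `DemuxMapping`, `demux`). The `segments` and `target_settings` values are assumed to come from existing source indices, as in the test above.

// Sketch only: route doc 0 of a two-doc segment to output segment 1 and
// doc 1 to output segment 0, then demux into two RAM directories.
let mut demux_mapping = DemuxMapping::default();
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(2);
doc_id_to_segment.set(0, 1);
doc_id_to_segment.set(1, 0);
demux_mapping.add(doc_id_to_segment);
let demuxed_indices = demux(
    &segments,           // assumed: segments gathered from the source indices
    &demux_mapping,
    target_settings,     // assumed: IndexSettings cloned from a source index
    vec![
        Box::new(RamDirectory::default()),
        Box::new(RamDirectory::default()),
    ],
)?;
assert_eq!(demuxed_indices.len(), 2);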

View File

@@ -2,23 +2,23 @@
//! to get mappings from old doc_id to new doc_id and vice versa, after sorting
//!
use super::{merger::SegmentReaderWithOrdinal, SegmentWriter};
use super::SegmentWriter;
use crate::{
schema::{Field, Schema},
DocId, IndexSortByField, Order, TantivyError,
DocId, IndexSortByField, Order, SegmentOrdinal, TantivyError,
};
use std::{cmp::Reverse, ops::Index};
/// Struct to provide mapping from new doc_id to old doc_id and segment.
#[derive(Clone)]
pub(crate) struct SegmentDocidMapping<'a> {
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
pub(crate) struct SegmentDocIdMapping {
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
is_trivial: bool,
}
impl<'a> SegmentDocidMapping<'a> {
impl SegmentDocIdMapping {
pub(crate) fn new(
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
is_trivial: bool,
) -> Self {
Self {
@@ -26,7 +26,7 @@ impl<'a> SegmentDocidMapping<'a> {
is_trivial,
}
}
pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentReaderWithOrdinal)> {
pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentOrdinal)> {
self.new_doc_id_to_old_and_segment.iter()
}
pub(crate) fn len(&self) -> usize {
@@ -40,15 +40,15 @@ impl<'a> SegmentDocidMapping<'a> {
self.is_trivial
}
}
impl<'a> Index<usize> for SegmentDocidMapping<'a> {
type Output = (DocId, SegmentReaderWithOrdinal<'a>);
impl Index<usize> for SegmentDocIdMapping {
type Output = (DocId, SegmentOrdinal);
fn index(&self, idx: usize) -> &Self::Output {
&self.new_doc_id_to_old_and_segment[idx]
}
}
impl<'a> IntoIterator for SegmentDocidMapping<'a> {
type Item = (DocId, SegmentReaderWithOrdinal<'a>);
impl IntoIterator for SegmentDocIdMapping {
type Item = (DocId, SegmentOrdinal);
type IntoIter = std::vec::IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter {
@@ -146,7 +146,7 @@ mod tests_indexsorting {
fn create_test_index(
index_settings: Option<IndexSettings>,
text_field_options: TextOptions,
) -> Index {
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
@@ -166,19 +166,20 @@ mod tests_indexsorting {
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram().unwrap();
let index = index_builder.create_in_ram()?;
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(my_number=>40_u64));
index_writer
.add_document(doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64));
index_writer.add_document(doc!(my_number=>100_u64));
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(my_number=>40_u64))?;
index_writer.add_document(
doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64),
)?;
index_writer.add_document(doc!(my_number=>100_u64))?;
index_writer.add_document(
doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"),
);
index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ));
index_writer.commit().unwrap();
index
)?;
index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ))?;
index_writer.commit()?;
Ok(index)
}
fn get_text_options() -> TextOptions {
TextOptions::default().set_indexing_options(
@@ -203,7 +204,7 @@ mod tests_indexsorting {
for option in options {
//let options = get_text_options();
// no index_sort
let index = create_test_index(None, option.clone());
let index = create_test_index(None, option.clone())?;
let my_text_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher();
@@ -225,7 +226,7 @@ mod tests_indexsorting {
..Default::default()
}),
option.clone(),
);
)?;
let my_text_field = index.schema().get_field("text_field").unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -257,7 +258,7 @@ mod tests_indexsorting {
..Default::default()
}),
option.clone(),
);
)?;
let my_string_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher();
@@ -287,7 +288,7 @@ mod tests_indexsorting {
#[test]
fn test_sort_index_get_documents() -> crate::Result<()> {
// default baseline
let index = create_test_index(None, get_text_options());
let index = create_test_index(None, get_text_options())?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
@@ -316,7 +317,7 @@ mod tests_indexsorting {
..Default::default()
}),
get_text_options(),
);
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
@@ -341,7 +342,7 @@ mod tests_indexsorting {
..Default::default()
}),
get_text_options(),
);
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
@@ -356,7 +357,7 @@ mod tests_indexsorting {
#[test]
fn test_sort_index_test_string_field() -> crate::Result<()> {
let index = create_test_index(None, get_text_options());
let index = create_test_index(None, get_text_options())?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
@@ -376,7 +377,7 @@ mod tests_indexsorting {
..Default::default()
}),
get_text_options(),
);
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -407,7 +408,7 @@ mod tests_indexsorting {
..Default::default()
}),
get_text_options(),
);
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
@@ -443,7 +444,7 @@ mod tests_indexsorting {
..Default::default()
}),
get_text_options(),
);
)?;
assert_eq!(
index.settings().sort_by_field.as_ref().unwrap().field,
"my_number".to_string()

View File

@@ -1,7 +1,6 @@
use super::operation::{AddOperation, UserOperation};
use super::segment_updater::SegmentUpdater;
use super::PreparedCommit;
use crate::common::BitSet;
use crate::core::Index;
use crate::core::Segment;
use crate::core::SegmentComponent;
@@ -12,9 +11,10 @@ use crate::directory::TerminatingWrite;
use crate::directory::{DirectoryLock, GarbageCollectionResult};
use crate::docset::{DocSet, TERMINATED};
use crate::error::TantivyError;
use crate::fastfield::write_delete_bitset;
use crate::fastfield::write_alive_bitset;
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
use crate::indexer::index_writer_status::IndexWriterStatus;
use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper;
use crate::indexer::MergePolicy;
@@ -24,17 +24,18 @@ use crate::schema::Document;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Opstamp;
use common::BitSet;
use crossbeam::channel;
use futures::executor::block_on;
use futures::future::Future;
use smallvec::smallvec;
use smallvec::SmallVec;
use std::mem;
use std::ops::Range;
use std::sync::Arc;
use std::thread;
use std::thread::JoinHandle;
use super::{AddBatch, AddBatchReceiver, AddBatchSender};
// Size of the margin for the heap. A segment is closed when the remaining memory
// in the heap goes below MARGIN_IN_BYTES.
pub const MARGIN_IN_BYTES: usize = 1_000_000;
@@ -50,15 +51,12 @@ pub const MAX_NUM_THREAD: usize = 8;
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
// Group of operations.
// Most of the time, users will send operations one by one, but it can be useful to
// send them as a small block to ensure that
// - all docs in the operation end up in the same segment, with contiguous doc_ids.
// - all operations in the group are committed at the same time, making the group
// atomic.
type OperationGroup = SmallVec<[AddOperation; 4]>;
type OperationSender = channel::Sender<OperationGroup>;
type OperationReceiver = channel::Receiver<OperationGroup>;
fn error_in_index_worker_thread(context: &str) -> TantivyError {
TantivyError::ErrorInThread(format!(
"{}. A worker thread encounterred an error (io::Error most likely) or panicked.",
context
))
}
/// `IndexWriter` is the user entry-point to add document to an index.
///
@@ -77,8 +75,8 @@ pub struct IndexWriter {
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,
operation_receiver: OperationReceiver,
operation_sender: OperationSender,
index_writer_status: IndexWriterStatus,
operation_sender: AddBatchSender,
segment_updater: SegmentUpdater,
@@ -93,7 +91,7 @@ pub struct IndexWriter {
}
fn compute_deleted_bitset(
delete_bitset: &mut BitSet,
alive_bitset: &mut BitSet,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
@@ -114,7 +112,7 @@ fn compute_deleted_bitset(
let mut doc_matching_deleted_term = docset.doc();
while doc_matching_deleted_term != TERMINATED {
if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) {
delete_bitset.insert(doc_matching_deleted_term);
alive_bitset.remove(doc_matching_deleted_term);
might_have_changed = true;
}
doc_matching_deleted_term = docset.advance();
@@ -141,7 +139,7 @@ pub(crate) fn advance_deletes(
return Ok(());
}
if segment_entry.delete_bitset().is_none() && segment_entry.delete_cursor().get().is_none() {
if segment_entry.alive_bitset().is_none() && segment_entry.delete_cursor().get().is_none() {
// There has been no `DeleteOperation` between the segment status and `target_opstamp`.
return Ok(());
}
@@ -149,38 +147,32 @@ pub(crate) fn advance_deletes(
let segment_reader = SegmentReader::open(&segment)?;
let max_doc = segment_reader.max_doc();
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
None => BitSet::with_max_value(max_doc),
let mut alive_bitset: BitSet = match segment_entry.alive_bitset() {
Some(previous_alive_bitset) => (*previous_alive_bitset).clone(),
None => BitSet::with_max_value_and_full(max_doc),
};
let num_deleted_docs_before = segment.meta().num_deleted_docs();
compute_deleted_bitset(
&mut delete_bitset,
&mut alive_bitset,
&segment_reader,
segment_entry.delete_cursor(),
&DocToOpstampMapping::None,
target_opstamp,
)?;
// TODO optimize
// It should be possible to do something smarter by manipulation bitsets directly
// to compute this union.
if let Some(seg_delete_bitset) = segment_reader.delete_bitset() {
for doc in 0u32..max_doc {
if seg_delete_bitset.is_deleted(doc) {
delete_bitset.insert(doc);
}
}
if let Some(seg_alive_bitset) = segment_reader.alive_bitset() {
alive_bitset.intersect_update(seg_alive_bitset.bitset());
}
let num_deleted_docs: u32 = delete_bitset.len() as u32;
let num_alive_docs: u32 = alive_bitset.len() as u32;
let num_deleted_docs = max_doc - num_alive_docs;
if num_deleted_docs > num_deleted_docs_before {
// There are new deletes. We need to write a new delete file.
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
let mut delete_file = segment.open_write(SegmentComponent::Delete)?;
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
write_alive_bitset(&alive_bitset, &mut delete_file)?;
delete_file.terminate()?;
}
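The hunk above switches from accumulating deleted docs to maintaining an alive bitset. A minimal sketch of the resulting accounting, using only the `common::BitSet` calls that appear in this diff and an assumed `max_doc` of 5:

// Sketch only: start with all docs alive, delete two, and derive the counters
// the way advance_deletes now does.
let max_doc = 5u32;
let mut alive_bitset = BitSet::with_max_value_and_full(max_doc);
alive_bitset.remove(1);
alive_bitset.remove(3);
let num_alive_docs: u32 = alive_bitset.len() as u32;
let num_deleted_docs = max_doc - num_alive_docs;
assert_eq!(num_alive_docs, 3);
assert_eq!(num_deleted_docs, 2);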
@@ -191,10 +183,10 @@ pub(crate) fn advance_deletes(
fn index_documents(
memory_budget: usize,
segment: Segment,
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> crate::Result<bool> {
) -> crate::Result<()> {
let schema = segment.schema();
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
@@ -213,7 +205,7 @@ fn index_documents(
}
if !segment_updater.is_alive() {
return Ok(false);
return Ok(());
}
let max_doc = segment_writer.max_doc();
@@ -226,21 +218,20 @@ fn index_documents(
let segment_with_max_doc = segment.with_max_doc(max_doc);
let delete_bitset_opt =
apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
let alive_bitset_opt = apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
let meta = segment_with_max_doc.meta().clone();
meta.untrack_temp_docstore();
// update segment_updater inventory to remove tempstore
let segment_entry = SegmentEntry::new(meta, delete_cursor, delete_bitset_opt);
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
block_on(segment_updater.schedule_add_segment(segment_entry))?;
Ok(true)
Ok(())
}
/// `doc_opstamps` is required to be non-empty.
fn apply_deletes(
segment: &Segment,
mut delete_cursor: &mut DeleteCursor,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &[Opstamp],
) -> crate::Result<Option<BitSet>> {
if delete_cursor.get().is_none() {
@@ -259,11 +250,11 @@ fn apply_deletes(
let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);
let max_doc = segment.meta().max_doc();
let mut deleted_bitset = BitSet::with_max_value(max_doc);
let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc);
let may_have_deletes = compute_deleted_bitset(
&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
delete_cursor,
&doc_to_opstamps,
max_doc_opstamp,
)?;
@@ -287,8 +278,7 @@ impl IndexWriter {
/// should work at the same time.
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`
pub(crate) fn new(
index: &Index,
num_threads: usize,
@@ -306,7 +296,7 @@ impl IndexWriter {
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
return Err(TantivyError::InvalidArgument(err_msg));
}
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new();
@@ -324,7 +314,7 @@ impl IndexWriter {
heap_size_in_bytes_per_thread,
index: index.clone(),
operation_receiver: document_receiver,
index_writer_status: IndexWriterStatus::from(document_receiver),
operation_sender: document_sender,
segment_updater,
@@ -348,6 +338,11 @@ impl IndexWriter {
self.operation_sender = sender;
}
/// Accessor to the index.
pub fn index(&self) -> &Index {
&self.index
}
/// If there are some merging threads, blocks until they all finish their work and
/// then drop the `IndexWriter`.
pub fn wait_merging_threads(mut self) -> crate::Result<()> {
@@ -359,16 +354,14 @@ impl IndexWriter {
for join_handle in former_workers_handles {
join_handle
.join()
.expect("Indexing Worker thread panicked")
.map_err(|_| {
TantivyError::ErrorInThread("Error in indexing worker thread.".into())
})?;
.map_err(|_| error_in_index_worker_thread("Worker thread panicked."))?
.map_err(|_| error_in_index_worker_thread("Worker thread failed."))?;
}
let result = self
.segment_updater
.wait_merging_thread()
.map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
.map_err(|_| error_in_index_worker_thread("Failed to join merging thread."));
if let Err(ref e) = result {
error!("Some merging thread failed {:?}", e);
@@ -396,10 +389,18 @@ impl IndexWriter {
self.index.new_segment()
}
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
self.index_writer_status
.operation_receiver()
.ok_or_else(|| crate::TantivyError::ErrorInThread("The index writer was killed. It can happen if an indexing worker encountered an io::Error, for instance.".to_string()))
}
/// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline.
fn add_indexing_worker(&mut self) -> crate::Result<()> {
let document_receiver_clone = self.operation_receiver.clone();
let document_receiver_clone = self.operation_receiver()?;
let index_writer_bomb = self.index_writer_status.create_bomb();
let mut segment_updater = self.segment_updater.clone();
let mut delete_cursor = self.delete_queue.cursor();
@@ -410,32 +411,31 @@ impl IndexWriter {
.name(format!("thrd-tantivy-index{}", self.worker_id))
.spawn(move || {
loop {
let mut document_iterator =
document_receiver_clone.clone().into_iter().peekable();
let mut document_iterator = document_receiver_clone
.clone()
.into_iter()
.filter(|batch| !batch.is_empty())
.peekable();
// the peeking here is to avoid
// creating a new segment's files
// The peeking here is to avoid creating a new segment's files
// if no document are available.
//
// this is a valid guarantee as the
// peeked document now belongs to
// This is a valid guarantee as the peeked document now belongs to
// our local iterator.
if let Some(operations) = document_iterator.peek() {
if let Some(first) = operations.first() {
delete_cursor.skip_to(first.opstamp);
} else {
return Ok(());
}
if let Some(batch) = document_iterator.peek() {
assert!(!batch.is_empty());
delete_cursor.skip_to(batch[0].opstamp);
} else {
// No more documents.
// Happens when there is a commit, or if the `IndexWriter`
// It happens when there is a commit, or if the `IndexWriter`
// was dropped.
index_writer_bomb.defuse();
return Ok(());
}
let segment = index.new_segment();
index_documents(
mem_budget,
segment,
index.new_segment(),
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone(),
@@ -465,10 +465,8 @@ impl IndexWriter {
}
/// Detects and removes the files that are not used by the index anymore.
pub fn garbage_collect_files(
&self,
) -> impl Future<Output = crate::Result<GarbageCollectionResult>> {
self.segment_updater.schedule_garbage_collect()
pub async fn garbage_collect_files(&self) -> crate::Result<GarbageCollectionResult> {
self.segment_updater.schedule_garbage_collect().await
}
/// Deletes all documents from the index
@@ -491,7 +489,7 @@ impl IndexWriter {
/// let index = Index::create_in_ram(schema.clone());
///
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
/// index_writer.add_document(doc!(title => "The modern Promotheus"));
/// index_writer.add_document(doc!(title => "The modern Promotheus"))?;
/// index_writer.commit()?;
///
/// let clear_res = index_writer.delete_all_documents().unwrap();
@@ -535,12 +533,11 @@ impl IndexWriter {
/// when no documents are remaining.
///
/// Returns the former segment_ready channel.
#[allow(unused_must_use)]
fn recreate_document_channel(&mut self) -> OperationReceiver {
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
fn recreate_document_channel(&mut self) {
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
mem::replace(&mut self.operation_sender, document_sender);
mem::replace(&mut self.operation_receiver, document_receiver)
self.operation_sender = document_sender;
self.index_writer_status = IndexWriterStatus::from(document_receiver);
}
/// Rollback to the last commit
@@ -556,7 +553,7 @@ impl IndexWriter {
// marks the segment updater as killed. From now on, all
// segment updates will be ignored.
self.segment_updater.kill();
let document_receiver = self.operation_receiver.clone();
let document_receiver = self.operation_receiver();
// take the directory lock to create a new index_writer.
let directory_lock = self
@@ -696,14 +693,10 @@ impl IndexWriter {
/// The opstamp is an increasing `u64` that can
/// be used by the client to align commits with its own
/// document queue.
pub fn add_document(&self, document: Document) -> Opstamp {
pub fn add_document(&self, document: Document) -> crate::Result<Opstamp> {
let opstamp = self.stamper.stamp();
let add_operation = AddOperation { opstamp, document };
let send_result = self.operation_sender.send(smallvec![add_operation]);
if let Err(e) = send_result {
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
}
opstamp
self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?;
Ok(opstamp)
}
/// Gets a range of stamps from the stamper and "pops" the last stamp
@@ -716,11 +709,7 @@ impl IndexWriter {
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
let Range { start, end } = self.stamper.stamps(count + 1u64);
let last_opstamp = end - 1;
let stamps = Range {
start,
end: last_opstamp,
};
(last_opstamp, stamps)
(last_opstamp, start..last_opstamp)
}
/// Runs a group of document operations ensuring that the operations are
@@ -739,16 +728,20 @@ impl IndexWriter {
/// Like adds and deletes (see `IndexWriter.add_document` and
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
/// visible to readers only after calling `commit()`.
pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
let count = user_operations.len() as u64;
pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp>
where
I: IntoIterator<Item = UserOperation>,
I::IntoIter: ExactSizeIterator,
{
let user_operations_it = user_operations.into_iter();
let count = user_operations_it.len() as u64;
if count == 0 {
return self.stamper.stamp();
return Ok(self.stamper.stamp());
}
let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
let mut adds = OperationGroup::default();
for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
let mut adds = AddBatch::default();
for (user_op, opstamp) in user_operations_it.zip(stamps) {
match user_op {
UserOperation::Delete(term) => {
let delete_operation = DeleteOperation { opstamp, term };
@@ -760,12 +753,16 @@ impl IndexWriter {
}
}
}
let send_result = self.operation_sender.send(adds);
if let Err(e) = send_result {
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
};
self.send_add_documents_batch(adds)?;
Ok(batch_opstamp)
}
batch_opstamp
fn send_add_documents_batch(&self, add_ops: AddBatch) -> crate::Result<()> {
if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() {
Ok(())
} else {
Err(error_in_index_worker_thread("An index writer was killed."))
}
}
}
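Since `add_document` and `run` now return `crate::Result<Opstamp>` instead of panicking when the indexing pipeline is dead, callers propagate errors with `?`. A minimal sketch, assuming a text field `title` and an `Index` named `index` built as elsewhere in this diff:

// Sketch only: the field and index setup are assumptions.
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
let _single_opstamp = index_writer.add_document(doc!(title => "hello"))?;
let _batch_opstamp = index_writer.run(vec![
    UserOperation::Add(doc!(title => "world")),
    UserOperation::Delete(Term::from_field_text(title, "hello")),
])?;
index_writer.commit()?;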
@@ -831,7 +828,7 @@ mod tests {
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
];
let batch_opstamp1 = index_writer.run(operations);
let batch_opstamp1 = index_writer.run(operations).unwrap();
assert_eq!(batch_opstamp1, 2u64);
}
@@ -842,14 +839,18 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "hello1"));
index_writer.add_document(doc!(text_field => "hello2"));
index_writer
.add_document(doc!(text_field => "hello1"))
.unwrap();
index_writer
.add_document(doc!(text_field => "hello2"))
.unwrap();
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 0);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 2);
index_writer.delete_term(Term::from_field_text(text_field, "hello1"));
assert!(index_writer.commit().is_ok());
@@ -857,7 +858,7 @@ mod tests {
assert!(reader.reload().is_ok());
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 1);
let previous_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
@@ -869,7 +870,7 @@ mod tests {
assert!(reader.reload().is_ok());
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
assert_eq!(searcher.segment_reader(0u32).num_docs(), 1);
let after_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
assert_eq!(after_delete_opstamp, previous_delete_opstamp);
@@ -900,7 +901,7 @@ mod tests {
UserOperation::Delete(b_term),
];
index_writer.run(operations);
index_writer.run(operations).unwrap();
index_writer.commit().expect("failed to commit");
reader.reload().expect("failed to load searchers");
@@ -930,10 +931,10 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(3_000_000).unwrap();
let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1);
let batch_opstamp1 = index_writer.run(operations1).unwrap();
assert_eq!(batch_opstamp1, 0u64);
let operations2 = vec![];
let batch_opstamp2 = index_writer.run(operations2);
let batch_opstamp2 = index_writer.run(operations2).unwrap();
assert_eq!(batch_opstamp2, 1u64);
}
@@ -993,15 +994,14 @@ mod tests {
}
#[test]
fn test_commit_and_rollback() {
fn test_commit_and_rollback() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
.try_into()?;
let num_docs_containing = |s: &str| {
let searcher = reader.searcher();
let term = Term::from_field_text(text_field, s);
@@ -1010,136 +1010,127 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer(3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.rollback().unwrap();
let mut index_writer = index.writer(3_000_000)?;
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.rollback()?;
assert_eq!(index_writer.commit_opstamp(), 0u64);
assert_eq!(num_docs_containing("a"), 0);
{
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"c"));
}
assert!(index_writer.commit().is_ok());
reader.reload().unwrap();
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.add_document(doc!(text_field=>"c"))?;
index_writer.commit()?;
reader.reload()?;
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1);
assert_eq!(num_docs_containing("c"), 1);
}
reader.reload().unwrap();
reader.reload()?;
reader.searcher();
Ok(())
}
#[test]
fn test_with_merges() {
fn test_with_merges() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
.try_into()?;
let num_docs_containing = |s: &str| {
let term_a = Term::from_field_text(text_field, s);
reader.searcher().doc_freq(&term_a).unwrap()
};
{
// writing the segment
let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
}
index_writer.commit().expect("commit failed");
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
}
// this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed");
index_writer
.wait_merging_threads()
.expect("waiting merging thread failed");
reader.reload().unwrap();
assert_eq!(num_docs_containing("a"), 200);
assert!(index.searchable_segments().unwrap().len() < 8);
// writing the segment
let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?;
}
index_writer.commit()?;
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"))?;
}
// this should create 8 segments and trigger a merge.
index_writer.commit()?;
index_writer.wait_merging_threads()?;
reader.reload()?;
assert_eq!(num_docs_containing("a"), 200);
assert!(index.searchable_segments()?.len() < 8);
Ok(())
}
#[test]
fn test_prepare_with_commit_message() {
fn test_prepare_with_commit_message() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer(12_000_000)?;
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?;
}
{
let mut prepared_commit = index_writer.prepare_commit()?;
prepared_commit.set_payload("first commit");
prepared_commit.commit()?;
}
{
let metas = index.load_metas()?;
assert_eq!(metas.payload.unwrap(), "first commit");
}
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"))?;
}
index_writer.commit()?;
{
let metas = index.load_metas()?;
assert!(metas.payload.is_none());
}
Ok(())
}
#[test]
fn test_prepare_but_rollback() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
// writing the segment
let mut index_writer = index.writer(12_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(4, 12_000_000)?;
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
index_writer.add_document(doc!(text_field => "a"))?;
}
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
let mut prepared_commit = index_writer.prepare_commit()?;
prepared_commit.set_payload("first commit");
prepared_commit.commit().expect("commit failed");
prepared_commit.abort()?;
}
{
let metas = index.load_metas().unwrap();
assert_eq!(metas.payload.unwrap(), "first commit");
}
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
}
index_writer.commit().unwrap();
{
let metas = index.load_metas().unwrap();
assert!(metas.payload.is_none());
}
}
}
#[test]
fn test_prepare_but_rollback() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "a"));
}
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
prepared_commit.abort().expect("commit failed");
}
{
let metas = index.load_metas().unwrap();
let metas = index.load_metas()?;
assert!(metas.payload.is_none());
}
for _doc in 0..100 {
index_writer.add_document(doc!(text_field => "b"));
index_writer.add_document(doc!(text_field => "b"))?;
}
index_writer.commit().unwrap();
index_writer.commit()?;
}
let num_docs_containing = |s: &str| {
let term_a = Term::from_field_text(text_field, s);
index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap()
.try_into()?
.searcher()
.doc_freq(&term_a)
.unwrap()
};
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 100);
assert_eq!(num_docs_containing("a")?, 0);
assert_eq!(num_docs_containing("b")?, 100);
Ok(())
}
#[test]
@@ -1160,7 +1151,7 @@ mod tests {
};
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap();
let commit_tstamp = index_writer.commit().unwrap();
assert!(commit_tstamp > add_tstamp);
index_writer.delete_all_documents().unwrap();
@@ -1177,7 +1168,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap();
// commit documents - they are now available
let first_commit = index_writer.commit();
@@ -1196,7 +1187,7 @@ mod tests {
// add new documents again
for _ in 0..100 {
index_writer.add_document(doc!(text_field => "b"));
index_writer.add_document(doc!(text_field => "b")).unwrap();
}
// rollback to last commit, when index was empty
@@ -1230,7 +1221,7 @@ mod tests {
assert!(index_writer.commit().is_ok());
// add one simple doc
index_writer.add_document(doc!(text_field => "a"));
index_writer.add_document(doc!(text_field => "a")).unwrap();
assert!(index_writer.commit().is_ok());
let term_a = Term::from_field_text(text_field, "a");
@@ -1254,7 +1245,7 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
// add one simple doc
index_writer.add_document(doc!(text_field => "a"));
assert!(index_writer.add_document(doc!(text_field => "a")).is_ok());
let comm = index_writer.commit();
assert!(comm.is_ok());
let commit_tstamp = comm.unwrap();
@@ -1330,13 +1321,13 @@ mod tests {
// create and delete docs in same commit
for id in 0u64..5u64 {
index_writer.add_document(doc!(id_field => id));
index_writer.add_document(doc!(id_field => id))?;
}
for id in 2u64..4u64 {
index_writer.delete_term(Term::from_field_u64(id_field, id));
}
for id in 5u64..10u64 {
index_writer.add_document(doc!(id_field => id));
index_writer.add_document(doc!(id_field => id))?;
}
index_writer.commit()?;
index_reader.reload()?;
@@ -1361,13 +1352,24 @@ mod tests {
AddDoc { id: u64 },
DeleteDoc { id: u64 },
Commit,
Merge,
}
fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..10u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..10u64).prop_map(|id| IndexingOp::AddDoc { id }),
(0u64..2u64).prop_map(|_| IndexingOp::Commit),
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
(0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
(0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
}
fn adding_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
10 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
]
}
@@ -1393,7 +1395,7 @@ mod tests {
fn test_operation_strategy(
ops: &[IndexingOp],
sort_index: bool,
force_merge: bool,
force_end_merge: bool,
) -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED);
@@ -1435,12 +1437,16 @@ mod tests {
.settings(settings)
.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let old_reader = index.reader()?;
for &op in ops {
match op {
IndexingOp::AddDoc { id } => {
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
index_writer
.add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM));
.add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM))?;
}
IndexingOp::DeleteDoc { id } => {
index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -1448,12 +1454,21 @@ mod tests {
IndexingOp::Commit => {
index_writer.commit()?;
}
IndexingOp::Merge => {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() >= 2 {
block_on(index_writer.merge(&segment_ids)).unwrap();
assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
}
}
}
}
index_writer.commit()?;
let searcher = index.reader()?.searcher();
if force_merge {
if force_end_merge {
index_writer.wait_merging_threads()?;
let mut index_writer = index.writer_for_tests()?;
let segment_ids = index
@@ -1464,6 +1479,21 @@ mod tests {
assert!(index_writer.wait_merging_threads().is_ok());
}
}
old_reader.reload()?;
let old_searcher = old_reader.searcher();
let ids_old_searcher: HashSet<u64> = old_searcher
.segment_readers()
.iter()
.flat_map(|segment_reader| {
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
segment_reader
.doc_ids_alive()
.map(move |doc| ff_reader.get(doc))
})
.collect();
let ids: HashSet<u64> = searcher
.segment_readers()
.iter()
@@ -1476,6 +1506,19 @@ mod tests {
.collect();
let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops);
let num_docs_expected = expected_ids_and_num_occurences
.iter()
.map(|(_, id_occurences)| *id_occurences as usize)
.sum::<usize>();
assert_eq!(searcher.num_docs() as usize, num_docs_expected);
assert_eq!(old_searcher.num_docs() as usize, num_docs_expected);
assert_eq!(
ids_old_searcher,
expected_ids_and_num_occurences
.keys()
.cloned()
.collect::<HashSet<_>>()
);
assert_eq!(
ids,
expected_ids_and_num_occurences
@@ -1500,7 +1543,7 @@ mod tests {
for segment_reader in searcher.segment_readers().iter() {
let store_reader = segment_reader.get_store_reader().unwrap();
// test store iterator
for doc in store_reader.iter(segment_reader.delete_bitset()) {
for doc in store_reader.iter(segment_reader.alive_bitset()) {
let id = doc
.unwrap()
.get_first(id_field)
@@ -1570,22 +1613,42 @@ mod tests {
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(20))]
#[test]
fn test_delete_with_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
fn test_delete_with_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
}
#[test]
fn test_delete_without_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
fn test_delete_without_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
}
#[test]
fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
fn test_delete_with_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
}
#[test]
fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
fn test_delete_without_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
}
#[test]
fn test_delete_with_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
}
#[test]
fn test_delete_without_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
}
#[test]
fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
}
#[test]
fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..100)) {
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
}
}
#[test]
@@ -1610,11 +1673,11 @@ mod tests {
let mut index_writer = index.writer_for_tests()?;
// We add a doc...
index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64));
index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64))?;
// And remove it.
index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
// We add another doc.
index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64));
index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64))?;
// The expected result is a segment with
// maxdoc = 2
@@ -1626,19 +1689,19 @@ mod tests {
let segment_reader = searcher.segment_reader(0);
assert_eq!(segment_reader.max_doc(), 2);
assert_eq!(segment_reader.num_deleted_docs(), 1);
assert_eq!(segment_reader.num_docs(), 1);
Ok(())
}
#[test]
fn test_index_doc_missing_field() {
fn test_index_doc_missing_field() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let idfield = schema_builder.add_text_field("id", STRING);
schema_builder.add_text_field("optfield", STRING);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(idfield=>"myid"));
let commit = index_writer.commit();
assert!(commit.is_ok());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(idfield=>"myid"))?;
index_writer.commit()?;
Ok(())
}
}


@@ -0,0 +1,118 @@
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};
use super::AddBatchReceiver;
#[derive(Clone)]
pub(crate) struct IndexWriterStatus {
inner: Arc<Inner>,
}
impl IndexWriterStatus {
/// Returns true iff the index writer is alive.
pub fn is_alive(&self) -> bool {
self.inner.as_ref().is_alive()
}
/// Returns a copy of the operation receiver.
/// If the index writer was killed, returns None.
pub fn operation_receiver(&self) -> Option<AddBatchReceiver> {
let rlock = self
.inner
.receive_channel
.read()
.expect("This lock should never be poisoned");
rlock.as_ref().map(|receiver| receiver.clone())
}
/// Create an index writer bomb.
/// If dropped, the index writer status will be killed.
pub(crate) fn create_bomb(&self) -> IndexWriterBomb {
IndexWriterBomb {
inner: Some(self.inner.clone()),
}
}
}
struct Inner {
is_alive: AtomicBool,
receive_channel: RwLock<Option<AddBatchReceiver>>,
}
impl Inner {
fn is_alive(&self) -> bool {
self.is_alive.load(Ordering::Relaxed)
}
fn kill(&self) {
self.is_alive.store(false, Ordering::Relaxed);
self.receive_channel
.write()
.expect("This lock should never be poisoned")
.take();
}
}
impl From<AddBatchReceiver> for IndexWriterStatus {
fn from(receiver: AddBatchReceiver) -> Self {
IndexWriterStatus {
inner: Arc::new(Inner {
is_alive: AtomicBool::new(true),
receive_channel: RwLock::new(Some(receiver)),
}),
}
}
}
/// If dropped, the index writer will be killed.
/// To prevent this, clients can call `.defuse()`.
pub(crate) struct IndexWriterBomb {
inner: Option<Arc<Inner>>,
}
impl IndexWriterBomb {
/// Defuses the bomb.
///
/// This is the only way to drop the bomb without killing
/// the index writer.
pub fn defuse(mut self) {
self.inner = None;
}
}
impl Drop for IndexWriterBomb {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
inner.kill();
}
}
}
#[cfg(test)]
mod tests {
use super::IndexWriterStatus;
use crossbeam::channel;
use std::mem;
#[test]
fn test_bomb_goes_boom() {
let (_tx, rx) = channel::bounded(10);
let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx);
assert!(index_writer_status.operation_receiver().is_some());
let bomb = index_writer_status.create_bomb();
assert!(index_writer_status.operation_receiver().is_some());
mem::drop(bomb);
// boom!
assert!(index_writer_status.operation_receiver().is_none());
}
#[test]
fn test_bomb_defused() {
let (_tx, rx) = channel::bounded(10);
let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx);
assert!(index_writer_status.operation_receiver().is_some());
let bomb = index_writer_status.create_bomb();
bomb.defuse();
assert!(index_writer_status.operation_receiver().is_some());
}
}
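For orientation, the bomb above is meant to be armed by an indexing worker thread when it starts and defused only on a clean exit, so a panicking worker marks the writer as dead. A minimal sketch of that pattern, assuming a hypothetical worker function and eliding the actual indexing work:

fn index_worker(status: &IndexWriterStatus) {
    // Armed here: if this function panics before `defuse()`,
    // dropping the bomb kills the index writer status.
    let bomb = status.create_bomb();
    if let Some(receiver) = status.operation_receiver() {
        for batch in receiver {
            // ... index every AddOperation in `batch` ...
            drop(batch);
        }
    }
    // The channel closed normally: disarm the bomb.
    bomb.defuse();
}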


@@ -114,7 +114,7 @@ mod tests {
use crate::Index;
#[test]
fn create_index_test_max_merge_issue_1035() {
fn create_index_test_max_merge_issue_1035() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intval", INDEXED);
let schema = schema_builder.build();
@@ -127,34 +127,34 @@ mod tests {
log_merge_policy.set_max_docs_before_merge(1);
log_merge_policy.set_min_layer_size(0);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(log_merge_policy));
// after every commit the merge checker is started; it will only merge segments with 1
// element in them, because max_docs_before_merge is set to 1 above.
index_writer.add_document(doc!(int_field=>1_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>1_u64))?;
index_writer.commit()?;
index_writer.add_document(doc!(int_field=>2_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>2_u64))?;
index_writer.commit()?;
index_writer.add_document(doc!(int_field=>3_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>3_u64))?;
index_writer.commit()?;
index_writer.add_document(doc!(int_field=>4_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>4_u64))?;
index_writer.commit()?;
index_writer.add_document(doc!(int_field=>5_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>5_u64))?;
index_writer.commit()?;
index_writer.add_document(doc!(int_field=>6_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>6_u64))?;
index_writer.commit()?;
index_writer.add_document(doc!(int_field=>7_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>7_u64))?;
index_writer.commit()?;
index_writer.add_document(doc!(int_field=>8_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>8_u64))?;
index_writer.commit()?;
}
let _segment_ids = index
@@ -169,6 +169,7 @@ mod tests {
panic!("segment can't have more than two segments");
} // we don't know how to wait for the merge to finish; otherwise this could be a simple equality check
}
Ok(())
}
fn test_merge_policy() -> LogMergePolicy {

File diff suppressed because it is too large.


@@ -1,6 +1,7 @@
#[cfg(test)]
mod tests {
use crate::fastfield::FastFieldReader;
use crate::fastfield::{AliveBitSet, FastFieldReader};
use crate::schema::IndexRecordOption;
use crate::{
collector::TopDocs,
schema::{Cardinality, TextFieldIndexing},
@@ -16,7 +17,7 @@ mod tests {
schema::{self, BytesOptions},
DocAddress,
};
use crate::{IndexSettings, Term};
use crate::{DocSet, IndexSettings, Postings, Term};
use futures::executor::block_on;
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
@@ -38,14 +39,17 @@ mod tests {
{
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")));
index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")));
assert!(index_writer.commit().is_ok());
index_writer
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
.unwrap();
index_writer
.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")))
.unwrap();
index_writer.commit().unwrap();
index_writer
.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")))
.unwrap();
index_writer.commit().unwrap();
}
// Merging the segments
@@ -65,7 +69,7 @@ mod tests {
fn create_test_index(
index_settings: Option<IndexSettings>,
force_disjunct_segment_sort_values: bool,
) -> Index {
) -> crate::Result<Index> {
let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default()
.set_fast(Cardinality::SingleValue)
@@ -94,32 +98,34 @@ mod tests {
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram().unwrap();
let index = index_builder.create_in_ram()?;
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
// segment 1 - range 1-3
index_writer.add_document(doc!(int_field=>1_u64));
index_writer.add_document(doc!(int_field=>1_u64))?;
index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
);
index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme"));
)?;
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64),
);
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
)?;
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
)?;
assert!(index_writer.commit().is_ok());
index_writer.commit()?;
// segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64));
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
let in_val = if force_disjunct_segment_sort_values {
10_u64
} else {
1
};
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
index_writer.commit()?;
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
let int_vals = if force_disjunct_segment_sort_values {
[100_u64, 50]
@@ -128,26 +134,24 @@ mod tests {
};
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
);
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"));
)?;
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
index_writer.add_document(
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
);
)?;
index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
assert!(index_writer.commit().is_ok());
index_writer.commit()?;
}
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
block_on(index_writer.merge(&segment_ids))?;
index_writer.wait_merging_threads()?;
}
index
Ok(index)
}
#[test]
@@ -180,7 +184,8 @@ mod tests {
..Default::default()
}),
force_disjunct_segment_sort_values,
);
)
.unwrap();
let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap();
@@ -243,6 +248,36 @@ mod tests {
assert_eq!(do_search("biggest"), vec![0]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
// access doc store
{
let blubber_pos = if force_disjunct_segment_sort_values {
@@ -260,6 +295,70 @@ mod tests {
}
}
#[test]
fn test_merge_unsorted_index() {
let index = create_test_index(
Some(IndexSettings {
..Default::default()
}),
false,
)
.unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("some"), vec![1]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![4]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
}
#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(
@@ -271,7 +370,8 @@ mod tests {
..Default::default()
}),
false,
);
)
.unwrap();
let int_field = index.schema().get_field("intval").unwrap();
let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
@@ -314,7 +414,7 @@ mod tests {
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
@@ -339,6 +439,34 @@ mod tests {
assert_eq!(do_search("biggest"), vec![5]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
postings.advance();
postings.positions(&mut output);
assert_eq!(output, vec![1]);
}
// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
@@ -422,14 +550,15 @@ mod bench_sorted_index_merge {
let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap();
b.iter(|| {
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, reader)|{
let u64_reader: DynamicFastFieldReader<u64> = reader.reader
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, ordinal)|{
let reader = &merger.readers[*ordinal as usize];
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
(doc_id, reader, u64_reader)
});
// add values in order of the new docids
// add values in order of the new doc_ids
let mut val = 0;
for (doc_id, _reader, field_reader) in sorted_doc_ids {
val = field_reader.get(*doc_id);
@@ -442,7 +571,7 @@ mod bench_sorted_index_merge {
Ok(())
}
#[bench]
fn create_sorted_index_create_docid_mapping(b: &mut Bencher) -> crate::Result<()> {
fn create_sorted_index_create_doc_id_mapping(b: &mut Bencher) -> crate::Result<()> {
let sort_by_field = IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,


@@ -1,15 +1,17 @@
pub mod delete_queue;
pub mod demuxer;
pub mod doc_id_mapping;
mod doc_opstamp_mapping;
pub mod index_writer;
mod index_writer_status;
mod log_merge_policy;
mod merge_operation;
pub mod merge_policy;
pub mod merger;
mod merger_sorted_index_test;
pub mod operation;
mod prepared_commit;
pub mod prepared_commit;
mod segment_entry;
mod segment_manager;
mod segment_register;
@@ -18,6 +20,11 @@ pub mod segment_updater;
mod segment_writer;
mod stamper;
use crossbeam::channel;
use smallvec::SmallVec;
use crate::indexer::operation::AddOperation;
pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;
@@ -26,12 +33,23 @@ pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry;
pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::merge_segments;
pub use self::segment_updater::merge_filtered_segments;
pub use self::segment_updater::merge_indices;
pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;
// Batch of documents.
// Most of the time, users will send operations one by one, but it can be useful to
// send them as a small batch to ensure that
// - all docs in the batch land in the same segment, with contiguous doc_ids.
// - all operations in the batch are committed at the same time, making the batch
// atomic.
type AddBatch = SmallVec<[AddOperation; 4]>;
type AddBatchSender = channel::Sender<AddBatch>;
type AddBatchReceiver = channel::Receiver<AddBatch>;
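The batching described in the comment above is what `IndexWriter::run` exposes publicly, as exercised by the tests earlier in this diff. A minimal sketch, assuming an `index_writer` and a `text_field` already exist:

let ops = vec![
    UserOperation::Add(doc!(text_field => "a")),
    UserOperation::Delete(Term::from_field_text(text_field, "b")),
    UserOperation::Add(doc!(text_field => "c")),
];
// All three operations are stamped together and land in the same segment.
let batch_opstamp = index_writer.run(ops)?;
index_writer.commit()?;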
#[cfg(feature = "mmap")]
#[cfg(test)]
mod tests_mmap {
@@ -39,19 +57,20 @@ mod tests_mmap {
use crate::{Index, Term};
#[test]
fn test_advance_delete_bug() {
fn test_advance_delete_bug() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
let index = Index::create_from_tempdir(schema_builder.build())?;
let mut index_writer = index.writer_for_tests()?;
// there must be one deleted document in the segment
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "b"));
// we need enough data to trigger the bug (at least 32 documents)
for _ in 0..32 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"c"))?;
}
index_writer.commit().unwrap();
index_writer.commit().unwrap();
index_writer.commit()?;
index_writer.commit()?;
Ok(())
}
}


@@ -18,25 +18,38 @@ impl<'a> PreparedCommit<'a> {
}
}
/// Returns the opstamp associated to the prepared commit.
pub fn opstamp(&self) -> Opstamp {
self.opstamp
}
/// Adds an arbitrary payload to the commit.
pub fn set_payload(&mut self, payload: &str) {
self.payload = Some(payload.to_string())
}
/// Rolls back any change.
pub fn abort(self) -> crate::Result<Opstamp> {
self.index_writer.rollback()
}
/// Proceeds to commit.
/// See `.commit_async()`.
pub fn commit(self) -> crate::Result<Opstamp> {
block_on(self.commit_async())
}
/// Proceeds to commit.
///
/// Unfortunately, contrary to what `PrepareCommit` may suggest,
/// this operation is not lightweight at all:
/// at this point, deletes have not been flushed yet.
pub async fn commit_async(self) -> crate::Result<Opstamp> {
info!("committing {}", self.opstamp);
let _ = block_on(
self.index_writer
.segment_updater()
.schedule_commit(self.opstamp, self.payload),
);
self.index_writer
.segment_updater()
.schedule_commit(self.opstamp, self.payload)
.await?;
Ok(self.opstamp)
}
}
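Putting the methods above together, the two-phase commit flow looks roughly like the following sketch (error handling elided, payload string arbitrary):

let mut prepared = index_writer.prepare_commit()?;
prepared.set_payload("my payload");
// Either finish the commit and obtain its opstamp...
let opstamp = prepared.commit()?;
// ...or call `prepared.abort()?` instead to roll the pending changes back.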


@@ -1,7 +1,7 @@
use crate::common::BitSet;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::indexer::delete_queue::DeleteCursor;
use common::BitSet;
use std::fmt;
/// A segment entry describes the state of
@@ -9,18 +9,16 @@ use std::fmt;
///
/// In addition to segment `meta`,
/// it contains a few transient states
/// - `state` expresses whether the segment is already in the
/// middle of a merge
/// - `delete_bitset` is a bitset describing
/// documents that were deleted during the commit
/// - `alive_bitset` is a bitset describing
/// documents that were alive during the commit
/// itself.
/// - `delete_cursor` is the position in the delete queue.
/// Deletes happening before the cursor are reflected either
/// in the .del file or in the `delete_bitset`.
/// in the .del file or in the `alive_bitset`.
#[derive(Clone)]
pub struct SegmentEntry {
meta: SegmentMeta,
delete_bitset: Option<BitSet>,
alive_bitset: Option<BitSet>,
delete_cursor: DeleteCursor,
}
@@ -29,11 +27,11 @@ impl SegmentEntry {
pub fn new(
segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>,
alive_bitset: Option<BitSet>,
) -> SegmentEntry {
SegmentEntry {
meta: segment_meta,
delete_bitset,
alive_bitset,
delete_cursor,
}
}
@@ -41,8 +39,8 @@ impl SegmentEntry {
/// Returns a reference to the segment entry's delete bitset.
///
/// `DocId`s in this bitset are flagged as deleted.
pub fn delete_bitset(&self) -> Option<&BitSet> {
self.delete_bitset.as_ref()
pub fn alive_bitset(&self) -> Option<&BitSet> {
self.alive_bitset.as_ref()
}
/// Set the `SegmentMeta` for this segment.


@@ -66,13 +66,10 @@ impl SegmentRegister {
}
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
let mut segment_ids: Vec<SegmentMeta> = self
.segment_states
self.segment_states
.values()
.map(|segment_entry| segment_entry.meta().clone())
.collect();
segment_ids.sort_by_key(SegmentMeta::id);
segment_ids
.collect()
}
pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {


@@ -7,6 +7,7 @@ use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::core::META_FILEPATH;
use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
use crate::fastfield::AliveBitSet;
use crate::indexer::delete_queue::DeleteCursor;
use crate::indexer::index_writer::advance_deletes;
use crate::indexer::merge_operation::MergeOperationInventory;
@@ -19,12 +20,15 @@ use crate::indexer::{DefaultMergePolicy, MergePolicy};
use crate::indexer::{MergeCandidate, MergeOperation};
use crate::schema::Schema;
use crate::Opstamp;
use crate::TantivyError;
use fail::fail_point;
use futures::channel::oneshot;
use futures::executor::{ThreadPool, ThreadPoolBuilder};
use futures::future::Future;
use futures::future::TryFutureExt;
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::io;
use std::io::Write;
use std::ops::Deref;
use std::path::PathBuf;
@@ -74,6 +78,10 @@ fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()>
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
fail_point!("save_metas", |msg| Err(TantivyError::from(io::Error::new(
io::ErrorKind::Other,
msg.unwrap_or_else(|| "Undefined".to_string())
))));
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
Ok(())
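For context, the fail point added above can be triggered from tests through the `fail` crate's runtime configuration; a minimal sketch, assuming tantivy is compiled with the crate's failpoints support enabled:

// Force `save_metas` to return an error once, then switch the point off.
let scenario = fail::FailScenario::setup();
fail::cfg("save_metas", "return(simulated_io_error)").unwrap();
// ... run a commit here and assert that it fails ...
fail::cfg("save_metas", "off").unwrap();
scenario.teardown();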
@@ -159,9 +167,9 @@ fn merge(
/// meant to work if you have an IndexWriter running for the origin indices, or
/// the destination Index.
#[doc(hidden)]
pub fn merge_segments<Dir: Directory>(
pub fn merge_indices<T: Into<Box<dyn Directory>>>(
indices: &[Index],
output_directory: Dir,
output_directory: T,
) -> crate::Result<Index> {
if indices.is_empty() {
// If there are no indices to merge, there is no need to do anything.
@@ -170,19 +178,8 @@ pub fn merge_segments<Dir: Directory>(
));
}
let target_schema = indices[0].schema();
let target_settings = indices[0].settings().clone();
// let's check that all of the indices have the same schema
if indices
.iter()
.skip(1)
.any(|index| index.schema() != target_schema)
{
return Err(crate::TantivyError::InvalidArgument(
"Attempt to merge different schema indices".to_string(),
));
}
// let's check that all of the indices have the same index settings
if indices
.iter()
@@ -199,13 +196,61 @@ pub fn merge_segments<Dir: Directory>(
segments.extend(index.searchable_segments()?);
}
let mut merged_index = Index::create(output_directory, target_schema.clone(), target_settings)?;
let non_filter = segments.iter().map(|_| None).collect::<Vec<_>>();
merge_filtered_segments(&segments, target_settings, non_filter, output_directory)
}
/// Advanced: Merges a list of segments from different indices in a new index.
/// Additionally, you can provide an alive bitset for each segment to filter out doc_ids.
///
/// Returns `TantivyError` if the segment list is empty or their
/// schemas don't match.
///
/// `output_directory`: is assumed to be empty.
///
/// # Warning
/// This function does NOT check whether an `IndexWriter` is running. It is not
/// meant to work if you have an IndexWriter running for the origin indices, or
/// the destination Index.
#[doc(hidden)]
pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
segments: &[Segment],
target_settings: IndexSettings,
filter_doc_ids: Vec<Option<AliveBitSet>>,
output_directory: T,
) -> crate::Result<Index> {
if segments.is_empty() {
// If there are no segments to merge, there is no need to do anything.
return Err(crate::TantivyError::InvalidArgument(
"No segments given to marge".to_string(),
));
}
let target_schema = segments[0].schema();
// let's check that all of the indices have the same schema
if segments
.iter()
.skip(1)
.any(|index| index.schema() != target_schema)
{
return Err(crate::TantivyError::InvalidArgument(
"Attempt to merge different schema indices".to_string(),
));
}
let mut merged_index = Index::create(
output_directory,
target_schema.clone(),
target_settings.clone(),
)?;
let merged_segment = merged_index.new_segment();
let merged_segment_id = merged_segment.id();
let merger: IndexMerger = IndexMerger::open(
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
segments,
filter_doc_ids,
)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
let num_docs = merger.write(segment_serializer)?;
@@ -225,7 +270,7 @@ pub fn merge_segments<Dir: Directory>(
);
let index_meta = IndexMeta {
index_settings: indices[0].load_metas()?.index_settings, // index_settings of all segments should be the same
index_settings: target_settings, // index_settings of all segments should be the same
segments: vec![segment_meta],
schema: target_schema,
opstamp: 0u64,
@@ -306,37 +351,39 @@ impl SegmentUpdater {
*self.merge_policy.write().unwrap() = arc_merge_policy;
}
fn schedule_future<T: 'static + Send, F: Future<Output = crate::Result<T>> + 'static + Send>(
async fn schedule_task<
T: 'static + Send,
F: Future<Output = crate::Result<T>> + 'static + Send,
>(
&self,
f: F,
) -> impl Future<Output = crate::Result<T>> {
let (sender, receiver) = oneshot::channel();
if self.is_alive() {
self.pool.spawn_ok(async move {
let _ = sender.send(f.await);
});
} else {
let _ = sender.send(Err(crate::TantivyError::SystemError(
task: F,
) -> crate::Result<T> {
if !self.is_alive() {
return Err(crate::TantivyError::SystemError(
"Segment updater killed".to_string(),
)));
));
}
receiver.unwrap_or_else(|_| {
let (sender, receiver) = oneshot::channel();
self.pool.spawn_ok(async move {
let task_result = task.await;
let _ = sender.send(task_result);
});
let task_result = receiver.await;
task_result.unwrap_or_else(|_| {
let err_msg =
"A segment_updater future did not success. This should never happen.".to_string();
Err(crate::TantivyError::SystemError(err_msg))
})
}
pub fn schedule_add_segment(
&self,
segment_entry: SegmentEntry,
) -> impl Future<Output = crate::Result<()>> {
pub async fn schedule_add_segment(&self, segment_entry: SegmentEntry) -> crate::Result<()> {
let segment_updater = self.clone();
self.schedule_future(async move {
self.schedule_task(async move {
segment_updater.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options().await;
Ok(())
})
.await
}
/// Orders `SegmentManager` to remove all segments
@@ -403,11 +450,9 @@ impl SegmentUpdater {
Ok(())
}
pub fn schedule_garbage_collect(
&self,
) -> impl Future<Output = crate::Result<GarbageCollectionResult>> {
pub async fn schedule_garbage_collect(&self) -> crate::Result<GarbageCollectionResult> {
let garbage_collect_future = garbage_collect_files(self.clone());
self.schedule_future(garbage_collect_future)
self.schedule_task(garbage_collect_future).await
}
/// List the files that are useful to the index.
@@ -425,13 +470,13 @@ impl SegmentUpdater {
files
}
pub fn schedule_commit(
pub(crate) async fn schedule_commit(
&self,
opstamp: Opstamp,
payload: Option<String>,
) -> impl Future<Output = crate::Result<()>> {
) -> crate::Result<()> {
let segment_updater: SegmentUpdater = self.clone();
self.schedule_future(async move {
self.schedule_task(async move {
let segment_entries = segment_updater.purge_deletes(opstamp)?;
segment_updater.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp, payload)?;
@@ -439,6 +484,7 @@ impl SegmentUpdater {
segment_updater.consider_merge_options().await;
Ok(())
})
.await
}
fn store_meta(&self, index_meta: &IndexMeta) {
@@ -513,9 +559,7 @@ impl SegmentUpdater {
e
);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
assert!(!cfg!(test), "Merge failed.");
}
}
});
@@ -568,14 +612,14 @@ impl SegmentUpdater {
}
}
fn end_merge(
async fn end_merge(
&self,
merge_operation: MergeOperation,
mut after_merge_segment_entry: SegmentEntry,
) -> impl Future<Output = crate::Result<SegmentMeta>> {
) -> crate::Result<SegmentMeta> {
let segment_updater = self.clone();
let after_merge_segment_meta = after_merge_segment_entry.meta().clone();
let end_merge_future = self.schedule_future(async move {
self.schedule_task(async move {
info!("End merge {:?}", after_merge_segment_entry.meta());
{
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
@@ -594,9 +638,8 @@ impl SegmentUpdater {
merge_operation.segment_ids(),
advance_deletes_err
);
if cfg!(test) {
panic!("Merge failed.");
}
assert!(!cfg!(test), "Merge failed.");
// ... cancel merge
// `merge_operations` are tracked. As it is dropped,
// the segment_ids will be available again for merge.
@@ -619,8 +662,9 @@ impl SegmentUpdater {
let _ = garbage_collect_files(segment_updater).await;
Ok(())
});
end_merge_future.map_ok(|_| after_merge_segment_meta)
})
.await?;
Ok(after_merge_segment_meta)
}
/// Wait for current merging threads.
@@ -646,11 +690,19 @@ impl SegmentUpdater {
#[cfg(test)]
mod tests {
use super::merge_segments;
use super::merge_indices;
use crate::collector::TopDocs;
use crate::directory::RamDirectory;
use crate::fastfield::AliveBitSet;
use crate::indexer::merge_policy::tests::MergeWheneverPossible;
use crate::indexer::merger::IndexMerger;
use crate::indexer::segment_updater::merge_filtered_segments;
use crate::query::QueryParser;
use crate::schema::*;
use crate::Directory;
use crate::DocAddress;
use crate::Index;
use crate::Segment;
#[test]
fn test_delete_during_merge() -> crate::Result<()> {
@@ -663,19 +715,19 @@ mod tests {
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"b"))?;
}
index_writer.commit()?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"));
index_writer.add_document(doc!(text_field=>"c"))?;
index_writer.add_document(doc!(text_field=>"d"))?;
}
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
index_writer.add_document(doc!(text_field=>"e"))?;
index_writer.add_document(doc!(text_field=>"f"))?;
index_writer.commit()?;
let term = Term::from_field_text(text_field, "a");
@@ -693,6 +745,50 @@ mod tests {
Ok(())
}
#[test]
fn delete_all_docs_min() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
// writing the segment
let mut index_writer = index.writer_for_tests()?;
for _ in 0..10 {
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"b"))?;
}
index_writer.commit()?;
let seg_ids = index.searchable_segment_ids()?;
// docs exist, should have at least 1 segment
assert!(!seg_ids.is_empty());
let term = Term::from_field_text(text_field, "a");
index_writer.delete_term(term);
index_writer.commit()?;
let term = Term::from_field_text(text_field, "b");
index_writer.delete_term(term);
index_writer.commit()?;
index_writer.wait_merging_threads()?;
let reader = index.reader()?;
assert_eq!(reader.searcher().num_docs(), 0);
let seg_ids = index.searchable_segment_ids()?;
assert!(seg_ids.is_empty());
reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 0);
// empty segments should be erased
assert!(index.searchable_segment_metas()?.is_empty());
assert!(reader.searcher().segment_readers().is_empty());
Ok(())
}
#[test]
fn delete_all_docs() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
@@ -703,19 +799,19 @@ mod tests {
let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"b"))?;
}
index_writer.commit()?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"));
index_writer.add_document(doc!(text_field=>"c"))?;
index_writer.add_document(doc!(text_field=>"d"))?;
}
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
index_writer.add_document(doc!(text_field=>"e"))?;
index_writer.add_document(doc!(text_field=>"f"))?;
index_writer.commit()?;
let seg_ids = index.searchable_segment_ids()?;
@@ -755,8 +851,8 @@ mod tests {
// writing the segment
let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"b"))?;
}
index_writer.commit()?;
@@ -782,22 +878,22 @@ mod tests {
// writing two segments
let mut index_writer = index.writer_for_tests()?;
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"fizz"));
index_writer.add_document(doc!(text_field=>"buzz"));
index_writer.add_document(doc!(text_field=>"fizz"))?;
index_writer.add_document(doc!(text_field=>"buzz"))?;
}
index_writer.commit()?;
for _ in 0..1000 {
index_writer.add_document(doc!(text_field=>"foo"));
index_writer.add_document(doc!(text_field=>"bar"));
index_writer.add_document(doc!(text_field=>"foo"))?;
index_writer.add_document(doc!(text_field=>"bar"))?;
}
index_writer.commit()?;
indices.push(index);
}
assert_eq!(indices.len(), 3);
let output_directory = RamDirectory::default();
let index = merge_segments(&indices, output_directory)?;
let output_directory: Box<dyn Directory> = Box::new(RamDirectory::default());
let index = merge_indices(&indices, output_directory)?;
assert_eq!(index.schema(), schema);
let segments = index.searchable_segments()?;
@@ -811,7 +907,7 @@ mod tests {
#[test]
fn test_merge_empty_indices_array() {
let merge_result = merge_segments(&[], RamDirectory::default());
let merge_result = merge_indices(&[], RamDirectory::default());
assert!(merge_result.is_err());
}
@@ -822,7 +918,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text"));
index_writer.add_document(doc!(text_field=>"some text"))?;
index_writer.commit()?;
index
};
@@ -832,15 +928,197 @@ mod tests {
let body_field = schema_builder.add_text_field("body", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(body_field=>"some body"));
index_writer.add_document(doc!(body_field=>"some body"))?;
index_writer.commit()?;
index
};
// mismatched schema index list
let result = merge_segments(&[first_index, second_index], RamDirectory::default());
let result = merge_indices(&[first_index, second_index], RamDirectory::default());
assert!(result.is_err());
Ok(())
}
#[test]
fn test_merge_filtered_segments() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text 1"))?;
index_writer.add_document(doc!(text_field=>"some text 2"))?;
index_writer.commit()?;
index
};
let second_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text 3"))?;
index_writer.add_document(doc!(text_field=>"some text 4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
segments.extend(second_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
let filter_segment_1 = AliveBitSet::for_test_from_deleted_docs(&[1], 2);
let filter_segment_2 = AliveBitSet::for_test_from_deleted_docs(&[0], 2);
let filter_segments = vec![Some(filter_segment_1), Some(filter_segment_2)];
let merged_index = merge_filtered_segments(
&segments,
target_settings,
filter_segments,
RamDirectory::default(),
)?;
let segments = merged_index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 1);
Ok(())
}
#[test]
fn test_merge_single_filtered_segments() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"test text"))?;
index_writer.add_document(doc!(text_field=>"some text 2"))?;
index_writer.add_document(doc!(text_field=>"some text 3"))?;
index_writer.add_document(doc!(text_field=>"some text 4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
let filter_segment = AliveBitSet::for_test_from_deleted_docs(&[0], 4);
let filter_segments = vec![Some(filter_segment)];
let index = merge_filtered_segments(
&segments,
target_settings,
filter_segments,
RamDirectory::default(),
)?;
let segments = index.searchable_segments()?;
assert_eq!(segments.len(), 1);
let segment_metas = segments[0].meta();
assert_eq!(segment_metas.num_deleted_docs(), 0);
assert_eq!(segment_metas.num_docs(), 2);
let searcher = index.reader()?.searcher();
{
let text_field = index.schema().get_field("text").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("test"), vec![] as Vec<u32>);
assert_eq!(do_search("text"), vec![0, 1]);
}
Ok(())
}
#[test]
fn test_apply_doc_id_filter_in_merger() -> crate::Result<()> {
let first_index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"some text 1"))?;
index_writer.add_document(doc!(text_field=>"some text 2"))?;
index_writer.add_document(doc!(text_field=>"some text 3"))?;
index_writer.add_document(doc!(text_field=>"some text 4"))?;
index_writer.delete_term(Term::from_field_text(text_field, "4"));
index_writer.commit()?;
index
};
let mut segments: Vec<Segment> = Vec::new();
segments.extend(first_index.searchable_segments()?);
let target_settings = first_index.settings().clone();
{
let filter_segment = AliveBitSet::for_test_from_deleted_docs(&[1], 4);
let filter_segments = vec![Some(filter_segment)];
let target_schema = segments[0].schema();
let merged_index = Index::create(
RamDirectory::default(),
target_schema.clone(),
target_settings.clone(),
)?;
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
filter_segments,
)?;
let doc_ids_alive: Vec<_> = merger.readers[0].doc_ids_alive().collect();
assert_eq!(doc_ids_alive, vec![0, 2]);
}
{
let filter_segments = vec![None];
let target_schema = segments[0].schema();
let merged_index = Index::create(
RamDirectory::default(),
target_schema.clone(),
target_settings.clone(),
)?;
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
filter_segments,
)?;
let doc_ids_alive: Vec<_> = merger.readers[0].doc_ids_alive().collect();
assert_eq!(doc_ids_alive, vec![0, 1, 2]);
}
Ok(())
}
}
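The assertion `doc_ids_alive == vec![0, 2]` above combines two sources of deletes: doc 3 was already removed in the segment by `delete_term`, and doc 1 is removed by the custom filter, so only the docs alive in both sets survive. A minimal sketch of that intersection, with illustrative names only (not the merger's actual code):

// A doc survives the filtered merge only if it is alive in the segment's own
// delete set and in the caller-provided alive filter.
fn intersect_alive(segment_alive: &[bool], filter_alive: &[bool]) -> Vec<u32> {
    (0..segment_alive.len())
        .filter(|&doc| segment_alive[doc] && filter_alive[doc])
        .map(|doc| doc as u32)
        .collect()
}

fn main() {
    let segment_alive = [true, true, true, false]; // the "text 4" doc was deleted via delete_term
    let filter_alive = [true, false, true, true];  // AliveBitSet::for_test_from_deleted_docs(&[1], 4)
    assert_eq!(intersect_alive(&segment_alive, &filter_alive), vec![0, 2]);
}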


@@ -10,6 +10,7 @@
)]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
#![allow(clippy::len_without_is_empty)]
//! # `tantivy`
//!
@@ -62,7 +63,7 @@
//! body => "He was an old man who fished alone in a skiff in \
//! the Gulf Stream and he had gone eighty-four days \
//! now without taking a fish."
//! ));
//! ))?;
//!
//! // We need to call .commit() explicitly to force the
//! // index_writer to finish processing the documents in the queue,
@@ -103,7 +104,7 @@
//! A good place for you to get started is to check out
//! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/tantivy-search/tantivy/blob/main/examples/basic_search.rs))
//! [source code](https://github.com/quickwit-inc/tantivy/blob/main/examples/basic_search.rs))
#[cfg_attr(test, macro_use)]
extern crate serde_json;
@@ -135,7 +136,6 @@ pub type Result<T> = std::result::Result<T, TantivyError>;
/// Tantivy DateTime
pub type DateTime = chrono::DateTime<chrono::Utc>;
mod common;
mod core;
mod indexer;
@@ -163,8 +163,6 @@ pub use self::snippet::{Snippet, SnippetGenerator};
mod docset;
pub use self::docset::{DocSet, TERMINATED};
pub use crate::common::HasLen;
pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Segment,
@@ -172,12 +170,16 @@ pub use crate::core::{
};
pub use crate::core::{InvertedIndexReader, SegmentReader};
pub use crate::directory::Directory;
pub use crate::indexer::merge_segments;
pub use crate::indexer::demuxer::*;
pub use crate::indexer::merge_filtered_segments;
pub use crate::indexer::merge_indices;
pub use crate::indexer::operation::UserOperation;
pub use crate::indexer::IndexWriter;
pub use crate::indexer::{IndexWriter, PreparedCommit};
pub use crate::postings::Postings;
pub use crate::reader::LeasedItem;
pub use crate::schema::{Document, Term};
pub use common::HasLen;
pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use std::fmt;
use once_cell::sync::Lazy;
@@ -293,7 +295,7 @@ pub struct DocAddress {
}
#[cfg(test)]
mod tests {
pub mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
@@ -304,11 +306,18 @@ mod tests {
use crate::Index;
use crate::Postings;
use crate::ReloadPolicy;
use common::{BinarySerializable, FixedSize};
use rand::distributions::Bernoulli;
use rand::distributions::Uniform;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
/// Checks that left and right are close to each other.
/// Panics if the two values are more than 0.5% apart.
#[macro_export]
@@ -370,24 +379,22 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
let index = Index::create_from_tempdir(schema)?;
// writing the segment
let mut index_writer = index.writer_for_tests()?;
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc)?;
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc)?;
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc)?;
}
index_writer.commit()?;
Ok(())
}
@@ -397,12 +404,12 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a a"));
index_writer.add_document(doc!(text_field=>"a"))?;
index_writer.add_document(doc!(text_field=>"a a"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"c"))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -424,7 +431,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.commit()?;
let index_reader = index.reader()?;
let searcher = index_reader.searcher();
@@ -446,9 +453,9 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!());
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -490,20 +497,20 @@ mod tests {
// writing the segment
let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"a b"))?;
// 1
index_writer.add_document(doc!(text_field=>" a c"));
index_writer.add_document(doc!(text_field=>" a c"))?;
// 2
index_writer.add_document(doc!(text_field=>" b c"));
index_writer.add_document(doc!(text_field=>" b c"))?;
// 3
index_writer.add_document(doc!(text_field=>" b d"));
index_writer.add_document(doc!(text_field=>" b d"))?;
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
// 4
index_writer.add_document(doc!(text_field=>" b c"));
index_writer.add_document(doc!(text_field=>" b c"))?;
// 5
index_writer.add_document(doc!(text_field=>" a"));
index_writer.add_document(doc!(text_field=>" a"))?;
index_writer.commit()?;
}
{
@@ -537,7 +544,7 @@ mod tests {
// writing the segment
let mut index_writer = index.writer_for_tests()?;
// 0
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"a b"))?;
// 1
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?;
@@ -573,7 +580,7 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"a b"))?;
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?;
index_writer.delete_term(Term::from_field_text(text_field, "a"));
@@ -623,7 +630,7 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(field=>1u64));
index_writer.add_document(doc!(field=>1u64))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -647,7 +654,7 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val));
index_writer.add_document(doc!(value_field => negative_val))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -671,7 +678,7 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI;
index_writer.add_document(doc!(value_field => val));
index_writer.add_document(doc!(value_field => val))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -694,7 +701,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"a"))?;
assert!(index_writer.commit().is_ok());
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -717,14 +724,14 @@ mod tests {
// writing the segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"63"));
index_writer.add_document(doc!(text_field=>"70"));
index_writer.add_document(doc!(text_field=>"34"));
index_writer.add_document(doc!(text_field=>"1"));
index_writer.add_document(doc!(text_field=>"38"));
index_writer.add_document(doc!(text_field=>"33"));
index_writer.add_document(doc!(text_field=>"40"));
index_writer.add_document(doc!(text_field=>"17"));
index_writer.add_document(doc!(text_field=>"63"))?;
index_writer.add_document(doc!(text_field=>"70"))?;
index_writer.add_document(doc!(text_field=>"34"))?;
index_writer.add_document(doc!(text_field=>"1"))?;
index_writer.add_document(doc!(text_field=>"38"))?;
index_writer.add_document(doc!(text_field=>"33"))?;
index_writer.add_document(doc!(text_field=>"40"))?;
index_writer.add_document(doc!(text_field=>"17"))?;
index_writer.delete_term(Term::from_field_text(text_field, "38"));
index_writer.delete_term(Term::from_field_text(text_field, "34"));
index_writer.commit()?;
@@ -742,7 +749,7 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af bc bc"));
index_writer.add_document(doc!(text_field=>"af af af bc bc"))?;
index_writer.commit()?;
}
{
@@ -774,9 +781,9 @@ mod tests {
let reader = index.reader()?;
// writing the segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.add_document(doc!(text_field=>"af af af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?;
index_writer.commit()?;
reader.reload()?;
@@ -838,9 +845,9 @@ mod tests {
assert_eq!(reader.searcher().num_docs(), 0u64);
// writing the segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.add_document(doc!(text_field=>"af b"))?;
index_writer.add_document(doc!(text_field=>"a b c"))?;
index_writer.add_document(doc!(text_field=>"a b c d"))?;
index_writer.commit()?;
reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 3u64);
@@ -880,7 +887,7 @@ mod tests {
{
let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
index_writer.add_document(document);
index_writer.add_document(document)?;
index_writer.commit()?;
}
let reader = index.reader()?;
@@ -947,7 +954,7 @@ mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT {
index_writer.add_document(doc!(id => doc_id));
index_writer.add_document(doc!(id => doc_id))?;
}
index_writer.commit()?;
@@ -964,7 +971,7 @@ mod tests {
index_writer.delete_term(Term::from_field_u64(id, doc_id));
index_writer.commit()?;
index_reader.reload()?;
index_writer.add_document(doc!(id => doc_id));
index_writer.add_document(doc!(id => doc_id))?;
index_writer.commit()?;
index_reader.reload()?;
let searcher = index_reader.searcher();
@@ -993,8 +1000,24 @@ mod tests {
#[test]
fn test_validate_checksum() -> crate::Result<()> {
let index_path = tempfile::tempdir().expect("dir");
let schema = Schema::builder().build();
let mut builder = Schema::builder();
let body = builder.add_text_field("body", TEXT | STORED);
let schema = builder.build();
let index = Index::create_in_dir(&index_path, schema)?;
let mut writer = index.writer(50_000_000)?;
for _ in 0..5000 {
writer.add_document(doc!(body => "foo"))?;
writer.add_document(doc!(body => "boo"))?;
}
writer.commit()?;
assert!(index.validate_checksum()?.is_empty());
// delete few docs
writer.delete_term(Term::from_field_text(body, "foo"));
writer.commit()?;
let segment_ids = index.searchable_segment_ids()?;
let _ = futures::executor::block_on(writer.merge(&segment_ids));
assert!(index.validate_checksum()?.is_empty());
Ok(())
}


@@ -1,9 +1,9 @@
use std::io;
use crate::common::{BinarySerializable, VInt};
use crate::directory::OwnedBytes;
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::{BlockDecoder, VIntDecoder};
use common::{BinarySerializable, VInt};
/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
/// This means we need to be able to skip to the `nth` position efficiently.


@@ -1,7 +1,7 @@
use crate::common::{BinarySerializable, CountingWriter, VInt};
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::BlockEncoder;
use crate::postings::compression::VIntEncoder;
use common::{BinarySerializable, CountingWriter, VInt};
use std::io::{self, Write};
/// The PositionSerializer is in charge of serializing all of the positions


@@ -1,241 +1,98 @@
use std::ops::Range;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::AlignedBuffer;
/// This module defines the logic used to search for a doc in a given
/// block. (at most 128 docs)
/// Search the first index containing an element greater or equal to
/// the target.
///
/// Searching within a block is a hotspot when running intersections,
/// so it was worth defining it in its own module.
#[cfg(target_arch = "x86_64")]
mod sse2 {
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
use std::arch::x86_64::__m128i as DataType;
use std::arch::x86_64::_mm_add_epi32 as op_add;
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
use std::arch::x86_64::_mm_set1_epi32 as set1;
use std::arch::x86_64::_mm_setzero_si128 as set0;
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
const MASK1: i32 = 78;
const MASK2: i32 = 177;
/// Performs an exhaustive linear search over the block.
///
/// There is no early exit here. We simply count the
/// number of elements that are `< target`.
pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
unsafe {
let ptr = arr as *const AlignedBuffer as *const DataType;
let vkey = set1(target as i32);
let mut cnt = set0();
// We work over 4 `__m128i` at a time.
// A single `__m128i` actually contains 4 `u32`.
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
cnt = op_sub(cnt, sum);
}
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
_mm_cvtsi128_si32(cnt) as usize
}
}
#[cfg(test)]
mod test {
use super::linear_search_sse2_128;
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test]
fn test_linear_search_sse2_128_u32() {
let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
for el in 0u32..128u32 {
block[el as usize] = (el * 2 + 1) << 18;
}
let target = block[64] + 1;
assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
}
}
}
/// This `linear search` browses exhaustively through the array;
/// an early exit would be possible, but the branch is very difficult to predict.
/// The results should be equivalent to
/// ```compile_fail
/// block[..]
// .iter()
// .take_while(|&&val| val < target)
// .count()
/// ```
///
/// Coupled with `exponential search`, this function is likely
/// to be called with the same `len`.
fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}
fn exponential_search(arr: &[u32], target: u32) -> Range<usize> {
let end = arr.len();
let mut begin = 0;
for &pivot in &[1, 3, 7, 15, 31, 63] {
if pivot >= end {
break;
/// the `start` argument is only used to hint that the result index
/// lies at or beyond `start`. The implementation may or may not use
/// it for optimization.
///
/// # Assumption
///
/// - The block is sorted. Some elements may appear several times. This is the case at the
/// end of the last block for instance.
/// - The target is assumed smaller or equal to the last element of the block.
pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
let mut start = 0;
let mut len = arr.len();
for _ in 0..7 {
len /= 2;
let pivot = unsafe { *arr.get_unchecked(start + len - 1) };
if pivot < target {
start += len;
}
if arr[pivot] > target {
return begin..pivot;
}
begin = pivot;
}
begin..end
}
#[inline(never)]
fn galloping(block_docs: &[u32], target: u32) -> usize {
let range = exponential_search(block_docs, target);
range.start + linear_search(&block_docs[range], target)
}
/// Tantivy may rely on SIMD instructions to search for a specific document within
/// a given block.
#[derive(Clone, Copy, PartialEq)]
pub enum BlockSearcher {
#[cfg(target_arch = "x86_64")]
Sse2,
Scalar,
}
impl BlockSearcher {
/// Search the first index containing an element greater or equal to
/// the target.
///
/// The results should be equivalent to
/// ```compile_fail
/// block[..]
// .iter()
// .take_while(|&&val| val < target)
// .count()
/// ```
///
/// The `start` argument is only used to hint that the result index
/// lies at or beyond `start`. The implementation may or may not use
/// it for optimization.
///
/// # Assumption
///
/// The array length is greater than `start`.
/// The block is sorted.
/// The target is assumed greater than or equal to `arr[start]`.
/// The target is assumed smaller or equal to the last element of the block.
///
/// Currently the scalar implementation starts with an exponential search, and
/// then runs a linear search over the resulting subarray.
///
/// If SSE2 instructions are available in the `(platform, running CPU)`,
/// then we use a different implementation that does an exhaustive linear search over
/// the block regardless of whether the block is full or not.
///
/// Indeed, if the block is not full, the remaining items are TERMINATED.
/// It is surprisingly faster, most likely because of the lack of branch misprediction.
pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize {
#[cfg(target_arch = "x86_64")]
{
if self == BlockSearcher::Sse2 {
return sse2::linear_search_sse2_128(block_docs, target);
}
}
galloping(&block_docs.0[..], target)
}
}
impl Default for BlockSearcher {
fn default() -> BlockSearcher {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("sse2") {
return BlockSearcher::Sse2;
}
}
BlockSearcher::Scalar
}
start
}
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::linear_search;
use super::BlockSearcher;
use super::branchless_binary_search;
use crate::docset::TERMINATED;
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test]
fn test_linear_search() {
let len: usize = 50;
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
for target in 1..*arr.last().unwrap() {
let res = linear_search(&arr[..], target);
if res > 0 {
assert!(arr[res - 1] < target);
}
if res < len {
assert!(arr[res] >= target);
}
}
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(&[1, 2], 0), 0..1);
assert_eq!(exponential_search(&[1, 2], 1), 0..1);
assert_eq!(
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
3..7
);
}
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
assert_eq!(
block_searcher.search_in_block(&AlignedBuffer(output_buffer), target),
cursor
);
}
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
use std::collections::HashSet;
let mut targets = HashSet::new();
for (i, val) in block.iter().cloned().enumerate() {
if i > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_in_block(block_searcher, block, target);
}
}
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use proptest::prelude::*;
use std::collections::HashSet;
fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
block.iter().take_while(|&&val| val < target).count()
}
fn test_search_in_block_util(block_searcher: BlockSearcher) {
for len in 1u32..128u32 {
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
util_test_search_in_block_all(block_searcher, &v[..]);
fn util_test_search_in_block(block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
assert!(cursor < COMPRESSION_BLOCK_SIZE);
assert!(block[cursor] >= target);
if cursor > 0 {
assert!(block[cursor - 1] < target);
}
assert_eq!(block.len(), COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
assert_eq!(branchless_binary_search(&output_buffer, target), cursor);
}
fn util_test_search_in_block_all(block: &[u32]) {
let mut targets = HashSet::new();
targets.insert(0);
for &val in block {
if val > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_in_block(block, target);
}
}
#[test]
fn test_search_in_block_scalar() {
test_search_in_block_util(BlockSearcher::Scalar);
fn test_search_in_branchless_binary_search() {
let v: Vec<u32> = (0..COMPRESSION_BLOCK_SIZE).map(|i| i as u32 * 2).collect();
util_test_search_in_block_all(&v[..]);
}
#[cfg(target_arch = "x86_64")]
#[test]
fn test_search_in_block_sse2() {
test_search_in_block_util(BlockSearcher::Sse2);
fn monotonous_block() -> impl Strategy<Value = Vec<u32>> {
prop::collection::vec(0u32..5u32, COMPRESSION_BLOCK_SIZE).prop_map(|mut deltas| {
let mut el = 0;
for i in 0..COMPRESSION_BLOCK_SIZE {
el += deltas[i];
deltas[i] = el;
}
deltas
})
}
proptest! {
#[test]
fn test_proptest_branchless_binary_search(block in monotonous_block()) {
util_test_search_in_block_all(&block[..]);
}
}
}
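For readers skimming this diff: the replacement for the SSE2 and exponential-search code boils down to a branchless binary search over a full 128-entry block. Because the block length is a power of two, seven halving steps suffice, and each step moves `start` forward only when the probed pivot is still below the target. A minimal standalone sketch under those assumptions (not the exact code above):

// Returns the number of elements strictly below `target`, i.e. the first index
// holding an element >= target, assuming the block is sorted and the target is
// not larger than the last element (the real code pads short blocks with TERMINATED).
fn branchless_search_128(block: &[u32; 128], target: u32) -> usize {
    let mut start = 0usize;
    let mut len = block.len();
    for _ in 0..7 {
        len /= 2;
        let pivot = block[start + len - 1];
        if pivot < target {
            start += len;
        }
    }
    start
}

fn main() {
    // A short block padded with u32::MAX, mimicking the TERMINATED padding.
    let mut block = [u32::MAX; 128];
    for i in 0..100 {
        block[i] = (i as u32) * 2;
    }
    assert_eq!(branchless_search_128(&block, 0), 0);
    assert_eq!(branchless_search_128(&block, 5), 3); // first element >= 5 is 6, at index 3
    assert_eq!(branchless_search_128(&block, 198), 99);
}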


@@ -1,16 +1,14 @@
use std::io;
use crate::common::{BinarySerializable, VInt};
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::fieldnorm::FieldNormReader;
use crate::postings::compression::{
AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE,
};
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::{BlockInfo, FreqReadingOption, SkipReader};
use crate::query::Bm25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
use common::{BinarySerializable, VInt};
fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
it.next().map(|first| it.fold(first, Score::max))
@@ -209,9 +207,9 @@ impl BlockSegmentPostings {
///
/// This method is useful to run SSE2 linear search.
#[inline]
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
debug_assert!(self.block_is_loaded());
self.doc_decoder.output_aligned()
self.doc_decoder.full_output()
}
/// Return the document at index `idx` of the block.
@@ -349,7 +347,6 @@ impl BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::BlockSegmentPostings;
use crate::common::HasLen;
use crate::core::Index;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
@@ -360,6 +357,7 @@ mod tests {
use crate::schema::Term;
use crate::schema::INDEXED;
use crate::DocId;
use common::HasLen;
#[test]
fn test_empty_segment_postings() {
@@ -395,8 +393,8 @@ mod tests {
}
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
fn test_block_segment_postings() -> crate::Result<()> {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>())?;
let mut offset: u32 = 0u32;
// checking that the `doc_freq` is correct
assert_eq!(block_segments.doc_freq(), 100_000);
@@ -411,16 +409,17 @@ mod tests {
offset += block.len() as u32;
block_segments.advance();
}
Ok(())
}
#[test]
fn test_skip_right_at_new_block() {
fn test_skip_right_at_new_block() -> crate::Result<()> {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
// 128 is missing
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let block_segments = build_block_postings(&doc_ids)?;
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(128), 129);
assert_eq!(docset.doc(), 129);
@@ -429,7 +428,7 @@ mod tests {
assert_eq!(docset.advance(), TERMINATED);
}
{
let block_segments = build_block_postings(&doc_ids);
let block_segments = build_block_postings(&doc_ids).unwrap();
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(129), 129);
assert_eq!(docset.doc(), 129);
@@ -438,46 +437,47 @@ mod tests {
assert_eq!(docset.advance(), TERMINATED);
}
{
let block_segments = build_block_postings(&doc_ids);
let block_segments = build_block_postings(&doc_ids)?;
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.doc(), 0);
assert_eq!(docset.seek(131), TERMINATED);
assert_eq!(docset.doc(), TERMINATED);
}
Ok(())
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
fn build_block_postings(docs: &[DocId]) -> crate::Result<BlockSegmentPostings> {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
let mut last_doc = 0u32;
for &doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64));
index_writer.add_document(doc!(int_field=>1u64))?;
}
index_writer.add_document(doc!(int_field=>0u64));
index_writer.add_document(doc!(int_field=>0u64))?;
last_doc = doc + 1;
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term).unwrap().unwrap();
inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
.unwrap()
let term_info = inverted_index.get_term_info(&term)?.unwrap();
let block_postings = inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?;
Ok(block_postings)
}
#[test]
fn test_block_segment_postings_seek() {
fn test_block_segment_postings_seek() -> crate::Result<()> {
let mut docs = vec![0];
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(&docs[..]);
let mut block_postings = build_block_postings(&docs[..])?;
for i in &[0, 424, 10000] {
block_postings.seek(*i);
let docs = block_postings.docs();
@@ -486,6 +486,7 @@ mod tests {
}
block_postings.seek(100_000);
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
Ok(())
}
#[test]
@@ -499,7 +500,7 @@ mod tests {
// the other containing odd numbers.
for i in 0..6 {
let doc = doc!(int_field=> (i % 2) as u64);
index_writer.add_document(doc);
index_writer.add_document(doc)?;
}
index_writer.commit()?;
let searcher = index.reader()?.searcher();


@@ -1,5 +1,5 @@
use crate::common::FixedSize;
use bitpacking::{BitPacker, BitPacker4x};
use common::FixedSize;
pub const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * u32::SIZE_IN_BYTES;
@@ -49,16 +49,10 @@ impl BlockEncoder {
}
}
/// We ensure that the OutputBuffer is aligned on 128 bits
/// in order to run SSE2 linear search on it.
#[repr(align(128))]
#[derive(Clone)]
pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
#[derive(Clone)]
pub struct BlockDecoder {
bitpacker: BitPacker4x,
output: AlignedBuffer,
output: [u32; COMPRESSION_BLOCK_SIZE],
pub output_len: usize,
}
@@ -72,7 +66,7 @@ impl BlockDecoder {
pub fn with_val(val: u32) -> BlockDecoder {
BlockDecoder {
bitpacker: BitPacker4x::new(),
output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
output: [val; COMPRESSION_BLOCK_SIZE],
output_len: 0,
}
}
@@ -85,28 +79,28 @@ impl BlockDecoder {
) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress_sorted(offset, compressed_data, &mut self.output.0, num_bits)
.decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
}
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress(compressed_data, &mut self.output.0, num_bits)
.decompress(compressed_data, &mut self.output, num_bits)
}
#[inline]
pub fn output_array(&self) -> &[u32] {
&self.output.0[..self.output_len]
&self.output[..self.output_len]
}
#[inline]
pub(crate) fn output_aligned(&self) -> &AlignedBuffer {
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
&self.output
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output.0[idx]
self.output[idx]
}
}
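The distinction between the two accessors matters for the fixed-width block search above: `output_array` exposes only the `output_len` decoded values, while the renamed `full_output` exposes all 128 slots including padding, so a 128-wide search never reads out of bounds. A hedged sketch of that difference with an illustrative struct (not the real BlockDecoder):

// Illustrative only: a decoder-like struct holding a fixed 128-slot buffer.
// Unused slots keep their padding value so a fixed-width search can read them.
const BLOCK: usize = 128;

struct Decoded {
    output: [u32; BLOCK],
    output_len: usize,
}

impl Decoded {
    // Only the decoded prefix: what output_array exposes.
    fn output_array(&self) -> &[u32] {
        &self.output[..self.output_len]
    }
    // The whole fixed-size buffer, padding included: what full_output exposes.
    fn full_output(&self) -> &[u32; BLOCK] {
        &self.output
    }
}

fn main() {
    let mut d = Decoded { output: [u32::MAX; BLOCK], output_len: 3 };
    d.output[..3].copy_from_slice(&[2, 5, 9]);
    assert_eq!(d.output_array(), &[2, 5, 9][..]);
    assert_eq!(d.full_output()[3], u32::MAX); // padding is visible to a fixed-width search
}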
@@ -190,8 +184,8 @@ impl VIntDecoder for BlockDecoder {
padding: u32,
) -> usize {
self.output_len = num_els;
self.output.0.iter_mut().for_each(|el| *el = padding);
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
self.output.iter_mut().for_each(|el| *el = padding);
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted(
@@ -201,12 +195,12 @@ impl VIntDecoder for BlockDecoder {
padding: u32,
) -> usize {
self.output_len = num_els;
self.output.0.iter_mut().for_each(|el| *el = padding);
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
self.output.iter_mut().for_each(|el| *el = padding);
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
fn uncompress_vint_unsorted_until_end(&mut self, compressed_data: &[u8]) {
let num_els = vint::uncompress_unsorted_until_end(compressed_data, &mut self.output.0);
let num_els = vint::uncompress_unsorted_until_end(compressed_data, &mut self.output);
self.output_len = num_els;
}
}


@@ -3,6 +3,9 @@ Postings module (also called inverted index)
*/
mod block_search;
pub(crate) use self::block_search::branchless_binary_search;
mod block_segment_postings;
pub(crate) mod compression;
mod postings;
@@ -14,7 +17,6 @@ mod skip;
mod stacker;
mod term_info;
pub(crate) use self::block_search::BlockSearcher;
pub use self::block_segment_postings::BlockSegmentPostings;
pub use self::postings::Postings;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
@@ -85,12 +87,12 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(title => r#"abc abc abc"#));
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
index_writer.add_document(doc!(title => r#"abc abc abc"#))?;
index_writer.add_document(doc!(title => r#"abc be be be be abc"#))?;
for _ in 0..1_000 {
index_writer.add_document(doc!(title => r#"abc abc abc"#));
index_writer.add_document(doc!(title => r#"abc abc abc"#))?;
}
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
index_writer.add_document(doc!(title => r#"abc be be be be abc"#))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
@@ -152,10 +154,7 @@ pub mod tests {
}
#[test]
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
exceeding_token_text.push_str(" hello");
pub fn test_index_max_length_token() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
@@ -168,33 +167,54 @@ pub mod tests {
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
{
index_writer.add_document(doc!(text_field=>exceeding_token_text));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(&bytes, b"hello");
}
{
index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(1u32);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(&bytes[..], ok_token_text.as_bytes());
}
let reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
index_writer.add_document(doc!(text_field=>ok_token_text.clone()))?;
index_writer.commit()?;
reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(&bytes[..], ok_token_text.as_bytes());
Ok(())
}
#[test]
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
.set_tokenizer("simple_no_truncation"),
);
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader()?;
let mut index_writer = index.writer_for_tests()?;
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
exceeding_token_text.push_str(" hello");
index_writer.add_document(doc!(text_field=>exceeding_token_text))?;
index_writer.commit()?;
reader.reload()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(&bytes, b"hello");
Ok(())
}
@@ -313,13 +333,13 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field => "g b b d c g c"));
index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
assert!(index_writer.commit().is_ok());
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text_field => "g b b d c g c"))?;
index_writer.add_document(doc!(text_field => "g a b b a d c g c"))?;
index_writer.commit()?;
}
let term_a = Term::from_field_text(text_field, "a");
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let mut postings = segment_reader
.inverted_index(text_field)?
@@ -348,7 +368,7 @@ pub mod tests {
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..num_docs as u64 {
let doc = doc!(value_field => 2u64, value_field => i % 2u64);
index_writer.add_document(doc);
index_writer.add_document(doc)?;
}
assert!(index_writer.commit().is_ok());
}


@@ -11,7 +11,7 @@ use crate::docset::DocSet;
/// but other implementations mocking `SegmentPostings` exist,
/// for merging segments or for testing.
pub trait Postings: DocSet + 'static {
/// Returns the term frequency
/// The number of times the term appears in the document.
fn term_freq(&self) -> u32;
/// Returns the positions offset by a given value.


@@ -33,7 +33,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed()
}
})
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
.unwrap_or_else(SpecializedPostingsWriter::<NothingRecorder>::new_boxed),
FieldType::U64(_)
| FieldType::I64(_)
| FieldType::F64(_)
@@ -133,7 +133,8 @@ impl MultiFieldPostingsWriter {
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
self.term_index.iter().collect();
Vec::with_capacity(self.term_index.len());
term_offsets.extend(self.term_index.iter());
term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =


@@ -1,10 +1,8 @@
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::FieldSerializer;
use crate::DocId;
use crate::{
common::{read_u32_vint, write_u32_vint},
indexer::doc_id_mapping::DocIdMapping,
};
use common::{read_u32_vint, write_u32_vint};
const POSITION_END: u32 = 0;


@@ -1,12 +1,12 @@
use crate::common::HasLen;
use crate::docset::DocSet;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::positions::PositionReader;
use crate::postings::branchless_binary_search;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::BlockSearcher;
use crate::postings::BlockSegmentPostings;
use crate::postings::Postings;
use crate::{DocId, TERMINATED};
use common::HasLen;
/// `SegmentPostings` represents the inverted list or postings associated to
/// a term in a `Segment`.
@@ -18,7 +18,6 @@ pub struct SegmentPostings {
pub(crate) block_cursor: BlockSegmentPostings,
cur: usize,
position_reader: Option<PositionReader>,
block_searcher: BlockSearcher,
}
impl SegmentPostings {
@@ -28,7 +27,6 @@ impl SegmentPostings {
block_cursor: BlockSegmentPostings::empty(),
cur: 0,
position_reader: None,
block_searcher: BlockSearcher::default(),
}
}
@@ -36,7 +34,7 @@ impl SegmentPostings {
///
/// This method will clone and scan through the posting lists.
/// (this is a rather expensive operation).
pub fn doc_freq_given_deletes(&self, delete_bitset: &DeleteBitSet) -> u32 {
pub fn doc_freq_given_deletes(&self, alive_bitset: &AliveBitSet) -> u32 {
let mut docset = self.clone();
let mut doc_freq = 0;
loop {
@@ -44,7 +42,7 @@ impl SegmentPostings {
if doc == TERMINATED {
return doc_freq;
}
if delete_bitset.is_alive(doc) {
if alive_bitset.is_alive(doc) {
doc_freq += 1u32;
}
docset.advance();
@@ -154,7 +152,6 @@ impl SegmentPostings {
block_cursor: segment_block_postings,
cur: 0, // cursor within the block
position_reader,
block_searcher: BlockSearcher::default(),
}
}
}
@@ -183,8 +180,8 @@ impl DocSet for SegmentPostings {
self.block_cursor.seek(target);
// At this point we are on the block, that might contain our document.
let output = self.block_cursor.docs_aligned();
self.cur = self.block_searcher.search_in_block(output, target);
let output = self.block_cursor.full_block();
self.cur = branchless_binary_search(output, target);
// The last block is not full and is padded with the value TERMINATED,
// so that we are guaranteed to have at least one doc in the block (a real one or the padding).
@@ -197,7 +194,7 @@ impl DocSet for SegmentPostings {
// with the value `TERMINATED`.
//
// After the search, the cursor should point to the first value of TERMINATED.
let doc = output.0[self.cur];
let doc = output[self.cur];
debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc
@@ -268,10 +265,10 @@ impl Postings for SegmentPostings {
mod tests {
use super::SegmentPostings;
use crate::common::HasLen;
use common::HasLen;
use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::postings::postings::Postings;
#[test]
@@ -299,9 +296,10 @@ mod tests {
fn test_doc_freq() {
let docs = SegmentPostings::create_from_docs(&[0, 2, 10]);
assert_eq!(docs.doc_freq(), 3);
let delete_bitset = DeleteBitSet::for_test(&[2], 12);
assert_eq!(docs.doc_freq_given_deletes(&delete_bitset), 2);
let all_deleted = DeleteBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12);
assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2);
let all_deleted =
AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0);
}
}
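The renamed `doc_freq_given_deletes` keeps its behavior: it clones the postings cursor and counts the documents that the alive bitset still marks as alive. A minimal sketch of that counting over plain slices, with illustrative names (not the actual implementation):

// Count the postings whose doc id is still alive; this mirrors what
// doc_freq_given_deletes does by cloning the postings and scanning them.
fn doc_freq_given_alive(postings: &[u32], alive: &[bool]) -> u32 {
    postings.iter().filter(|&&doc| alive[doc as usize]).count() as u32
}

fn main() {
    // Mirrors the test above: docs [0, 2, 10] with doc 2 deleted out of 12.
    let mut alive = [true; 12];
    alive[2] = false;
    assert_eq!(doc_freq_given_alive(&[0, 2, 10], &alive), 2);
}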


@@ -1,7 +1,6 @@
use super::TermInfo;
use crate::common::{BinarySerializable, VInt};
use crate::common::{CompositeWrite, CountingWriter};
use crate::core::Segment;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::fieldnorm::FieldNormReader;
use crate::positions::PositionSerializer;
@@ -12,6 +11,9 @@ use crate::schema::{Field, FieldEntry, FieldType};
use crate::schema::{IndexRecordOption, Schema};
use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::{DocId, Score};
use common::CountingWriter;
use common::{BinarySerializable, VInt};
use fail::fail_point;
use std::cmp::Ordering;
use std::io::{self, Write};
@@ -211,6 +213,9 @@ impl<'a> FieldSerializer<'a> {
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
Err(io::Error::new(io::ErrorKind::Other, format!("{:?}", msg)))
});
if self.term_open {
self.postings_serializer
.close_term(self.current_term_info.doc_freq)?;
@@ -442,10 +447,8 @@ impl<W: Write> PostingsSerializer<W> {
let skip_data = self.skip_write.data();
VInt(skip_data.len() as u64).serialize(&mut self.output_write)?;
self.output_write.write_all(skip_data)?;
self.output_write.write_all(&self.postings_write[..])?;
} else {
self.output_write.write_all(&self.postings_write[..])?;
}
self.output_write.write_all(&self.postings_write[..])?;
self.skip_write.clear();
self.postings_write.clear();
self.bm25_weight = None;


@@ -148,6 +148,10 @@ impl TermHashMap {
unordered_term_id
}
pub fn len(&self) -> usize {
self.len
}
pub fn iter(&self) -> Iter<'_> {
Iter {
inner: self.occupied.iter(),

Some files were not shown because too many files have changed in this diff.