Compare commits

...

129 Commits

Author SHA1 Message Date
Paul Masurel
94c73363e4 added unit test 2019-06-02 14:53:03 +09:00
Paul Masurel
96f194635f Trying to address #546 2019-05-29 09:17:41 +09:00
Paul Masurel
444662485f Remove mut in add_document and delete_term. Made stamper ordering rel… (#551)
* Remove mut in add_document and delete_term. Made stamper ordering relaxed.

* Made batch operations &mut self -> &self

* Added example
2019-05-28 10:26:00 +09:00
Stephen Carman
943c25d0f8 Make IndexMeta public (#553) 2019-05-28 09:27:49 +09:00
Paul Masurel
5c0b2a4579 Merge branch 'stamper_refactor' 2019-05-08 10:02:02 +09:00
Paul Masurel
9870a9258d Removed the mutex implementation of AtomicU64.
Fixed comment
2019-05-08 09:59:28 +09:00
Paul Masurel
7102b363f5 Fix build 2019-05-05 14:19:54 +09:00
Paul Masurel
66b4615e4e Issue/542 (#543)
* Closes 542.

Fast fields are all loaded when the segment reader is created.
2019-05-05 13:52:43 +09:00
petr-tik
da46913839 Merge branch 'master' into stamper_refactor 2019-04-30 22:28:48 +01:00
Paul Masurel
3df037961f Added more info to fast fields. 2019-04-30 13:14:01 +09:00
petr-tik
8ffae47854 Addressed code review
moved Opstamp to top-level namespace, added a docstring

Corrected minor typos/whitespace
2019-04-29 21:23:28 +01:00
petr-tik
1a90a1f3b0 Merge branch 'master' of github.com:tantivy-search/tantivy into stamper_refactor 2019-04-26 08:47:12 +01:00
Paul Masurel
dac50c6aeb Dds merged (#539)
* add ascii folding support

* Minor change and added Changelog.

* add additional tests

* Add tests for ascii folding (#533)

* first tests for ascii folding

* use a `RawTokenizer` for tokens using punctuation

* add test for all (?) folding, inspired by Lucene

* Simplification of the unit test code
2019-04-26 10:25:08 +09:00
Paul Masurel
31b22c5acc Added logging when token is dropped. (#538) 2019-04-26 09:23:28 +09:00
petr-tik
8e50921363 Tidied up the Stamper module and upgraded to a 1.34 dependency
Added stamper.revert method to be used for rollback - rolling back to a previous
commit in case of deleting all documents or rolling operations back should reset
the stamper as well

Added type alias for Opstamp - helps code readibility instead of seeing u64
returned by functions.

Moved to AtomicU64 on stable rust (since 1.34) - where possible use standard
library interfaces.
2019-04-24 20:46:28 +01:00
Paul Masurel
96a4f503ec Closes #526 (#535) 2019-04-24 20:59:48 +09:00
Paul Masurel
9df288b0c9 Merge branch 'master' of github.com:tantivy-search/tantivy 2019-04-24 12:31:47 +09:00
Paul Masurel
b7c2d0de97 Clippy2 (#534)
* Clippy comments

Clippy complaints that about the cast of &[u32] to a *const __m128i,
because of the lack of alignment constraints.

This commit passes the OutputBuffer object (which enforces proper
    alignment) instead of `&[u32]`.

* Clippy. Block alignment

* Code simplification

* Added comment. Code simplification

* Removed the extraneous freq block len hack.
2019-04-24 12:31:32 +09:00
Paul Masurel
62445e0ec8 Merge branch 'master' of github.com:tantivy-search/tantivy 2019-04-23 09:55:55 +09:00
Paul Masurel
a228825462 Clippy comments (#532)
Clippy complaints that about the cast of &[u32] to a *const __m128i,
because of the lack of alignment constraints.

This commit passes the OutputBuffer object (which enforces proper
    alignment) instead of `&[u32]`.
2019-04-23 09:54:02 +09:00
Paul Masurel
d3eabd14bc Clippy comments
Clippy complaints that about the cast of &[u32] to a *const __m128i,
because of the lack of alignment constraints.

This commit passes the OutputBuffer object (which enforces proper
    alignment) instead of `&[u32]`.
2019-04-22 11:16:21 +09:00
petr-tik
c967031d21 Delete files from target/ dir to avoid caching them on CI (#531)
* Delete files from target/ dir to avoid caching them on CI

idea from here https://github.com/rust-lang/cargo/issues/5885#issuecomment-432723546

* Delete examples
2019-04-21 08:02:27 +09:00
Paul Masurel
d823163d52 Closes #527. (#529)
Fixing the bug that affects the result of `query.count()` in presence of
deletes.
2019-04-19 09:19:50 +09:00
Paul Masurel
c4f59f202d Bumped combine version 2019-04-11 08:33:56 +09:00
Paul Masurel
acd29b535d Fix comment 2019-04-02 10:05:14 +09:00
Panagiotis Ktistakis
2cd31bcda2 Fix non english stemmers (#521) 2019-03-27 08:54:16 +09:00
Paul Masurel
99870de55c 0.10.0-dev 2019-03-25 08:58:26 +09:00
Paul Masurel
cad2d91845 Disabled tests for android 2019-03-24 22:58:46 +09:00
Paul Masurel
79f3cd6cf4 Added instructions to update 2019-03-24 09:10:31 +09:00
Paul Masurel
e3abb4481b broken link 2019-03-22 09:58:28 +09:00
Paul Masurel
bfa61d2f2f Added patreon button 2019-03-22 09:51:00 +09:00
Paul Masurel
6c0e621fdb Added bench info in README 2019-03-21 09:35:04 +09:00
Paul Masurel
a8cc5208f1 Linear simd (#519)
* linear simd search within block
2019-03-20 22:10:05 +09:00
Paul Masurel
83eb0d0cb7 Disabling tests on Android 2019-03-20 10:24:17 +09:00
Paul Masurel
ee6e273365 cleanup for nodefaultfeatures 2019-03-20 10:04:42 +09:00
Paul Masurel
6ea34b3d53 Fix version 2019-03-20 09:39:24 +09:00
Paul Masurel
22cf1004bd Reenabled test on android 2019-03-20 08:54:52 +09:00
Paul Masurel
5768d93171 Rename try to attempt as try is becoming a keyword in rust 2019-03-20 08:54:19 +09:00
Paul Masurel
663dd89c05 Feature/reader (#517)
Adding IndexReader to the API. Making it possible to watch for changes.

* Closes #500
2019-03-20 08:39:22 +09:00
barrotsteindev
a934577168 WIP: date field (#487)
* initial version, still a work in progress

* remove redudant or

* add chrono::DateTime and index i64

* add more tests

* fix tests

* pass DateTime by ptr

* remove println!

* document query_parser rfc 3339 date support

* added some more docs about implementation to schema.rs

* enforce DateTime is UTC, and re-export chrono

* added DateField to changelog

* fixed conflict

* use INDEXED instead of INT_INDEXED for date fields
2019-03-15 22:10:37 +09:00
Paul Masurel
94f1885334 Issue/513 (#514)
* Closes #513

* Clean up and doc

* Updated changelog
2019-03-07 09:39:30 +09:00
Jonathan Fok kan
2ccfdb97b5 WIP: compiling to wasm (#512)
* First work to enable compile to wasm

* Added back fst-regex/mmap to mmap feature

* Removed fst-regex. Forced uuid version 0.7.2.
2019-03-06 10:40:54 +09:00
Paul Masurel
e67883138d Cargo fmt 2019-03-06 10:31:00 +09:00
Paul Masurel
f5c65f1f60 Added comment on the constructor fo TopDocSByField 2019-03-06 10:30:37 +09:00
Mauri de Souza Nunes
ec73a9a284 Remove note about panicking in get_field docs (#503)
Since get_field rely on calling get on the underlying InnerSchema HashMap
it shouldn't fail if the field was not found, it simply returns None.
2019-02-28 09:23:00 +09:00
Thomas Schaller
a814a31f1e Remove semicolon from doc! expansion (#509) 2019-02-28 09:20:43 +09:00
Paul Masurel
9acadb3756 Code cleaning 2019-02-26 10:50:36 +09:00
Paul Masurel
774fcecf23 cargo fmt 2019-02-26 10:44:59 +09:00
Paul Masurel
27c9fa6028 Jannickj prove bug with facets (#508)
* prove bug with facets

* Closing #505

Introduce a term id in the TermHashMap
2019-02-25 22:33:17 +09:00
Paul Masurel
fdefea9e26 Removed path reference to tantivy-fst 2019-02-23 10:42:44 +09:00
Paul Masurel
b422f9c389 Partially addresses #500 (#502)
Using `tantivy_fst`. Storing `Weak<Mmap>` in the Mmap cache.
2019-02-23 10:33:59 +09:00
petr-tik
9451fd5b09 MsQueue to channel (#495)
* Format

Made the docstring consistent
remove empty line

* Move matches to dev deps

* Replace MsQueue with an unbounded crossbeam-channel

Questions:
queue.push ignores Result return

How to test pop() calls, if they block

* Format

Made the docstring consistent
remove empty line

* Unwrap the Result of queue.pop

* Addressed Paul's review

wrap the Result-returning send call with expect()

implemented the test not to fail after popping from empty queue

removed references to the Michael-Scott Queue

formatted
2019-02-23 09:06:50 +09:00
Jason Goldberger
788b3803d9 updated changelog (#501)
* updated changelog

* Update CHANGELOG.md

* Update CHANGELOG.md
2019-02-19 00:25:18 +09:00
Paul Masurel
5b11228083 Merge branch 'master' of github.com:tantivy-search/tantivy 2019-02-15 08:30:55 +09:00
Paul Masurel
515adff644 Merge branch 'hotfix/0.8.2' 2019-02-15 08:30:27 +09:00
Paul Masurel
e70a45426a 0.8.2 release
Backporting a fix for non x86_64 platforms
2019-02-14 09:16:27 +09:00
Jason Goldberger
e14701e9cd Add grouped operations (#493)
* [WIP] added UserOperation enum, added IndexWriter.run, and added MultiStamp

* removed MultiStamp in favor of std::ops::Range

* changed IndexWriter::run to return u64, Stamper::stamps to return a Range, added tests, and added docs

* changed delete_cursor skipping to use first operation's opstamp vice last. change index_writer test to use 1 thread

* added test for order batch of operations

* added a test comment
2019-02-14 08:56:01 +09:00
Paul Masurel
45e62d4329 Code simplification and adding comments 2019-02-06 10:05:15 +09:00
petr-tik
76d2b4dab6 Add integer range search example (#490)
Copied and simplified the example in the range_query mod
2019-02-05 23:34:06 +01:00
Paul Masurel
04e9606638 simplification of positions 2019-02-05 15:36:13 +01:00
Paul Masurel
a5c57ebbd9 Positions simplification 2019-02-05 14:50:51 +01:00
Paul Masurel
96eaa5bc63 Positions 2019-02-05 14:50:16 +01:00
Paul Masurel
f1d30ab196 fastfield reader fix 2019-02-05 14:10:16 +01:00
Paul Masurel
4507df9255 Closes #461 (#489)
Multivalued fast field uses `u64` indexes.
2019-02-04 13:24:00 +01:00
Paul Masurel
e8625548b7 Closes #461 (#488)
Multivalued fast field uses `u64` indexes.
2019-02-04 13:20:20 +01:00
Paul Masurel
50ed6fb534 Code cleanup
Fixed compilation without the mmap directory
2019-02-05 12:39:30 +01:00
Panagiotis Ktistakis
76609deadf Add Greek stemmer (#486) 2019-02-01 06:30:49 +01:00
Paul Masurel
749e62c40b renamed 2019-01-30 16:29:17 +01:00
Paul Masurel
259ce567d1 Using linear search 2019-01-29 15:59:24 +01:00
Paul Masurel
4c93b096eb Rustfmt 2019-01-29 11:45:30 +01:00
Paul Masurel
6a547b0b5f Issue/483 (#484)
* Downcast_ref

* fixing unit test
2019-01-28 11:43:42 +01:00
Paul Masurel
e99d1a2355 Better exponential search 2019-01-29 11:29:17 +01:00
Paul Masurel
c7bddc5fe3 Inlined exponential search 2019-01-28 17:28:07 +01:00
Paul Masurel
7b97dde335 Clippy + cargo fmt 2019-01-28 12:37:55 +01:00
Paul Masurel
644b4bd0a1 Issue/468b (#482)
* Moving lock to directory/

* added fs2

* doc

* Using fs2 for locking

* Added unit test

* Fixed error message related unit test

* Fixing location of import
2019-01-27 12:32:21 +01:00
Paul Masurel
bf94fd77db Issue/471 (#481)
* Closes 471

Removing writing_segments in the segment manager as it is now useless.
Removing the target merged segment id as it is useless as well.

* RAII for tracking which segment is in merge.

Closes #471

* fmt

* Using Inventory::default().
2019-01-27 12:18:59 +09:00
Paul Masurel
097eaf4aa6 impl Future as a result of merges 2019-01-28 03:56:43 +01:00
Paul Masurel
1fd46c1e9b Clippy 2019-01-28 03:46:23 +01:00
Paul Masurel
2fb219d017 Changelog 2019-01-24 09:12:07 +09:00
Paul Masurel
63b593bd0a Lower RAM usage in tests. 2019-01-24 09:10:38 +09:00
Paul Masurel
286bb75a0c Updated changelog 2019-01-24 09:03:58 +09:00
barrotsteindev
222b7f2580 Tantivy-288 (#472)
* add unit test

* improved test

* added SegmentManager#remove_empty_segments

* update old tests for new behaviour

* cleaner filter for empty segments

* PR adjustments

* rename x in closures

* simplify assert_eq!(vec.len(), 0)

* wait_merging_threads

* acquire searchers

* add comments to test

* rebased on latest master

* harden test

* fix merger#test_merge_multivalued_int_fields_all_deleted test
2019-01-24 08:58:56 +09:00
pentlander
5292e78860 Allow stemmers in languages other than English (#473)
Allow users to create stemmers for languages other than English. Add a
default stemmer for English.
2019-01-23 22:24:32 +09:00
Paul Masurel
c0cc6aac83 Updated changelog 2019-01-23 22:22:34 +09:00
Paul Masurel
0b0bf59a32 Allow stemmers in languages other than English (#478)
Allow users to create stemmers for languages other than English. Add a
default stemmer for English.

Closes #478
2019-01-23 22:21:00 +09:00
Paul Masurel
74f70a5c2c 32bits platforms 2019-01-23 13:21:31 +09:00
Paul Masurel
1acfb2ebb5 cargo fmt 2019-01-23 10:21:39 +09:00
Paul Masurel
4dfd091e67 Bumped version to 0.8.2-dev 2019-01-23 10:20:59 +09:00
Paul Masurel
8eba4ab807 Merge branch 'hotfix-476' 2019-01-23 10:20:33 +09:00
Paul Masurel
5e8e03882b Merge branch 'bug/476' 2019-01-23 10:18:27 +09:00
Paul Masurel
7df3260a15 Version bump 2019-01-23 10:13:18 +09:00
Paul Masurel
176f67a266 Refactoring 2019-01-23 10:06:40 +09:00
Paul Masurel
19babff849 Closes #476 2019-01-23 10:06:39 +09:00
Paul Masurel
bf2576adf9 Added a broken unit test 2019-01-23 10:04:27 +09:00
Paul Masurel
0e8fcd5727 Plastic surgery 2019-01-19 23:13:27 +09:00
Paul Masurel
f745c83bb7 Closes 466. Removing mentions of the chain collector. (#467) 2019-01-16 10:28:19 +09:00
Paul Masurel
ffb16d9103 More efficient indexing (#463)
* Using unrolled u32 VInt and caching Vec s

* cargo fmt

* Exposing a io::Write in the Expull thing

* expull as a writer. clippy + format

* inline the first block

* simplified -if let Some-

* vint reader iterator

* blop
2019-01-13 14:51:18 +09:00
Paul Masurel
98ca703daa More efficient indexing (#462)
* Using unrolled u32 VInt and caching Vec s

* cargo fmt

* Exposing a io::Write in the Expull thing

* expull as a writer. clippy + format

* inline the first block

* simplified -if let Some-

* vint reader iterator
2019-01-13 14:41:56 +09:00
Paul Masurel
b9d25cda5d Using LittleEndian explicitely 2019-01-08 12:41:58 +09:00
Paul Masurel
beb4289ec2 Less unsafe 2019-01-08 00:48:14 +09:00
Andrew Banchich
bdd72e4683 Update README.md (#459)
Fix Elasticsearch spelling
2018-12-27 07:26:49 +09:00
Paul Masurel
45c3cd19be Fixing README: git clone https... 2018-12-26 21:13:33 +09:00
Paul Masurel
b8241c5603 0.8.0 2018-12-26 10:18:34 +09:00
Paul Masurel
a4745151c0 Version to 0.8 2018-12-26 10:11:06 +09:00
Paul Masurel
e2ce326a8c Merge branch 'issue/457' 2018-12-18 10:35:01 +09:00
Paul Masurel
bb21d12a70 Bumping version 2018-12-18 10:14:12 +09:00
Paul Masurel
4565aba62a Added unit test for exponential search 2018-12-18 09:24:31 +09:00
Paul Masurel
545a7ec8dd Closes #457 2018-12-18 09:18:46 +09:00
Paul Masurel
e68775d71c Format and update murmurhash32 version 2018-12-17 19:12:38 +09:00
Paul Masurel
dcc92d287e Facet remove unsafe (#456)
* Removing some unsafe

* Removing some unsafe (2)

* Remove murmurhash
2018-12-17 19:08:48 +09:00
Paul Masurel
b48f81c051 Removing unsafe from bitpacking code (#455) 2018-12-17 19:06:37 +09:00
Paul Masurel
a3042e956b Facet remove unsafe (#454)
* Removing some unsafe

* Removing some unsafe (2)
2018-12-17 09:31:09 +09:00
dependabot[bot]
1fa10f0a0b Update itertools requirement from 0.7 to 0.8 (#453)
Updates the requirements on [itertools](https://github.com/bluss/rust-itertools) to permit the latest version.
- [Release notes](https://github.com/bluss/rust-itertools/releases)
- [Commits](https://github.com/bluss/rust-itertools/commits/0.8.0)

Signed-off-by: dependabot[bot] <support@dependabot.com>
2018-12-17 09:28:36 +09:00
Paul Masurel
279a9eb5e3 Closes #449 (#450)
Clippy working on stable.
Clippy warnings addressed
2018-12-10 12:20:59 +09:00
fdb-hiroshima
21a24672d8 Add accessors for Snippet and HighlightSection (#448)
* Add accessors for Snippet and HighlightSection

And add an example of custom highlighter

* Remove inline(always) and unnecessary empty lines
2018-12-02 18:00:16 +09:00
dependabot[bot]
a3f1fbaae6 Update scoped-pool requirement from 0.1 to 1.0 (#447)
Updates the requirements on [scoped-pool](https://github.com/reem/rust-scoped-pool) to permit the latest version.
- [Release notes](https://github.com/reem/rust-scoped-pool/releases)
- [Commits](https://github.com/reem/rust-scoped-pool/commits/1.0.0)

Signed-off-by: dependabot[bot] <support@dependabot.com>
2018-12-01 13:54:59 +09:00
Paul Masurel
a6e767c877 Cargo fmt 2018-11-30 22:52:45 +09:00
Paul Masurel
6af0488dbe Executor made sorted 2018-11-30 22:52:26 +09:00
Paul Masurel
07d87e154b Collector refactoring and multithreaded search (#437)
* Split Collector into an overall Collector and a per-segment SegmentCollector. Precursor to cross-segment parallelism, and as a side benefit cleans up any per-segment fields from being Option<T> to just T.

* Attempt to add MultiCollector back

* working. Chained collector is broken though

* Fix chained collector

* Fix test

* Make Weight Send+Sync for parallelization purposes

* Expose parameters of RangeQuery for external usage

* Removed &mut self

* fixing tests

* Restored TestCollectors

* blop

* multicollector working

* chained collector working

* test broken

* fixing unit test

* blop

* blop

* Blop

* simplifying APi

* blop

* better syntax

* Simplifying top_collector

* refactoring

* blop

* Sync with master

* Added multithread search

* Collector refactoring

* Schema::builder

* CR and rustdoc

* CR comments

* blop

* Added an executor

* Sorted the segment readers in the searcher

* Update searcher.rs

* Fixed unit testst

* changed the place where we have the sort-segment-by-count heuristic

* using crossbeam::channel

* inlining

* Comments about panics propagating

* Added unit test for executor panicking

* Readded default

* Removed Default impl

* Added unit test for executor
2018-11-30 22:46:59 +09:00
Paul Masurel
8b0b0133dd Importing crossbeam_channel from crossbeam reexport. 2018-11-19 09:19:28 +09:00
dependabot[bot]
7b9752f897 Update crossbeam-channel requirement from 0.2 to 0.3 (#436)
* Update crossbeam-channel requirement from 0.2 to 0.3

Updates the requirements on [crossbeam-channel](https://github.com/crossbeam-rs/crossbeam-channel) to permit the latest version.
- [Release notes](https://github.com/crossbeam-rs/crossbeam-channel/releases)
- [Changelog](https://github.com/crossbeam-rs/crossbeam-channel/blob/master/CHANGELOG.md)
- [Commits](https://github.com/crossbeam-rs/crossbeam-channel/commits/v0.3.0)

Signed-off-by: dependabot[bot] <support@dependabot.com>

* fixing build
2018-11-16 14:26:59 +09:00
dependabot[bot]
c92f41aea8 Update rand requirement from 0.5 to 0.6 (#440)
* Update rand requirement from 0.5 to 0.6

Updates the requirements on [rand](https://github.com/rust-random/rand) to permit the latest version.
- [Release notes](https://github.com/rust-random/rand/releases)
- [Changelog](https://github.com/rust-random/rand/blob/master/CHANGELOG.md)
- [Commits](https://github.com/rust-random/rand/commits)

Signed-off-by: dependabot[bot] <support@dependabot.com>

* Updating rand.
2018-11-16 12:38:01 +09:00
Do Duy
dea16f1d9d Derive Clone for QueryParser (#442) 2018-11-15 18:45:40 +09:00
dependabot[bot]
236cfbec08 Update crossbeam requirement from 0.4 to 0.5 (#438)
Updates the requirements on [crossbeam](https://github.com/crossbeam-rs/crossbeam) to permit the latest version.
- [Release notes](https://github.com/crossbeam-rs/crossbeam/releases)
- [Changelog](https://github.com/crossbeam-rs/crossbeam/blob/master/CHANGELOG.md)
- [Commits](https://github.com/crossbeam-rs/crossbeam/commits/crossbeam-0.5.0)

Signed-off-by: dependabot[bot] <support@dependabot.com>
2018-11-15 06:16:22 +09:00
Paul Masurel
edcafb69bb Fixed benches 2018-11-10 17:04:29 -08:00
Paul Masurel
14908479d5 Release 0.7.1 2018-11-02 17:56:25 +09:00
Dru Sellers
ab4593eeb7 Adds open_or_create method (#428)
* Change the semantic of Index::create_in_dir.

It should return an error if the directory already contains an Index.

* Index::open_or_create is working

* additional test

* Checking that schema matches on open_or_create.

Simplifying unit tests.

* simplifying Eq
2018-10-31 08:36:39 +09:00
Dru Sellers
e75bb1d6a1 Fix NGram processing of non-ascii characters (#430)
* A working version

* optimize the ngram parsing

* Decoding codepoint only once.

* Closes #429

* using leading_zeros to make code less cryptic

* lookup in a table
2018-10-31 08:35:27 +09:00
dependabot[bot]
63b9d62237 Update base64 requirement from 0.9.1 to 0.10.0 (#433)
Updates the requirements on [base64](https://github.com/alicemaz/rust-base64) to permit the latest version.
- [Release notes](https://github.com/alicemaz/rust-base64/releases)
- [Changelog](https://github.com/alicemaz/rust-base64/blob/master/RELEASE-NOTES.md)
- [Commits](https://github.com/alicemaz/rust-base64/commits/v0.10.0)

Signed-off-by: dependabot[bot] <support@dependabot.com>
2018-10-31 08:34:44 +09:00
158 changed files with 13128 additions and 4858 deletions

View File

@@ -61,6 +61,9 @@ before_script:
script: script:
- bash ci/script.sh - bash ci/script.sh
after_success:
- cargo doc-upload
before_deploy: before_deploy:
- sh ci/before_deploy.sh - sh ci/before_deploy.sh
@@ -68,6 +71,11 @@ cache: cargo
before_cache: before_cache:
# Travis can't cache files that are not readable by "others" # Travis can't cache files that are not readable by "others"
- chmod -R a+r $HOME/.cargo - chmod -R a+r $HOME/.cargo
- find ./target/debug -type f -maxdepth 1 -delete
- rm -f ./target/.rustc_info.json
- rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
- rm -r target/debug/examples/
- ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
#branches: #branches:
# only: # only:
@@ -77,4 +85,4 @@ before_cache:
notifications: notifications:
email: email:
on_success: never on_success: never

View File

@@ -1,3 +1,116 @@
Tantivy 0.10.0
=====================
*Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.*
- Added an ASCII folding filter (@drusellers)
- Bugfix in `query.count` in presence of deletes (@pmasurel)
Minor
---------
- Small simplification of the code.
Calling .freq() or .doc() when .advance() has never been called
on segment postings should panic from now on.
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
- Fast fields are now preloaded when the `SegmentReader` is created.
- `IndexMeta` is now public. (@hntd187)
- `IndexWriter::add_document` and `IndexWriter::delete_term` no longer require `&mut self`.
`IndexWriter` is `Sync`, making it possible to use it with an
`Arc<RwLock<IndexWriter>>`: `add_document` and `delete_term`
only require a read lock. (@pmasurel)
- Introducing `Opstamp` as an expressive type alias for `u64`. (@petr-tik)
- Stamper now relies on `AtomicU64` on all platforms (@petr-tik)
## How to update?
Your existing indexes are usable as is, but you may need some
trivial updates.
### Fast fields
Fast fields used to be accessed directly from the `SegmentReader`.
The API changed, you are now required to acquire your fast field reader via the
`segment_reader.fast_fields()`, and use one of the typed methods:
- `.u64()`, `.i64()` if your field is single-valued ;
- `.u64s()`, `.i64s()` if your field is multi-valued ;
- `.bytes()` if your field is bytes fast field.
Tantivy 0.9.0
=====================
*0.9.0 index format is not compatible with the
previous index format.*
- MAJOR BUGFIX :
Some `Mmap` objects were being leaked, and would never get released. (@fulmicoton)
- Removed most unsafe (@fulmicoton)
- Indexer memory footprint improved (VInt compression, inlining the first block). (@fulmicoton)
- Stemming in other languages is now possible (@pentlander)
- Segments with no docs are deleted earlier (@barrotsteindev)
- Added grouped add and delete operations.
They are guaranteed to happen together (i.e. they cannot be split by a commit).
In addition, adds are guaranteed to happen on the same segment. (@elbow-jason)
- Removed `INT_STORED` and `INT_INDEXED`. It is now possible to use `STORED` and `INDEXED`
for int fields. (@fulmicoton)
- Added DateTime field (@barrotsteindev)
- Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
- SIMD linear search within blocks (@fulmicoton)
## How to update ?
Tantivy 0.9 brought some API-breaking changes.
To update from tantivy 0.8, you will need to go through the following steps.
- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::STORED`.
- The index now does not hold the pool of searcher anymore. You are required to create an intermediary object called
`IndexReader` for this.
```rust
// create the reader. You typically need to create 1 reader for the entire
// lifetime of your program.
let reader = index.reader()?;
// Acquiring a searcher (previously `index.searcher()`) is now written:
let searcher = reader.searcher();
// With the default setting of the reader, you are not required to
// call `index.load_searchers()` anymore.
//
// The IndexReader will pick up that change automatically, regardless
// of whether the update was done in a different process or not.
// If this behavior is not wanted, you can create your reader with
// the `ReloadPolicy::Manual`, and manually decide when to reload the index
// by calling `reader.reload()?`.
```
Tantivy 0.8.2
=====================
Fixing build for non-x86_64 platforms. (#496)
No need to update from 0.8.1 if tantivy
is building on your platform.
Tantivy 0.8.1
=====================
Hotfix of #476.
Merge was reflecting deletes before commit was passed.
Thanks @barrotsteindev for reporting the bug.
Tantivy 0.8.0
=====================
*No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)
- Multithreaded search (@jwolfe, @fulmicoton)
Tantivy 0.7.1
=====================
*No change in the index format*
- Bugfix: NGramTokenizer panics on non ascii chars
- Added a space usage API
Tantivy 0.7 Tantivy 0.7
===================== =====================

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy" name = "tantivy"
version = "0.7.0" version = "0.10.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
@@ -12,50 +12,55 @@ readme = "README.md"
keywords = ["search", "information", "retrieval"] keywords = ["search", "information", "retrieval"]
[dependencies] [dependencies]
base64 = "0.9.1" base64 = "0.10.0"
byteorder = "1.0" byteorder = "1.0"
lazy_static = "1" lazy_static = "1"
regex = "1.0" regex = "1.0"
fst = {version="0.3", default-features=false} tantivy-fst = "0.1"
fst-regex = { version="0.2" } memmap = {version = "0.7", optional=true}
lz4 = {version="1.20", optional=true} lz4 = {version="1.20", optional=true}
snap = {version="0.2"} snap = {version="0.2"}
atomicwrites = {version="0.2.2", optional=true} atomicwrites = {version="0.2.2", optional=true}
tempfile = "3.0" tempfile = "3.0"
log = "0.4" log = "0.4"
combine = "3" combine = ">=3.6.0,<4.0.0"
tempdir = "0.3" tempdir = "0.3"
serde = "1.0" serde = "1.0"
serde_derive = "1.0" serde_derive = "1.0"
serde_json = "1.0" serde_json = "1.0"
num_cpus = "1.2" num_cpus = "1.2"
itertools = "0.7" fs2={version="0.4", optional=true}
itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]} levenshtein_automata = {version="0.1", features=["fst_automaton"]}
notify = {version="4", optional=true}
bit-set = "0.5" bit-set = "0.5"
uuid = { version = "0.7", features = ["v4", "serde"] } uuid = { version = "0.7.2", features = ["v4", "serde"] }
crossbeam = "0.4" crossbeam = "0.5"
crossbeam-channel = "0.2"
futures = "0.1" futures = "0.1"
futures-cpupool = "0.1" futures-cpupool = "0.1"
owning_ref = "0.4" owning_ref = "0.4"
stable_deref_trait = "1.0.0" stable_deref_trait = "1.0.0"
rust-stemmers = "1" rust-stemmers = "1.1"
downcast = { version="0.9" } downcast-rs = { version="1.0" }
matches = "0.1" bitpacking = "0.6"
bitpacking = "0.5" census = "0.2"
census = "0.1"
fnv = "1.0.6" fnv = "1.0.6"
owned-read = "0.4" owned-read = "0.4"
failure = "0.1" failure = "0.1"
htmlescape = "0.3.1" htmlescape = "0.3.1"
fail = "0.2" fail = "0.2"
scoped-pool = "1.0"
murmurhash32 = "0.2"
chrono = "0.4"
[target.'cfg(windows)'.dependencies] [target.'cfg(windows)'.dependencies]
winapi = "0.2" winapi = "0.2"
[dev-dependencies] [dev-dependencies]
rand = "0.5" rand = "0.6"
maplit = "1" maplit = "1"
matches = "0.1.8"
time = "0.1.42"
[profile.release] [profile.release]
opt-level = 3 opt-level = 3
@@ -69,11 +74,11 @@ overflow-checks = true
[features] [features]
# by default no-fail is disabled. We manually enable it when running test. # by default no-fail is disabled. We manually enable it when running test.
default = ["mmap", "no_fail"] default = ["mmap", "no_fail"]
mmap = ["fst/mmap", "atomicwrites"] mmap = ["atomicwrites", "fs2", "memmap", "notify"]
lz4-compression = ["lz4"] lz4-compression = ["lz4"]
no_fail = ["fail/no_fail"] no_fail = ["fail/no_fail"]
unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[badges] [badges]
travis-ci = { repository = "tantivy-search/tantivy" } travis-ci = { repository = "tantivy-search/tantivy" }

View File

@@ -17,19 +17,29 @@
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/6)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6) [![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/6)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/7)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7) [![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/7)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
[![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
**Tantivy** is a **full text search engine library** written in rust. **Tantivy** is a **full text search engine library** written in rust.
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine. to build such a search engine.
Tantivy is, in fact, strongly inspired by Lucene's design. Tantivy is, in fact, strongly inspired by Lucene's design.
# Benchmark
Tantivy is typically faster than Lucene, but the results will depend on
the nature of the queries in your workload.
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
performance for different types of queries / collections.
# Features # Features
- Full-text search - Full-text search
- Configurable tokenizer (stemming available for 17 Latin languages; third-party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter))
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:) - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command line tools - Tiny startup time (<10ms), perfect for command line tools
- BM25 scoring (the same as lucene) - BM25 scoring (the same as lucene)
@@ -41,6 +51,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set. - SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene) - Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields - `&[u8]` fast fields
- Text, i64, u64, dates and hierarchical facet fields
- LZ4 compressed document store - LZ4 compressed document store
- Range queries - Range queries
- Faceted search - Faceted search
@@ -76,7 +87,7 @@ It will walk you through getting a wikipedia search engine up and running in a f
Tantivy compiles on stable rust but requires `Rust >= 1.27`. Tantivy compiles on stable rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run : To check out and run tests, you can simply run :
git clone git@github.com:tantivy-search/tantivy.git git clone https://github.com/tantivy-search/tantivy.git
cd tantivy cd tantivy
cargo build cargo build
@@ -85,6 +96,14 @@ To check out and run tests, you can simply run :
Some tests will not run with just `cargo test` because of `fail-rs`. Some tests will not run with just `cargo test` because of `fail-rs`.
To run the tests exhaustively, run `./run-tests.sh`. To run the tests exhaustively, run `./run-tests.sh`.
# Contribute # How can I support this project ?
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy. There are many ways to support this project.
- If you use tantivy, tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
- Report bugs
- Write a blog post
- Complete documentation
- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
- Talk about tantivy around you
- Drop a word on [![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton) or even [![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)

View File

@@ -16,10 +16,12 @@ extern crate tempdir;
// Importing tantivy... // Importing tantivy...
#[macro_use] #[macro_use]
extern crate tantivy; extern crate tantivy;
use tantivy::collector::TopCollector; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::Index; use tantivy::Index;
use tantivy::ReloadPolicy;
use tempdir::TempDir;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the // Let's create a temporary directory for the
@@ -34,7 +36,7 @@ fn main() -> tantivy::Result<()> {
// be indexed". // be indexed".
// first we need to define a schema ... // first we need to define a schema ...
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
// Our first field is title. // Our first field is title.
// We want full-text search for it, and we also want // We want full-text search for it, and we also want
@@ -105,37 +107,37 @@ fn main() -> tantivy::Result<()> {
// For convenience, tantivy also comes with a macro to // For convenience, tantivy also comes with a macro to
// reduce the boilerplate above. // reduce the boilerplate above.
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Of Mice and Men", title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \ bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \ over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \ side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \ Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
)); ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Of Mice and Men", title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \ bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \ over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \ side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \ Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
)); ));
// Multivalued field just need to be repeated. // Multivalued field just need to be repeated.
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankenstein", title => "Frankenstein",
title => "The Modern Prometheus", title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \ body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \ enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \ yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking." increasing confidence in the success of my undertaking."
)); ));
// This is an example, so we will only index 3 documents // This is an example, so we will only index 3 documents
@@ -169,24 +171,33 @@ fn main() -> tantivy::Result<()> {
// //
// ### Searcher // ### Searcher
// //
// Let's search our index. Start by reloading // A reader is required to get search the index.
// searchers in the index. This should be done // It acts as a `Searcher` pool that reloads itself,
// after every `commit()`. // depending on a `ReloadPolicy`.
index.load_searchers()?; //
// For a search server you will typically create one reader for the entire lifetime of your
// program, and acquire a new searcher for every single request.
//
// In the code below, we rely on the `OnCommit` policy: the reader
// will reload the index automatically after each commit.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
// We now need to acquire a searcher. // We now need to acquire a searcher.
// Some search experience might require more than
// one query.
// //
// The searcher ensure that we get to work // A searcher points to snapshotted, immutable version of the index.
// with a consistent version of the index. //
// Some search experience might require more than
// one query. Using the same searcher ensures that all of these queries will run on the
// same version of the index.
// //
// Acquiring a `searcher` is very cheap. // Acquiring a `searcher` is very cheap.
// //
// You should acquire a searcher every time you // You should acquire a searcher every time you start processing a request and
// start processing a request and
// and release it right after your query is finished. // and release it right after your query is finished.
let searcher = index.searcher(); let searcher = reader.searcher();
// ### Query // ### Query
@@ -212,15 +223,10 @@ fn main() -> tantivy::Result<()> {
// //
// We are not interested in all of the documents but // We are not interested in all of the documents but
// only in the top 10. Keeping track of our top 10 best documents // only in the top 10. Keeping track of our top 10 best documents
// is the role of the TopCollector. // is the role of the TopDocs.
let mut top_collector = TopCollector::with_limit(10);
// We can now perform our query. // We can now perform our query.
searcher.search(&*query, &mut top_collector)?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
// Our top collector now contains the 10
// most relevant doc ids...
let doc_addresses = top_collector.docs();
// The actual documents still need to be // The actual documents still need to be
// retrieved from Tantivy's store. // retrieved from Tantivy's store.
@@ -228,13 +234,10 @@ fn main() -> tantivy::Result<()> {
// Since the body field was not configured as stored, // Since the body field was not configured as stored,
// the document returned will only contain // the document returned will only contain
// a title. // a title.
for (_score, doc_address) in top_docs {
for doc_address in doc_addresses {
let retrieved_doc = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc)); println!("{}", schema.to_json(&retrieved_doc));
} }
Ok(()) Ok(())
} }
use tempdir::TempDir;

View File

@@ -0,0 +1,196 @@
// # Custom collector example
//
// This example shows how you can implement your own
// collector. As an example, we will compute a collector
// that computes the standard deviation of a given fast field.
//
// Of course, you can have a look at tantivy's built-in collectors
// such as `Count` for more examples.
extern crate tempdir;
// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::Field;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::SegmentReader;
use tantivy::{Index, TantivyError};
/// Running aggregates from which the mean and the standard deviation of a
/// set of values can be derived without keeping the values themselves.
#[derive(Default)]
struct Stats {
    /// Number of values accumulated so far.
    count: usize,
    /// Sum of the accumulated values.
    sum: f64,
    /// Sum of the squares of the accumulated values.
    squared_sum: f64,
}

impl Stats {
    /// Returns the number of accumulated values.
    pub fn count(&self) -> usize {
        self.count
    }

    /// Returns the arithmetic mean of the accumulated values.
    ///
    /// The result is `NaN` if no value was accumulated (`0.0 / 0.0`).
    pub fn mean(&self) -> f64 {
        self.sum / (self.count as f64)
    }

    /// Returns the mean of the squares of the accumulated values.
    fn square_mean(&self) -> f64 {
        self.squared_sum / (self.count as f64)
    }

    /// Returns the standard deviation of the accumulated values, computed
    /// as `sqrt(E[x²] - E[x]²)`.
    ///
    /// The result is `NaN` if no value was accumulated.
    pub fn standard_deviation(&self) -> f64 {
        let mean = self.mean();
        let variance = self.square_mean() - mean * mean;
        // Floating-point cancellation can push the difference slightly below
        // zero when the values are (nearly) identical. The square root would
        // then be `NaN`, so a negative variance is reported as no deviation.
        // A `NaN` variance (empty stats) fails the comparison and is kept.
        if variance < 0.0 {
            0.0
        } else {
            variance.sqrt()
        }
    }

    /// Returns `None` if no value was accumulated, `Some(self)` otherwise.
    fn non_zero_count(self) -> Option<Stats> {
        if self.count == 0 {
            None
        } else {
            Some(self)
        }
    }
}
/// Collector aggregating `Stats` over the values of a single field.
struct StatsCollector {
    /// Field whose values are aggregated.
    field: Field,
}

impl StatsCollector {
    /// Builds a collector aggregating the values of `field`.
    fn with_field(field: Field) -> StatsCollector {
        Self { field }
    }
}
impl Collector for StatsCollector {
    // The result of a search with this collector: the aggregated
    // statistics, or `None` when no document was collected.
    type Fruit = Option<Stats>;

    type Child = StatsSegmentCollector;

    // Creates the per-segment collector, bound to the segment's
    // u64 fast field reader for `self.field`.
    fn for_segment(
        &self,
        _segment_local_id: u32,
        segment_reader: &SegmentReader,
    ) -> tantivy::Result<StatsSegmentCollector> {
        let fast_field_reader = match segment_reader.fast_fields().u64(self.field) {
            Some(reader) => reader,
            None => {
                // The field is not available as a u64 fast field:
                // report it as a schema error naming the field.
                let field_name = segment_reader.schema().get_field_name(self.field);
                return Err(TantivyError::SchemaError(format!(
                    "Field {:?} is not a u64 fast field.",
                    field_name
                )));
            }
        };
        Ok(StatsSegmentCollector {
            fast_field_reader,
            stats: Stats::default(),
        })
    }

    fn requires_scoring(&self) -> bool {
        // The scores are never read by this collector.
        false
    }

    // Sums up the partial aggregates of all the segments that
    // collected at least one document.
    fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
        let mut merged = Stats::default();
        for partial in segment_stats.into_iter().flatten() {
            merged.count += partial.count;
            merged.sum += partial.sum;
            merged.squared_sum += partial.squared_sum;
        }
        Ok(merged.non_zero_count())
    }
}
/// Per-segment half of the `StatsCollector`.
struct StatsSegmentCollector {
    /// Reader used to fetch the value of each collected document.
    fast_field_reader: FastFieldReader<u64>,
    /// Aggregates of the documents collected so far.
    stats: Stats,
}

impl SegmentCollector for StatsSegmentCollector {
    type Fruit = Option<Stats>;

    // Reads the fast field value of `doc` and folds it into the aggregates.
    fn collect(&mut self, doc: u32, _score: f32) {
        let value = self.fast_field_reader.get(doc) as f64;
        let stats = &mut self.stats;
        stats.count += 1;
        stats.sum += value;
        stats.squared_sum += value * value;
    }

    // Hands over the aggregates, or `None` if nothing was collected.
    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
        let StatsSegmentCollector { stats, .. } = self;
        stats.non_zero_count()
    }
}
fn main() -> tantivy::Result<()> {
    // # Defining the schema
    //
    // A tantivy index is built against a strict schema: it lists the
    // fields of the index and, for each of them, its type and
    // "the way it should be indexed".
    let mut schema_builder = Schema::builder();

    // Our fictional index holds products, each of them with
    // a name, a description and a price.
    let name_field = schema_builder.add_text_field("name", TEXT);
    let description_field = schema_builder.add_text_field("description", TEXT);
    let price_field = schema_builder.add_u64_field("price", INDEXED | FAST);
    let schema = schema_builder.build();

    // # Indexing documents
    //
    // Let's index a handful of fake products for the sake of
    // this example.
    let index = Index::create_in_ram(schema.clone());
    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        name_field => "Super Broom 2000",
        description_field => "While it is ok for short distance travel, this broom \
        was designed quiditch. It will up your game.",
        price_field => 30_200u64
    ));
    index_writer.add_document(doc!(
        name_field => "Turbulobroom",
        description_field => "You might have heard of this broom before : it is the sponsor of the Wales team.\
        You'll enjoy its sharp turns, and rapid acceleration",
        price_field => 29_240u64
    ));
    index_writer.add_document(doc!(
        name_field => "Broomio",
        description_field => "Great value for the price. This broom is a market favorite",
        price_field => 21_240u64
    ));
    index_writer.add_document(doc!(
        name_field => "Whack a Mole",
        description_field => "Prime quality bat.",
        price_field => 5_200u64
    ));
    index_writer.commit()?;

    // # Searching
    let reader = index.reader()?;
    let searcher = reader.searcher();
    let default_fields = vec![name_field, description_field];
    let query_parser = QueryParser::for_index(&index, default_fields);

    // We compute the price statistics of the products matching "broom".
    let query = query_parser.parse_query("broom")?;
    let collector = StatsCollector::with_field(price_field);
    if let Some(stats) = searcher.search(&query, &collector)? {
        println!("count: {}", stats.count());
        println!("mean: {}", stats.mean());
        println!("standard deviation: {}", stats.standard_deviation());
    }

    Ok(())
}

View File

@@ -5,7 +5,7 @@
#[macro_use] #[macro_use]
extern crate tantivy; extern crate tantivy;
use tantivy::collector::TopCollector; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer; use tantivy::tokenizer::NgramTokenizer;
@@ -20,7 +20,7 @@ fn main() -> tantivy::Result<()> {
// be indexed". // be indexed".
// first we need to define a schema ... // first we need to define a schema ...
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
// Our first field is title. // Our first field is title.
// In this example we want to use NGram searching // In this example we want to use NGram searching
@@ -68,12 +68,12 @@ fn main() -> tantivy::Result<()> {
// heap for the indexer can increase its throughput. // heap for the indexer can increase its throughput.
let mut index_writer = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "The Old Man and the Sea", title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish." he had gone eighty-four days now without taking a fish."
)); ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Of Mice and Men", title => "Of Mice and Men",
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
bank and runs deep and green. The water is warm too, for it has slipped twinkling bank and runs deep and green. The water is warm too, for it has slipped twinkling
over the yellow sands in the sunlight before reaching the narrow pool. On one over the yellow sands in the sunlight before reaching the narrow pool. On one
@@ -84,16 +84,16 @@ fn main() -> tantivy::Result<()> {
limbs and branches that arch over the pool"# limbs and branches that arch over the pool"#
)); ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankenstein", title => "Frankenstein",
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
enterprise which you have regarded with such evil forebodings. I arrived here enterprise which you have regarded with such evil forebodings. I arrived here
yesterday, and my first task is to assure my dear sister of my welfare and yesterday, and my first task is to assure my dear sister of my welfare and
increasing confidence in the success of my undertaking."# increasing confidence in the success of my undertaking."#
)); ));
index_writer.commit()?; index_writer.commit()?;
index.load_searchers()?;
let searcher = index.searcher(); let reader = index.reader()?;
let searcher = reader.searcher();
// The query parser can interpret human queries. // The query parser can interpret human queries.
// Here, if the user does not specify which // Here, if the user does not specify which
@@ -104,11 +104,9 @@ fn main() -> tantivy::Result<()> {
// here we want to get a hit on the 'ken' in Frankenstein // here we want to get a hit on the 'ken' in Frankenstein
let query = query_parser.parse_query("ken")?; let query = query_parser.parse_query("ken")?;
let mut top_collector = TopCollector::with_limit(10); let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
searcher.search(&*query, &mut top_collector)?;
let doc_addresses = top_collector.docs(); for (_, doc_address) in top_docs {
for doc_address in doc_addresses {
let retrieved_doc = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc)); println!("{}", schema.to_json(&retrieved_doc));
} }

View File

@@ -10,16 +10,20 @@
// Importing tantivy... // Importing tantivy...
#[macro_use] #[macro_use]
extern crate tantivy; extern crate tantivy;
use tantivy::collector::TopCollector; use tantivy::collector::TopDocs;
use tantivy::query::TermQuery; use tantivy::query::TermQuery;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::Index; use tantivy::Index;
use tantivy::IndexReader;
// A simple helper function to fetch a single document // A simple helper function to fetch a single document
// given its id from our index. // given its id from our index.
// It will be helpful to check our work. // It will be helpful to check our work.
fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Option<Document>> { fn extract_doc_given_isbn(
let searcher = index.searcher(); reader: &IndexReader,
isbn_term: &Term,
) -> tantivy::Result<Option<Document>> {
let searcher = reader.searcher();
// This is the simplest query you can think of. // This is the simplest query you can think of.
// It matches all of the documents containing a specific term. // It matches all of the documents containing a specific term.
@@ -27,10 +31,9 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Op
// The second argument is here to tell we don't care about decoding positions, // The second argument is here to tell we don't care about decoding positions,
// or term frequencies. // or term frequencies.
let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic); let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic);
let mut top_collector = TopCollector::with_limit(1); let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1))?;
searcher.search(&term_query, &mut top_collector)?;
if let Some(doc_address) = top_collector.docs().first() { if let Some((_score, doc_address)) = top_docs.first() {
let doc = searcher.doc(*doc_address)?; let doc = searcher.doc(*doc_address)?;
Ok(Some(doc)) Ok(Some(doc))
} else { } else {
@@ -44,7 +47,7 @@ fn main() -> tantivy::Result<()> {
// //
// Check out the *basic_search* example if this makes // Check out the *basic_search* example if this makes
// small sense to you. // small sense to you.
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
// Tantivy does not really have a notion of primary id. // Tantivy does not really have a notion of primary id.
// This may change in the future. // This may change in the future.
@@ -86,12 +89,12 @@ fn main() -> tantivy::Result<()> {
isbn => "978-9176370711", isbn => "978-9176370711",
)); ));
index_writer.commit()?; index_writer.commit()?;
index.load_searchers()?; let reader = index.reader()?;
let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711"); let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");
// Oops, our frankenstein doc seems misspelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap(); let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!( assert_eq!(
schema.to_json(&frankenstein_doc_misspelled), schema.to_json(&frankenstein_doc_misspelled),
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#, r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
@@ -130,10 +133,10 @@ fn main() -> tantivy::Result<()> {
// Everything happened as if the document was updated. // Everything happened as if the document was updated.
index_writer.commit()?; index_writer.commit()?;
// We reload our searcher to make our change available to clients. // We reload our searcher to make our change available to clients.
index.load_searchers()?; reader.reload()?;
// No more typo! // No more typo!
let frankenstein_new_doc = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap(); let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!( assert_eq!(
schema.to_json(&frankenstein_new_doc), schema.to_json(&frankenstein_new_doc),
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#, r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,

View File

@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the // Let's create a temporary directory for the
// sake of this example // sake of this example
let index_path = TempDir::new("tantivy_facet_example_dir")?; let index_path = TempDir::new("tantivy_facet_example_dir")?;
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
schema_builder.add_text_field("name", TEXT | STORED); schema_builder.add_text_field("name", TEXT | STORED);
@@ -55,18 +55,17 @@ fn main() -> tantivy::Result<()> {
index_writer.commit()?; index_writer.commit()?;
index.load_searchers()?; let reader = index.reader()?;
let searcher = index.searcher(); let searcher = reader.searcher();
let mut facet_collector = FacetCollector::for_field(tags); let mut facet_collector = FacetCollector::for_field(tags);
facet_collector.add_facet("/pools"); facet_collector.add_facet("/pools");
searcher.search(&AllQuery, &mut facet_collector).unwrap(); let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
let counts = facet_collector.harvest();
// This lists all of the facet counts // This lists all of the facet counts
let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect(); let facets: Vec<(&Facet, u64)> = facet_counts.get("/pools").collect();
assert_eq!( assert_eq!(
facets, facets,
vec![ vec![

View File

@@ -0,0 +1,43 @@
// # Searching a range on an indexed int field.
//
// Below is an example of creating an indexed integer field in your schema
// You can use RangeQuery to get a Count of all occurrences in a given range.
#[macro_use]
extern crate tantivy;
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::Index;
use tantivy::Result;
fn run() -> Result<()> {
    // To keep things simple, the schema holds a single field.
    let mut schema_builder = Schema::builder();

    // `INDEXED` is a short-hand stating that the field should be "searchable".
    let year_field = schema_builder.add_u64_field("year", INDEXED);
    let index = Index::create_in_ram(schema_builder.build());
    let reader = index.reader()?;
    {
        // The index holds one document per year in the range below.
        let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
        for year in 1950u64..2019u64 {
            index_writer.add_document(doc!(year_field => year));
        }
        index_writer.commit()?;
    }
    reader.reload()?;
    let searcher = reader.searcher();

    // The end of the range is excluded, i.e. we are searching up to 1969.
    let sixties = 1960u64..1970u64;
    let docs_in_the_sixties = RangeQuery::new_u64(year_field, sixties);

    // The `Count` collector returns the number of docs within the range.
    let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
    assert_eq!(num_60s_books, 10);
    Ok(())
}
// Runs the example, panicking if it returned an error.
fn main() {
    run().unwrap();
}

View File

@@ -18,7 +18,7 @@ use tantivy::{DocId, DocSet, Postings};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the // We first create a schema for the sake of the
// example. Check the `basic_search` example for more information. // example. Check the `basic_search` example for more information.
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
// For this example, we need to make sure to index positions for our title // For this example, we need to make sure to index positions for our title
// field. `TEXT` precisely does this. // field. `TEXT` precisely does this.
@@ -33,9 +33,9 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!(title => "The modern Promotheus")); index_writer.add_document(doc!(title => "The modern Promotheus"));
index_writer.commit()?; index_writer.commit()?;
index.load_searchers()?; let reader = index.reader()?;
let searcher = index.searcher(); let searcher = reader.searcher();
// A tantivy index is actually a collection of segments. // A tantivy index is actually a collection of segments.
// Similarly, a searcher just wraps a list `segment_reader`. // Similarly, a searcher just wraps a list `segment_reader`.

View File

@@ -0,0 +1,107 @@
// # Indexing from different threads.
//
// It is fairly common to have to index from different threads.
// Tantivy forbids creating more than one `IndexWriter` at a time.
//
// This `IndexWriter` itself has its own multithreaded layer, so managing your own
// indexing threads will not help. However, it can still be useful for some applications.
//
// For instance, if preparing documents to send to tantivy before indexing is the bottleneck of
// your application, it is reasonable to have multiple threads.
//
// Another very common reason to want to index from multiple threads is implementing a webserver
// with CRUD capabilities. The server framework will most likely handle requests from
// different threads.
//
// The recommended way to address both of these use cases is to wrap your `IndexWriter` into a
// `Arc<RwLock<IndexWriter>>`.
//
// While this is counterintuitive, adding and deleting documents do not require mutability
// over the `IndexWriter`, so several threads will be able to do this operation concurrently.
//
// The example below does not represent an actual real-life use case (who would spawn a thread to
// index a single document?), but aims at demonstrating the mechanism that makes indexing
// from several threads possible.
extern crate tempdir;
// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use std::sync::{Arc, RwLock};
use std::thread;
use std::time::Duration;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Opstamp;
use tantivy::{Index, IndexWriter};
fn main() -> tantivy::Result<()> {
// # Defining the schema
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT | STORED);
let body = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let index_writer: Arc<RwLock<IndexWriter>> = Arc::new(RwLock::new(index.writer(50_000_000)?));
// # First indexing thread.
let index_writer_clone_1 = index_writer.clone();
thread::spawn(move || {
// we index 100 times the document... for the sake of the example.
for i in 0..100 {
let opstamp = {
// A read lock is sufficient here.
let index_writer_rlock = index_writer_clone_1.read().unwrap();
index_writer_rlock.add_document(
doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
))
};
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(20));
}
});
// # Second indexing thread.
let index_writer_clone_2 = index_writer.clone();
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
thread::spawn(move || {
// we index 100 times the document... for the sake of the example.
for i in 0..100 {
// A read lock is sufficient here.
let opstamp = {
let index_writer_rlock = index_writer_clone_2.read().unwrap();
index_writer_rlock.add_document(doc!(
title => "Manufacturing consent",
body => "Some great book description..."
))
};
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(10));
}
});
// # In the main thread, we commit 10 times, once every 500ms.
for _ in 0..10 {
let opstamp: Opstamp = {
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
let mut index_writer_wlock = index_writer.write().unwrap();
index_writer_wlock.commit().unwrap()
};
println!("committed with opstamp {}", opstamp);
thread::sleep(Duration::from_millis(500));
}
Ok(())
}

View File

@@ -10,11 +10,11 @@ extern crate tempdir;
// Importing tantivy... // Importing tantivy...
#[macro_use] #[macro_use]
extern crate tantivy; extern crate tantivy;
use tantivy::collector::TopCollector; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::Index; use tantivy::Index;
use tantivy::SnippetGenerator; use tantivy::{Snippet, SnippetGenerator};
use tempdir::TempDir; use tempdir::TempDir;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
let index_path = TempDir::new("tantivy_example_dir")?; let index_path = TempDir::new("tantivy_example_dir")?;
// # Defining the schema // # Defining the schema
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT | STORED); let title = schema_builder.add_text_field("title", TEXT | STORED);
let body = schema_builder.add_text_field("body", TEXT | STORED); let body = schema_builder.add_text_field("body", TEXT | STORED);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -35,37 +35,52 @@ fn main() -> tantivy::Result<()> {
// we'll only need one doc for this example. // we'll only need one doc for this example.
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Of Mice and Men", title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \ bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \ over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \ side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \ Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
)); ));
// ... // ...
index_writer.commit()?; index_writer.commit()?;
index.load_searchers()?; let reader = index.reader()?;
let searcher = reader.searcher();
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, body]); let query_parser = QueryParser::for_index(&index, vec![title, body]);
let query = query_parser.parse_query("sycamore spring")?; let query = query_parser.parse_query("sycamore spring")?;
let mut top_collector = TopCollector::with_limit(10); let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
searcher.search(&*query, &mut top_collector)?;
let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?; let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
let doc_addresses = top_collector.docs(); for (score, doc_address) in top_docs {
for doc_address in doc_addresses {
let doc = searcher.doc(doc_address)?; let doc = searcher.doc(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc); let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {}:", score);
println!("title: {}", doc.get_first(title).unwrap().text().unwrap()); println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
println!("snippet: {}", snippet.to_html()); println!("snippet: {}", snippet.to_html());
println!("custom highlighting: {}", highlight(snippet));
} }
Ok(()) Ok(())
} }
/// Renders `snippet` with custom markers: every highlighted range of the
/// fragment text is wrapped as ` --> range <-- `, while the text between
/// ranges is copied unchanged.
fn highlight(snippet: Snippet) -> String {
    let mut rendered = String::new();
    // Offset into the fragment text up to which everything was already copied.
    let mut cursor = 0;
    for section in snippet.highlighted().iter() {
        let (start, end) = section.bounds();
        // Text sitting between the previous highlighted range and this one.
        rendered.push_str(&snippet.fragments()[cursor..start]);
        rendered.push_str(" --> ");
        rendered.push_str(&snippet.fragments()[start..end]);
        rendered.push_str(" <-- ");
        cursor = end;
    }
    // Trailing text after the last highlighted range (or the whole fragment
    // when nothing was highlighted).
    rendered.push_str(&snippet.fragments()[cursor..]);
    rendered
}

View File

@@ -15,7 +15,7 @@ extern crate tempdir;
// Importing tantivy... // Importing tantivy...
#[macro_use] #[macro_use]
extern crate tantivy; extern crate tantivy;
use tantivy::collector::TopCollector; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::tokenizer::*; use tantivy::tokenizer::*;
@@ -23,7 +23,7 @@ use tantivy::Index;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search` // this example assumes you understand the content in `basic_search`
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
// This configures your custom options for how tantivy will // This configures your custom options for how tantivy will
// store and process your content in the index; The key // store and process your content in the index; The key
@@ -72,48 +72,44 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea", title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish." he had gone eighty-four days now without taking a fish."
)); ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Of Mice and Men", title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \ bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \ over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \ side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \ Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \ fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
)); ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Frankenstein", title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \ body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \ enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \ yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking." increasing confidence in the success of my undertaking."
)); ));
index_writer.commit()?; index_writer.commit()?;
index.load_searchers()?; let reader = index.reader()?;
let searcher = index.searcher(); let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, body]); let query_parser = QueryParser::for_index(&index, vec![title, body]);
// stop words are applied on the query as well. // stop words are applied on the query as well.
// The following will be equivalent to `title:frankenstein` // The following will be equivalent to `title:frankenstein`
let query = query_parser.parse_query("title:\"the Frankenstein\"")?; let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let mut top_collector = TopCollector::with_limit(10); for (score, doc_address) in top_docs {
searcher.search(&*query, &mut top_collector)?;
let doc_addresses = top_collector.docs();
for doc_address in doc_addresses {
let retrieved_doc = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("\n==\nDocument score {}:", score);
println!("{}", schema.to_json(&retrieved_doc)); println!("{}", schema.to_json(&retrieved_doc));
} }

View File

@@ -9,10 +9,10 @@ fn main() -> tantivy::Result<()> {
// Check out the basic example if this is confusing to you. // Check out the basic example if this is confusing to you.
// //
// first we need to define a schema ... // first we need to define a schema ...
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", TEXT | STORED); schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("body", TEXT); schema_builder.add_text_field("body", TEXT);
schema_builder.add_u64_field("year", INT_INDEXED); schema_builder.add_u64_field("year", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
// Let's assume we have a json-serialized document. // Let's assume we have a json-serialized document.

View File

@@ -1,142 +0,0 @@
use collector::Collector;
use DocId;
use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
/// A collector that ignores everything it is given.
///
/// It is the neutral element used to seed a collector chain; since every
/// method is empty and marked `#[inline]`, the hope is that the compiler
/// optimizes it away.
pub struct DoNothingCollector;

impl Collector for DoNothingCollector {
    /// Accepts any segment: there is no per-segment state to prepare.
    #[inline]
    fn set_segment(
        &mut self,
        _segment_local_id: SegmentLocalId,
        _segment: &SegmentReader,
    ) -> Result<()> {
        Ok(())
    }

    /// Drops the document without recording anything.
    #[inline]
    fn collect(&mut self, _: DocId, _: Score) {}

    /// Scores are never read, so scoring is not required.
    #[inline]
    fn requires_scoring(&self) -> bool {
        false
    }
}
/// Zero-cost abstraction used to collect on multiple collectors at once.
///
/// This contraption is only usable if the types of your collectors
/// are known at compile time, as each call to `push` changes the
/// type of the chain.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::{CountCollector, TopCollector, chain};
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of Muadib",
/// ));
/// index_writer.add_document(doc!(
/// title => "A Dairy Cow",
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut top_collector = TopCollector::with_limit(2);
/// let mut count_collector = CountCollector::default();
/// {
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut collectors).unwrap();
/// }
/// assert_eq!(count_collector.count(), 2);
/// assert!(top_collector.at_capacity());
/// }
///
/// Ok(())
/// }
/// ```
pub struct ChainedCollector<Left: Collector, Right: Collector> {
// Collector that is always invoked first (the chain built so far when
// the value comes from `push`).
left: Left,
// Collector that is invoked after `left`.
right: Right,
}
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
    /// Appends `new_collector` to the chain.
    ///
    /// The current chain is consumed and becomes the `left` side of the
    /// returned `ChainedCollector`; the mutable borrow of `new_collector`
    /// becomes its `right` side.
    pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
        let (left, right) = (self, new_collector);
        ChainedCollector { left, right }
    }
}
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
self.left.set_segment(segment_local_id, segment)?;
self.right.set_segment(segment_local_id, segment)?;
Ok(())
}
fn collect(&mut self, doc: DocId, score: Score) {
self.left.collect(doc, score);
self.right.collect(doc, score);
}
fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
}
}
/// Creates the seed of a `ChainedCollector`.
///
/// Both sides start out as `DoNothingCollector`; real collectors are
/// added afterwards with `push`.
pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
    let (left, right) = (DoNothingCollector, DoNothingCollector);
    ChainedCollector { left, right }
}
#[cfg(test)]
mod tests {
    use super::*;
    use collector::{Collector, CountCollector, TopCollector};

    /// Every document pushed through the chain must reach each of the
    /// chained collectors.
    #[test]
    fn test_chained_collector() {
        let mut top_collector = TopCollector::with_limit(2);
        let mut count_collector = CountCollector::default();
        {
            // The chain only borrows the collectors; the scope ends the
            // borrows so the results can be inspected below.
            let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
            for &(doc, score) in &[(1, 0.2), (2, 0.1), (3, 0.5)] {
                collectors.collect(doc, score);
            }
        }
        assert_eq!(count_collector.count(), 3);
        assert!(top_collector.at_capacity());
    }
}

View File

@@ -1,4 +1,5 @@
use super::Collector; use super::Collector;
use collector::SegmentCollector;
use DocId; use DocId;
use Result; use Result;
use Score; use Score;
@@ -11,14 +12,14 @@ use SegmentReader;
/// ```rust /// ```rust
/// #[macro_use] /// #[macro_use]
/// extern crate tantivy; /// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT}; /// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{Index, Result}; /// use tantivy::{Index, Result};
/// use tantivy::collector::CountCollector; /// use tantivy::collector::Count;
/// use tantivy::query::QueryParser; /// use tantivy::query::QueryParser;
/// ///
/// # fn main() { example().unwrap(); } /// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> { /// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
@@ -39,63 +40,90 @@ use SegmentReader;
/// index_writer.commit().unwrap(); /// index_writer.commit().unwrap();
/// } /// }
/// ///
/// index.load_searchers()?; /// let reader = index.reader()?;
/// let searcher = index.searcher(); /// let searcher = reader.searcher();
/// ///
/// { /// {
/// let mut count_collector = CountCollector::default();
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?; /// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut count_collector).unwrap(); /// let count = searcher.search(&query, &Count).unwrap();
/// ///
/// assert_eq!(count_collector.count(), 2); /// assert_eq!(count, 2);
/// } /// }
/// ///
/// Ok(()) /// Ok(())
/// } /// }
/// ``` /// ```
#[derive(Default)] pub struct Count;
pub struct CountCollector {
count: usize,
}
impl CountCollector { impl Collector for Count {
/// Returns the count of documents that were type Fruit = usize;
/// collected.
pub fn count(&self) -> usize {
self.count
}
}
impl Collector for CountCollector { type Child = SegmentCountCollector;
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
fn collect(&mut self, _: DocId, _: Score) { fn for_segment(&self, _: SegmentLocalId, _: &SegmentReader) -> Result<SegmentCountCollector> {
self.count += 1; Ok(SegmentCountCollector::default())
} }
fn requires_scoring(&self) -> bool { fn requires_scoring(&self) -> bool {
false false
} }
fn merge_fruits(&self, segment_counts: Vec<usize>) -> Result<usize> {
Ok(segment_counts.into_iter().sum())
}
}
#[derive(Default)]
pub struct SegmentCountCollector {
count: usize,
}
impl SegmentCollector for SegmentCountCollector {
type Fruit = usize;
fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
}
fn harvest(self) -> usize {
self.count
}
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::{Count, SegmentCountCollector};
use collector::{Collector, CountCollector}; use collector::Collector;
use collector::SegmentCollector;
#[test] #[test]
fn test_count_collector() { fn test_count_collect_does_not_requires_scoring() {
let mut count_collector = CountCollector::default(); assert!(!Count.requires_scoring());
assert_eq!(count_collector.count(), 0); }
count_collector.collect(0u32, 1f32);
assert_eq!(count_collector.count(), 1); #[test]
assert_eq!(count_collector.count(), 1); fn test_segment_count_collector() {
count_collector.collect(1u32, 1f32); {
assert_eq!(count_collector.count(), 2); let count_collector = SegmentCountCollector::default();
assert!(!count_collector.requires_scoring()); assert_eq!(count_collector.harvest(), 0);
}
{
let mut count_collector = SegmentCountCollector::default();
count_collector.collect(0u32, 1f32);
assert_eq!(count_collector.harvest(), 1);
}
{
let mut count_collector = SegmentCountCollector::default();
count_collector.collect(0u32, 1f32);
assert_eq!(count_collector.harvest(), 1);
}
{
let mut count_collector = SegmentCountCollector::default();
count_collector.collect(0u32, 1f32);
count_collector.collect(1u32, 1f32);
assert_eq!(count_collector.harvest(), 2);
}
} }
} }

View File

@@ -1,25 +1,23 @@
use collector::Collector; use collector::Collector;
use collector::SegmentCollector;
use docset::SkipResult; use docset::SkipResult;
use fastfield::FacetReader; use fastfield::FacetReader;
use schema::Facet; use schema::Facet;
use schema::Field; use schema::Field;
use std::cell::UnsafeCell; use std::cmp::Ordering;
use std::collections::btree_map; use std::collections::btree_map;
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::collections::BTreeSet; use std::collections::BTreeSet;
use std::collections::BinaryHeap; use std::collections::BinaryHeap;
use std::collections::Bound; use std::collections::Bound;
use std::iter::Peekable; use std::iter::Peekable;
use std::mem;
use std::{u64, usize}; use std::{u64, usize};
use termdict::TermMerger;
use std::cmp::Ordering;
use DocId; use DocId;
use Result; use Result;
use Score; use Score;
use SegmentLocalId; use SegmentLocalId;
use SegmentReader; use SegmentReader;
use TantivyError;
struct Hit<'a> { struct Hit<'a> {
count: u64, count: u64,
@@ -46,12 +44,6 @@ impl<'a> Ord for Hit<'a> {
} }
} }
struct SegmentFacetCounter {
pub facet_reader: FacetReader,
pub facet_ords: Vec<u64>,
pub facet_counts: Vec<u64>,
}
fn facet_depth(facet_bytes: &[u8]) -> usize { fn facet_depth(facet_bytes: &[u8]) -> usize {
if facet_bytes.is_empty() { if facet_bytes.is_empty() {
0 0
@@ -91,14 +83,14 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// ```rust /// ```rust
/// #[macro_use] /// #[macro_use]
/// extern crate tantivy; /// extern crate tantivy;
/// use tantivy::schema::{Facet, SchemaBuilder, TEXT}; /// use tantivy::schema::{Facet, Schema, TEXT};
/// use tantivy::{Index, Result}; /// use tantivy::{Index, Result};
/// use tantivy::collector::FacetCollector; /// use tantivy::collector::FacetCollector;
/// use tantivy::query::AllQuery; /// use tantivy::query::AllQuery;
/// ///
/// # fn main() { example().unwrap(); } /// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> { /// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new(); /// let mut schema_builder = Schema::builder();
/// ///
/// // Facet have their own specific type. /// // Facet have their own specific type.
/// // It is not a bad practise to put all of your /// // It is not a bad practise to put all of your
@@ -131,23 +123,19 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// facet => Facet::from("/lang/en"), /// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/biography") /// facet => Facet::from("/category/biography")
/// )); /// ));
/// index_writer.commit().unwrap(); /// index_writer.commit()?;
/// } /// }
/// /// let reader = index.reader()?;
/// index.load_searchers()?; /// let searcher = reader.searcher();
/// let searcher = index.searcher();
/// ///
/// { /// {
/// let mut facet_collector = FacetCollector::for_field(facet); /// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/lang"); /// facet_collector.add_facet("/lang");
/// facet_collector.add_facet("/category"); /// facet_collector.add_facet("/category");
/// searcher.search(&AllQuery, &mut facet_collector).unwrap(); /// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
///
/// // this object contains count aggregate for all of the facets.
/// let counts = facet_collector.harvest();
/// ///
/// // This lists all of the facet counts /// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = counts /// let facets: Vec<(&Facet, u64)> = facet_counts
/// .get("/category") /// .get("/category")
/// .collect(); /// .collect();
/// assert_eq!(facets, vec![ /// assert_eq!(facets, vec![
@@ -159,13 +147,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// { /// {
/// let mut facet_collector = FacetCollector::for_field(facet); /// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/category/fiction"); /// facet_collector.add_facet("/category/fiction");
/// searcher.search(&AllQuery, &mut facet_collector).unwrap(); /// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
///
/// // this object contains count aggregate for all of the facets.
/// let counts = facet_collector.harvest();
/// ///
/// // This lists all of the facet counts /// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = counts /// let facets: Vec<(&Facet, u64)> = facet_counts
/// .get("/category/fiction") /// .get("/category/fiction")
/// .collect(); /// .collect();
/// assert_eq!(facets, vec![ /// assert_eq!(facets, vec![
@@ -178,13 +163,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// { /// {
/// let mut facet_collector = FacetCollector::for_field(facet); /// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/category/fiction"); /// facet_collector.add_facet("/category/fiction");
/// searcher.search(&AllQuery, &mut facet_collector).unwrap(); /// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
///
/// // this object contains count aggregate for all of the facets.
/// let counts = facet_collector.harvest();
/// ///
/// // This lists all of the facet counts /// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = counts.top_k("/category/fiction", 1); /// let facets: Vec<(&Facet, u64)> = facet_counts.top_k("/category/fiction", 1);
/// assert_eq!(facets, vec![ /// assert_eq!(facets, vec![
/// (&Facet::from("/category/fiction/fantasy"), 2) /// (&Facet::from("/category/fiction/fantasy"), 2)
/// ]); /// ]);
@@ -194,28 +176,28 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// } /// }
/// ``` /// ```
pub struct FacetCollector { pub struct FacetCollector {
facet_ords: Vec<u64>,
field: Field, field: Field,
ff_reader: Option<UnsafeCell<FacetReader>>,
segment_counters: Vec<SegmentFacetCounter>,
// facet_ord -> collapse facet_id
current_segment_collapse_mapping: Vec<usize>,
// collapse facet_id -> count
current_segment_counts: Vec<u64>,
// collapse facet_id -> facet_ord
current_collapse_facet_ords: Vec<u64>,
facets: BTreeSet<Facet>, facets: BTreeSet<Facet>,
} }
/// Per-segment state of a facet collection: the segment's facet reader
/// plus the tables used to count documents per collapsed facet.
pub struct FacetSegmentCollector {
// Facet reader of the segment being collected.
reader: FacetReader,
// Scratch buffer handed to `reader.facet_ords(..)` for each collected
// document, so the facet ordinals are read into the same `Vec` every time.
facet_ords_buf: Vec<u64>,
// facet_ord -> collapse facet_id
collapse_mapping: Vec<usize>,
// collapse facet_id -> count
counts: Vec<u64>,
// collapse facet_id -> facet_ord
collapse_facet_ords: Vec<u64>,
}
fn skip<'a, I: Iterator<Item = &'a Facet>>( fn skip<'a, I: Iterator<Item = &'a Facet>>(
target: &[u8], target: &[u8],
collapse_it: &mut Peekable<I>, collapse_it: &mut Peekable<I>,
) -> SkipResult { ) -> SkipResult {
loop { loop {
match collapse_it.peek() { match collapse_it.peek() {
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) { Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
Ordering::Less => {} Ordering::Less => {}
Ordering::Greater => { Ordering::Greater => {
return SkipResult::OverStep; return SkipResult::OverStep;
@@ -240,15 +222,8 @@ impl FacetCollector {
/// is of the proper type. /// is of the proper type.
pub fn for_field(field: Field) -> FacetCollector { pub fn for_field(field: Field) -> FacetCollector {
FacetCollector { FacetCollector {
facet_ords: Vec::with_capacity(255),
segment_counters: Vec::new(),
field, field,
ff_reader: None, facets: BTreeSet::default(),
facets: BTreeSet::new(),
current_segment_collapse_mapping: Vec::new(),
current_collapse_facet_ords: Vec::new(),
current_segment_counts: Vec::new(),
} }
} }
@@ -278,143 +253,103 @@ impl FacetCollector {
} }
self.facets.insert(facet); self.facets.insert(facet);
} }
/// Rebuilds the per-segment collapse tables from the facet dictionary of
/// `facet_reader`.
///
/// The facet dictionary is streamed in order, and one entry is pushed to
/// `current_segment_collapse_mapping` per streamed term. Terms that are
/// not below one of the requested `self.facets` are mapped to the
/// collapsed id `0`. Terms exactly one level below a requested facet get a
/// fresh collapsed id, and deeper terms reuse the id of the most recent
/// such term.
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
// Start from a clean slate: these tables only describe one segment.
self.current_segment_collapse_mapping.clear();
self.current_collapse_facet_ords.clear();
self.current_segment_counts.clear();
let mut collapse_facet_it = self.facets.iter().peekable();
// Collapsed id 0 is taken up front, so ids handed out below start at 1.
self.current_collapse_facet_ords.push(0);
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if !facet_streamer.advance() {
// Empty facet dictionary: nothing to map.
return;
}
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
// The requested facet itself is mapped to 0.
self.current_segment_collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
// We left the subtree of the requested facet: the current
// term is unprocessed, so resume the outer loop on it.
continue 'outer;
}
if depth == collapse_depth + 1 {
// Direct child of the requested facet: allocate a new
// collapsed id and remember its term ordinal.
collapsed_id = self.current_collapse_facet_ords.len();
self.current_collapse_facet_ords
.push(facet_streamer.term_ord());
self.current_segment_collapse_mapping.push(collapsed_id);
} else {
// Deeper descendant: reuse the most recently assigned id.
self.current_segment_collapse_mapping.push(collapsed_id);
}
}
// The stream is exhausted.
break;
}
SkipResult::End | SkipResult::OverStep => {
// The current term does not match a requested facet.
self.current_segment_collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}
/// Archives the state of the segment currently being collected, if any.
///
/// When a facet reader is set, it is taken out of `self.ff_reader` and
/// stored in `self.segment_counters` together with the current collapsed
/// facet ordinals and counts, which are replaced by empty vectors.
/// Without a reader this is a no-op.
fn finalize_segment(&mut self) {
    if let Some(reader_cell) = self.ff_reader.take() {
        let facet_ords = mem::replace(&mut self.current_collapse_facet_ords, Vec::new());
        let facet_counts = mem::replace(&mut self.current_segment_counts, Vec::new());
        self.segment_counters.push(SegmentFacetCounter {
            facet_reader: reader_cell.into_inner(),
            facet_ords,
            facet_counts,
        });
    }
}
/// Returns the results of the collection.
///
/// This method does not just return the counters,
/// it also translates the facet ordinals of the last segment.
///
/// The facet dictionaries of all collected segments are merged, and for
/// each facet the counts recorded by every segment are summed. Only
/// facets with a non-zero total end up in the returned `FacetCounts`.
pub fn harvest(mut self) -> FacetCounts {
// Archive the segment that was still being collected.
self.finalize_segment();
// Per segment: collapsed facet id -> facet ordinal.
let collapsed_facet_ords: Vec<&[u64]> = self
.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_ords[..])
.collect();
// Per segment: collapsed facet id -> count.
let collapsed_facet_counts: Vec<&[u64]> = self
.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_counts[..])
.collect();
let facet_streams = self
.segment_counters
.iter()
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
.collect::<Vec<_>>();
let mut facet_merger = TermMerger::new(facet_streams);
let mut facet_counts = BTreeMap::new();
while facet_merger.advance() {
// Sum, over the segments listed by `current_kvs()`, the count recorded
// for the current term.
let count = facet_merger
.current_kvs()
.iter()
.map(|it| {
let seg_ord = it.segment_ord;
let term_ord = it.streamer.term_ord();
// Look the term ordinal up among the ordinals this segment
// counted; the position found is the collapsed facet id.
collapsed_facet_ords[seg_ord]
.binary_search(&term_ord)
.map(|collapsed_term_id| {
// Collapsed id 0 never contributes to the result.
if collapsed_term_id == 0 {
0
} else {
collapsed_facet_counts[seg_ord][collapsed_term_id]
}
}).unwrap_or(0)
}).sum();
if count > 0u64 {
let bytes: Vec<u8> = facet_merger.key().to_owned();
// may create a corrupted facet if the term dictionary is corrupted
let facet = unsafe { Facet::from_encoded(bytes) };
facet_counts.insert(facet, count);
}
}
FacetCounts { facet_counts }
}
} }
impl Collector for FacetCollector { impl Collector for FacetCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { type Fruit = FacetCounts;
self.finalize_segment();
let facet_reader = reader.facet_reader(self.field)?; type Child = FacetSegmentCollector;
self.set_collapse_mapping(&facet_reader);
self.current_segment_counts fn for_segment(
.resize(self.current_collapse_facet_ords.len(), 0); &self,
self.ff_reader = Some(UnsafeCell::new(facet_reader)); _: SegmentLocalId,
Ok(()) reader: &SegmentReader,
) -> Result<FacetSegmentCollector> {
let field_name = reader.schema().get_field_name(self.field);
let facet_reader = reader.facet_reader(self.field).ok_or_else(|| {
TantivyError::SchemaError(format!("Field {:?} is not a facet field.", field_name))
})?;
let mut collapse_mapping = Vec::new();
let mut counts = Vec::new();
let mut collapse_facet_ords = Vec::new();
let mut collapse_facet_it = self.facets.iter().peekable();
collapse_facet_ords.push(0);
{
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if facet_streamer.advance() {
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
continue 'outer;
}
if depth == collapse_depth + 1 {
collapsed_id = collapse_facet_ords.len();
collapse_facet_ords.push(facet_streamer.term_ord());
collapse_mapping.push(collapsed_id);
} else {
collapse_mapping.push(collapsed_id);
}
}
break;
}
SkipResult::End | SkipResult::OverStep => {
collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}
}
counts.resize(collapse_facet_ords.len(), 0);
Ok(FacetSegmentCollector {
reader: facet_reader,
facet_ords_buf: Vec::with_capacity(255),
collapse_mapping,
counts,
collapse_facet_ords,
})
} }
fn requires_scoring(&self) -> bool {
false
}
/// Combines the facet counts harvested from each segment into a single
/// `FacetCounts`, adding up the counts of facets seen in several segments.
fn merge_fruits(&self, segments_facet_counts: Vec<FacetCounts>) -> Result<FacetCounts> {
    let mut facet_counts: BTreeMap<Facet, u64> = BTreeMap::new();
    // Flatten the per-segment maps into one stream of (facet, count) pairs.
    let per_segment_entries = segments_facet_counts
        .into_iter()
        .flat_map(|segment_facet_counts| segment_facet_counts.facet_counts);
    for (facet, count) in per_segment_entries {
        *facet_counts.entry(facet).or_insert(0) += count;
    }
    Ok(FacetCounts { facet_counts })
}
}
impl SegmentCollector for FacetSegmentCollector {
type Fruit = FacetCounts;
fn collect(&mut self, doc: DocId, _: Score) { fn collect(&mut self, doc: DocId, _: Score) {
let facet_reader: &mut FacetReader = unsafe { self.reader.facet_ords(doc, &mut self.facet_ords_buf);
&mut *self
.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.get()
};
facet_reader.facet_ords(doc, &mut self.facet_ords);
let mut previous_collapsed_ord: usize = usize::MAX; let mut previous_collapsed_ord: usize = usize::MAX;
for &facet_ord in &self.facet_ords { for &facet_ord in &self.facet_ords_buf {
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize]; let collapsed_ord = self.collapse_mapping[facet_ord as usize];
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
{
0 0
} else { } else {
1 1
@@ -423,8 +358,24 @@ impl Collector for FacetCollector {
} }
} }
fn requires_scoring(&self) -> bool { /// Returns the results of the collection.
false ///
/// This method does not just return the counters,
/// it also translates the facet ordinals of the last segment.
fn harvest(self) -> FacetCounts {
let mut facet_counts = BTreeMap::new();
let facet_dict = self.reader.facet_dict();
for (collapsed_facet_ord, count) in self.counts.iter().cloned().enumerate() {
if count == 0 {
continue;
}
let mut facet = vec![];
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
// TODO
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
}
FacetCounts { facet_counts }
} }
} }
@@ -456,9 +407,9 @@ impl FacetCounts {
let right_bound = if facet.is_root() { let right_bound = if facet.is_root() {
Bound::Unbounded Bound::Unbounded
} else { } else {
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned(); let mut facet_after_bytes: String = facet.encoded_str().to_owned();
facet_after_bytes.push(1u8); facet_after_bytes.push('\u{1}');
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic let facet_after = Facet::from_encoded_string(facet_after_bytes);
Bound::Excluded(facet_after) Bound::Excluded(facet_after)
}; };
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound)); let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
@@ -505,14 +456,14 @@ mod tests {
use core::Index; use core::Index;
use query::AllQuery; use query::AllQuery;
use rand::distributions::Uniform; use rand::distributions::Uniform;
use rand::prelude::SliceRandom;
use rand::{thread_rng, Rng}; use rand::{thread_rng, Rng};
use schema::Field; use schema::{Document, Facet, Field, Schema};
use schema::{Document, Facet, SchemaBuilder};
use std::iter; use std::iter;
#[test] #[test]
fn test_facet_collector_drilldown() { fn test_facet_collector_drilldown() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet"); let facet_field = schema_builder.add_facet_field("facet");
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -527,21 +478,20 @@ mod tests {
n /= 4; n /= 4;
let leaf = n % 5; let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf)) Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
}).collect(); })
.collect();
for i in 0..num_facets * 10 { for i in 0..num_facets * 10 {
let mut doc = Document::new(); let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone()); doc.add_facet(facet_field, facets[i % num_facets].clone());
index_writer.add_document(doc); index_writer.add_document(doc);
} }
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); let reader = index.reader().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field); let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top1")); facet_collector.add_facet(Facet::from("/top1"));
searcher.search(&AllQuery, &mut facet_collector).unwrap(); let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
let counts: FacetCounts = facet_collector.harvest();
{ {
let facets: Vec<(String, u64)> = counts let facets: Vec<(String, u64)> = counts
.get("/top1") .get("/top1")
@@ -555,18 +505,16 @@ mod tests {
("/top1/mid2", 50), ("/top1/mid2", 50),
("/top1/mid3", 50), ("/top1/mid3", 50),
] ]
.iter() .iter()
.map(|&(facet_str, count)| (String::from(facet_str), count)) .map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>() .collect::<Vec<_>>()
); );
} }
} }
#[test] #[test]
#[should_panic( #[should_panic(expected = "Tried to add a facet which is a descendant of \
expected = "Tried to add a facet which is a descendant of \ an already added facet.")]
an already added facet."
)]
fn test_misused_facet_collector() { fn test_misused_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0)); let mut facet_collector = FacetCollector::for_field(Field(0));
facet_collector.add_facet(Facet::from("/country")); facet_collector.add_facet(Facet::from("/country"));
@@ -575,7 +523,7 @@ mod tests {
#[test] #[test]
fn test_doc_unsorted_multifacet() { fn test_doc_unsorted_multifacet() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets"); let facet_field = schema_builder.add_facet_field("facets");
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -587,13 +535,12 @@ mod tests {
facet_field => Facet::from_text(&"/subjects/B/b"), facet_field => Facet::from_text(&"/subjects/B/b"),
)); ));
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); let reader = index.reader().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1); assert_eq!(searcher.num_docs(), 1);
let mut facet_collector = FacetCollector::for_field(facet_field); let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/subjects"); facet_collector.add_facet("/subjects");
searcher.search(&AllQuery, &mut facet_collector).unwrap(); let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
let counts = facet_collector.harvest();
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect(); let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
assert_eq!(facets[0].1, 1); assert_eq!(facets[0].1, 1);
} }
@@ -607,7 +554,7 @@ mod tests {
#[test] #[test]
fn test_facet_collector_topk() { fn test_facet_collector_topk() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet"); let facet_field = schema_builder.add_facet_field("facet");
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -619,29 +566,28 @@ mod tests {
let facet = Facet::from(&format!("/facet/{}", c)); let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet); let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count) iter::repeat(doc).take(count)
}).map(|mut doc| { })
.map(|mut doc| {
doc.add_facet( doc.add_facet(
facet_field, facet_field,
&format!("/facet/{}", thread_rng().sample(&uniform)), &format!("/facet/{}", thread_rng().sample(&uniform)),
); );
doc doc
}).collect(); })
thread_rng().shuffle(&mut docs[..]); .collect();
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for doc in docs { for doc in docs {
index_writer.add_document(doc); index_writer.add_document(doc);
} }
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field); let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/facet"); facet_collector.add_facet("/facet");
searcher.search(&AllQuery, &mut facet_collector).unwrap(); let counts: FacetCounts = searcher.search(&AllQuery, &facet_collector).unwrap();
let counts: FacetCounts = facet_collector.harvest();
{ {
let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 3); let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 3);
assert_eq!( assert_eq!(
@@ -664,13 +610,13 @@ mod bench {
use query::AllQuery; use query::AllQuery;
use rand::{thread_rng, Rng}; use rand::{thread_rng, Rng};
use schema::Facet; use schema::Facet;
use schema::SchemaBuilder; use schema::Schema;
use test::Bencher; use test::Bencher;
use Index; use Index;
#[bench] #[bench]
fn bench_facet_collector(b: &mut Bencher) { fn bench_facet_collector(b: &mut Bencher) {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet"); let facet_field = schema_builder.add_facet_field("facet");
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -690,12 +636,11 @@ mod bench {
index_writer.add_document(doc); index_writer.add_document(doc);
} }
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); let reader = index.reader().unwrap();
b.iter(|| { b.iter(|| {
let searcher = index.searcher(); let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field); let facet_collector = FacetCollector::for_field(facet_field);
searcher.search(&AllQuery, &mut facet_collector).unwrap(); searcher.search(&AllQuery, &facet_collector).unwrap();
}); });
} }
} }

View File

@@ -79,7 +79,7 @@ mod tests {
// make sure we have facet counters correctly filled // make sure we have facet counters correctly filled
fn test_facet_collector_results() { fn test_facet_collector_results() {
let mut schema_builder = schema::SchemaBuilder::new(); let mut schema_builder = schema::Schema::builder();
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST); let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST); let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
let text_field = schema_builder.add_text_field("text", STRING); let text_field = schema_builder.add_text_field("text", STRING);
@@ -88,7 +88,7 @@ mod tests {
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
for i in 0u64..10u64 { for i in 0u64..10u64 {
index_writer.add_document(doc!( index_writer.add_document(doc!(
@@ -101,8 +101,7 @@ mod tests {
assert_eq!(index_writer.commit().unwrap(), 10u64); assert_eq!(index_writer.commit().unwrap(), 10u64);
} }
index.load_searchers().unwrap(); let searcher = index.reader().searcher();
let searcher = index.searcher();
let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64); let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64); let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);

View File

@@ -1,7 +1,91 @@
/*! /*!
Defines how the documents matching a search query should be processed.
# Collectors
Collectors define the information you want to extract from the documents matching the queries.
In tantivy jargon, we call this information your search "fruit".
Your fruit could for instance be :
- [the count of matching documents](./struct.Count.html)
- [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html)
- [facet counts](./struct.FacetCollector.html)
At one point in your code, you will trigger the actual search operation by calling
[the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search).
This call will look like this.
```verbatim
let fruit = searcher.search(&query, &collector)?;
```
Here the type of fruit is actually determined as an associated type of the collector (`Collector::Fruit`).
# Combining several collectors
A rich search experience often requires to run several collectors on your search query.
For instance,
- selecting the top-K products matching your query
- counting the matching documents
- computing several facets
- computing statistics about the matching product prices
A simple and efficient way to do that is to pass your collectors as one tuple.
The resulting `Fruit` will then be a typed tuple with each collector's original fruits
in their respective position.
```rust
# extern crate tantivy;
# use tantivy::schema::*;
# use tantivy::*;
# use tantivy::query::*;
use tantivy::collector::{Count, TopDocs};
#
# fn main() -> tantivy::Result<()> {
# let mut schema_builder = Schema::builder();
# let title = schema_builder.add_text_field("title", TEXT);
# let schema = schema_builder.build();
# let index = Index::create_in_ram(schema);
# let mut index_writer = index.writer(3_000_000)?;
# index_writer.add_document(doc!(
# title => "The Name of the Wind",
# ));
# index_writer.add_document(doc!(
# title => "The Diary of Muadib",
# ));
# index_writer.commit()?;
# let reader = index.reader()?;
# let searcher = reader.searcher();
# let query_parser = QueryParser::for_index(&index, vec![title]);
# let query = query_parser.parse_query("diary")?;
let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
# Ok(())
# }
```
The `Collector` trait is implemented for up to 4 collectors.
If you have more than 4 collectors, you can either group them into
tuples of tuples `(a,(b,(c,d)))`, or rely on `MultiCollector`'s.
# Combining several collectors dynamically
Combining collectors into a tuple is a zero-cost abstraction: everything
happens as if you had manually implemented a single collector
combining all of our features.
Unfortunately it requires you to know at compile time your collector types.
If on the other hand, the collectors depend on some query parameter,
you can rely on `MultiCollector`'s.
# Implementing your own collectors.
See the `custom_collector` example.
*/ */
use downcast_rs;
use DocId; use DocId;
use Result; use Result;
use Score; use Score;
@@ -9,7 +93,7 @@ use SegmentLocalId;
use SegmentReader; use SegmentReader;
mod count_collector; mod count_collector;
pub use self::count_collector::CountCollector; pub use self::count_collector::Count;
mod multi_collector; mod multi_collector;
pub use self::multi_collector::MultiCollector; pub use self::multi_collector::MultiCollector;
@@ -17,237 +101,264 @@ pub use self::multi_collector::MultiCollector;
mod top_collector; mod top_collector;
mod top_score_collector; mod top_score_collector;
pub use self::top_score_collector::TopScoreCollector; pub use self::top_score_collector::TopDocs;
#[deprecated]
pub use self::top_score_collector::TopScoreCollector as TopCollector;
mod top_field_collector; mod top_field_collector;
pub use self::top_field_collector::TopFieldCollector; pub use self::top_field_collector::TopDocsByField;
mod facet_collector; mod facet_collector;
pub use self::facet_collector::FacetCollector; pub use self::facet_collector::FacetCollector;
mod chained_collector; /// `Fruit` is the type for the result of our collection.
pub use self::chained_collector::{chain, ChainedCollector}; /// e.g. `usize` for the `Count` collector.
pub trait Fruit: Send + downcast_rs::Downcast {}
impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
/// Collectors are in charge of collecting and retaining relevant /// Collectors are in charge of collecting and retaining relevant
/// information from the document found and scored by the query. /// information from the document found and scored by the query.
/// ///
///
/// For instance, /// For instance,
/// ///
/// - keeping track of the top 10 best documents /// - keeping track of the top 10 best documents
/// - computing a breakdown over a fast field /// - computing a breakdown over a fast field
/// - computing the number of documents matching the query /// - computing the number of documents matching the query
/// ///
/// Queries are in charge of pushing the `DocSet` to the collector. /// Our search index is in fact a collection of segments, so
/// a `Collector` trait is actually more of a factory to instance
/// `SegmentCollector`s for each segments.
/// ///
/// As they work on multiple segments, they first inform /// The collection logic itself is in the `SegmentCollector`.
/// the collector of a change in a segment and then
/// call the `collect` method to push the document to the collector.
///
/// Temporally, our collector will receive calls
/// - `.set_segment(0, segment_reader_0)`
/// - `.collect(doc0_of_segment_0)`
/// - `.collect(...)`
/// - `.collect(last_doc_of_segment_0)`
/// - `.set_segment(1, segment_reader_1)`
/// - `.collect(doc0_of_segment_1)`
/// - `.collect(...)`
/// - `.collect(last_doc_of_segment_1)`
/// - `...`
/// - `.collect(last_doc_of_last_segment)`
/// ///
/// Segments are not guaranteed to be visited in any specific order. /// Segments are not guaranteed to be visited in any specific order.
pub trait Collector { pub trait Collector: Sync {
/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.
type Fruit: Fruit;
/// Type of the `SegmentCollector` associated to this collector.
type Child: SegmentCollector<Fruit = Self::Fruit>;
/// `set_segment` is called before beginning to enumerate /// `set_segment` is called before beginning to enumerate
/// on this segment. /// on this segment.
fn set_segment( fn for_segment(
&mut self, &self,
segment_local_id: SegmentLocalId, segment_local_id: SegmentLocalId,
segment: &SegmentReader, segment: &SegmentReader,
) -> Result<()>; ) -> Result<Self::Child>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
/// Returns true iff the collector requires to compute scores for documents. /// Returns true iff the collector requires to compute scores for documents.
fn requires_scoring(&self) -> bool; fn requires_scoring(&self) -> bool;
/// Combines the fruit associated to the collection of each segments
/// into one fruit.
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit>;
} }
impl<'a, C: Collector> Collector for &'a mut C { /// The `SegmentCollector` is the trait in charge of defining the
fn set_segment( /// collect operation at the scale of the segment.
&mut self, ///
segment_local_id: SegmentLocalId, /// `.collect(doc, score)` will be called for every documents
segment: &SegmentReader, /// matching the query.
) -> Result<()> { pub trait SegmentCollector: 'static {
(*self).set_segment(segment_local_id, segment) /// `Fruit` is the type for the result of our collection.
} /// e.g. `usize` for the `Count` collector.
type Fruit: Fruit;
/// The query pushes the scored document to the collector via this method. /// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score) { fn collect(&mut self, doc: DocId, score: Score);
C::collect(self, doc, score)
/// Extract the fruit of the collection from the `SegmentCollector`.
fn harvest(self) -> Self::Fruit;
}
// -----------------------------------------------
// Tuple implementations.
impl<Left, Right> Collector for (Left, Right)
where
Left: Collector,
Right: Collector,
{
type Fruit = (Left::Fruit, Right::Fruit);
type Child = (Left::Child, Right::Child);
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
let left = self.0.for_segment(segment_local_id, segment)?;
let right = self.1.for_segment(segment_local_id, segment)?;
Ok((left, right))
} }
fn requires_scoring(&self) -> bool { fn requires_scoring(&self) -> bool {
C::requires_scoring(self) self.0.requires_scoring() || self.1.requires_scoring()
}
fn merge_fruits(
&self,
children: Vec<(Left::Fruit, Right::Fruit)>,
) -> Result<(Left::Fruit, Right::Fruit)> {
let mut left_fruits = vec![];
let mut right_fruits = vec![];
for (left_fruit, right_fruit) in children {
left_fruits.push(left_fruit);
right_fruits.push(right_fruit);
}
Ok((
self.0.merge_fruits(left_fruits)?,
self.1.merge_fruits(right_fruits)?,
))
} }
} }
impl<Left, Right> SegmentCollector for (Left, Right)
where
Left: SegmentCollector,
Right: SegmentCollector,
{
type Fruit = (Left::Fruit, Right::Fruit);
fn collect(&mut self, doc: DocId, score: Score) {
self.0.collect(doc, score);
self.1.collect(doc, score);
}
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
(self.0.harvest(), self.1.harvest())
}
}
// 3-Tuple
impl<One, Two, Three> Collector for (One, Two, Three)
where
One: Collector,
Two: Collector,
Three: Collector,
{
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
type Child = (One::Child, Two::Child, Three::Child);
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
let one = self.0.for_segment(segment_local_id, segment)?;
let two = self.1.for_segment(segment_local_id, segment)?;
let three = self.2.for_segment(segment_local_id, segment)?;
Ok((one, two, three))
}
fn requires_scoring(&self) -> bool {
self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring()
}
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
let mut one_fruits = vec![];
let mut two_fruits = vec![];
let mut three_fruits = vec![];
for (one_fruit, two_fruit, three_fruit) in children {
one_fruits.push(one_fruit);
two_fruits.push(two_fruit);
three_fruits.push(three_fruit);
}
Ok((
self.0.merge_fruits(one_fruits)?,
self.1.merge_fruits(two_fruits)?,
self.2.merge_fruits(three_fruits)?,
))
}
}
impl<One, Two, Three> SegmentCollector for (One, Two, Three)
where
One: SegmentCollector,
Two: SegmentCollector,
Three: SegmentCollector,
{
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
fn collect(&mut self, doc: DocId, score: Score) {
self.0.collect(doc, score);
self.1.collect(doc, score);
self.2.collect(doc, score);
}
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
(self.0.harvest(), self.1.harvest(), self.2.harvest())
}
}
// 4-Tuple
impl<One, Two, Three, Four> Collector for (One, Two, Three, Four)
where
One: Collector,
Two: Collector,
Three: Collector,
Four: Collector,
{
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
type Child = (One::Child, Two::Child, Three::Child, Four::Child);
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
let one = self.0.for_segment(segment_local_id, segment)?;
let two = self.1.for_segment(segment_local_id, segment)?;
let three = self.2.for_segment(segment_local_id, segment)?;
let four = self.3.for_segment(segment_local_id, segment)?;
Ok((one, two, three, four))
}
fn requires_scoring(&self) -> bool {
self.0.requires_scoring()
|| self.1.requires_scoring()
|| self.2.requires_scoring()
|| self.3.requires_scoring()
}
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
let mut one_fruits = vec![];
let mut two_fruits = vec![];
let mut three_fruits = vec![];
let mut four_fruits = vec![];
for (one_fruit, two_fruit, three_fruit, four_fruit) in children {
one_fruits.push(one_fruit);
two_fruits.push(two_fruit);
three_fruits.push(three_fruit);
four_fruits.push(four_fruit);
}
Ok((
self.0.merge_fruits(one_fruits)?,
self.1.merge_fruits(two_fruits)?,
self.2.merge_fruits(three_fruits)?,
self.3.merge_fruits(four_fruits)?,
))
}
}
impl<One, Two, Three, Four> SegmentCollector for (One, Two, Three, Four)
where
One: SegmentCollector,
Two: SegmentCollector,
Three: SegmentCollector,
Four: SegmentCollector,
{
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
fn collect(&mut self, doc: DocId, score: Score) {
self.0.collect(doc, score);
self.1.collect(doc, score);
self.2.collect(doc, score);
self.3.collect(doc, score);
}
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
(
self.0.harvest(),
self.1.harvest(),
self.2.harvest(),
self.3.harvest(),
)
}
}
impl_downcast!(Fruit);
#[cfg(test)] #[cfg(test)]
pub mod tests { pub mod tests;
use super::*;
use core::SegmentReader;
use fastfield::BytesFastFieldReader;
use fastfield::FastFieldReader;
use schema::Field;
use DocId;
use Score;
use SegmentLocalId;
/// Stores all of the doc ids.
/// This collector is only used for tests.
/// It is unusable in practise, as it does not store
/// the segment ordinals
pub struct TestCollector {
offset: DocId,
segment_max_doc: DocId,
docs: Vec<DocId>,
scores: Vec<Score>,
}
impl TestCollector {
/// Return the exhalist of documents.
pub fn docs(self) -> Vec<DocId> {
self.docs
}
pub fn scores(self) -> Vec<Score> {
self.scores
}
}
impl Default for TestCollector {
fn default() -> TestCollector {
TestCollector {
offset: 0,
segment_max_doc: 0,
docs: Vec::new(),
scores: Vec::new(),
}
}
}
impl Collector for TestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
Ok(())
}
fn collect(&mut self, doc: DocId, score: Score) {
self.docs.push(doc + self.offset);
self.scores.push(score);
}
fn requires_scoring(&self) -> bool {
true
}
}
/// Collects in order all of the fast fields for all of the
/// doc in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
vals: Vec<u64>,
field: Field,
ff_reader: Option<FastFieldReader<u64>>,
}
impl FastFieldTestCollector {
pub fn for_field(field: Field) -> FastFieldTestCollector {
FastFieldTestCollector {
vals: Vec::new(),
field,
ff_reader: None,
}
}
pub fn vals(self) -> Vec<u64> {
self.vals
}
}
impl Collector for FastFieldTestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
Ok(())
}
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.ff_reader.as_ref().unwrap().get(doc);
self.vals.push(val);
}
fn requires_scoring(&self) -> bool {
false
}
}
/// Collects in order all of the fast field bytes for all of the
/// docs in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct BytesFastFieldTestCollector {
vals: Vec<u8>,
field: Field,
ff_reader: Option<BytesFastFieldReader>,
}
impl BytesFastFieldTestCollector {
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
BytesFastFieldTestCollector {
vals: Vec::new(),
field,
ff_reader: None,
}
}
pub fn vals(self) -> Vec<u8> {
self.vals
}
}
impl Collector for BytesFastFieldTestCollector {
fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
Ok(())
}
fn collect(&mut self, doc: u32, _score: f32) {
let val = self.ff_reader.as_ref().unwrap().get_val(doc);
self.vals.extend(val);
}
fn requires_scoring(&self) -> bool {
false
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use collector::{Collector, CountCollector};
use test::Bencher;
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::default();
let docs: Vec<u32> = (0..1_000_000).collect();
for doc in docs {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
}
}

View File

@@ -1,26 +1,120 @@
use super::Collector; use super::Collector;
use super::SegmentCollector;
use collector::Fruit;
use std::marker::PhantomData;
use std::ops::Deref;
use DocId; use DocId;
use Result; use Result;
use Score; use Score;
use SegmentLocalId; use SegmentLocalId;
use SegmentReader; use SegmentReader;
use TantivyError;
pub struct MultiFruit {
sub_fruits: Vec<Option<Box<Fruit>>>,
}
pub struct CollectorWrapper<TCollector: Collector>(TCollector);
impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
type Fruit = Box<Fruit>;
type Child = Box<BoxableSegmentCollector>;
fn for_segment(
&self,
segment_local_id: u32,
reader: &SegmentReader,
) -> Result<Box<BoxableSegmentCollector>> {
let child = self.0.for_segment(segment_local_id, reader)?;
Ok(Box::new(SegmentCollectorWrapper(child)))
}
fn requires_scoring(&self) -> bool {
self.0.requires_scoring()
}
fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<Fruit>> {
let typed_fruit: Vec<TCollector::Fruit> = children
.into_iter()
.map(|untyped_fruit| {
untyped_fruit
.downcast::<TCollector::Fruit>()
.map(|boxed_but_typed| *boxed_but_typed)
.map_err(|_| {
TantivyError::InvalidArgument("Failed to cast child fruit.".to_string())
})
})
.collect::<Result<_>>()?;
let merged_fruit = self.0.merge_fruits(typed_fruit)?;
Ok(Box::new(merged_fruit))
}
}
impl SegmentCollector for Box<BoxableSegmentCollector> {
type Fruit = Box<Fruit>;
fn collect(&mut self, doc: u32, score: f32) {
self.as_mut().collect(doc, score);
}
fn harvest(self) -> Box<Fruit> {
BoxableSegmentCollector::harvest_from_box(self)
}
}
pub trait BoxableSegmentCollector {
fn collect(&mut self, doc: u32, score: f32);
fn harvest_from_box(self: Box<Self>) -> Box<Fruit>;
}
pub struct SegmentCollectorWrapper<TSegmentCollector: SegmentCollector>(TSegmentCollector);
impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
for SegmentCollectorWrapper<TSegmentCollector>
{
fn collect(&mut self, doc: u32, score: f32) {
self.0.collect(doc, score);
}
fn harvest_from_box(self: Box<Self>) -> Box<Fruit> {
Box::new(self.0.harvest())
}
}
pub struct FruitHandle<TFruit: Fruit> {
pos: usize,
_phantom: PhantomData<TFruit>,
}
impl<TFruit: Fruit> FruitHandle<TFruit> {
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
*boxed_fruit
.downcast::<TFruit>()
.map_err(|_| ())
.expect("Failed to downcast collector fruit.")
}
}
/// Multicollector makes it possible to collect on more than one collector. /// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector types is unknown /// It should only be used for use cases where the Collector types is unknown
/// at compile time. /// at compile time.
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`. ///
/// If the type of the collectors is known, you can just group yours collectors
/// in a tuple. See the
/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors).
/// ///
/// ```rust /// ```rust
/// #[macro_use] /// #[macro_use]
/// extern crate tantivy; /// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT}; /// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{Index, Result}; /// use tantivy::{Index, Result};
/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector}; /// use tantivy::collector::{Count, TopDocs, MultiCollector};
/// use tantivy::query::QueryParser; /// use tantivy::query::QueryParser;
/// ///
/// # fn main() { example().unwrap(); } /// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> { /// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
@@ -41,58 +135,121 @@ use SegmentReader;
/// index_writer.commit().unwrap(); /// index_writer.commit().unwrap();
/// } /// }
/// ///
/// index.load_searchers()?; /// let reader = index.reader()?;
/// let searcher = index.searcher(); /// let searcher = reader.searcher();
/// ///
/// { /// let mut collectors = MultiCollector::new();
/// let mut top_collector = TopCollector::with_limit(2); /// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
/// let mut count_collector = CountCollector::default(); /// let count_handle = collectors.add_collector(Count);
/// { /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let mut collectors = /// let query = query_parser.parse_query("diary")?;
/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]); /// let mut multi_fruit = searcher.search(&query, &collectors)?;
/// let query_parser = QueryParser::for_index(&index, vec![title]); ///
/// let query = query_parser.parse_query("diary")?; /// let count = count_handle.extract(&mut multi_fruit);
/// searcher.search(&*query, &mut collectors).unwrap(); /// let top_docs = top_docs_handle.extract(&mut multi_fruit);
/// } ///
/// assert_eq!(count_collector.count(), 2); /// # assert_eq!(count, 2);
/// assert!(top_collector.at_capacity()); /// # assert_eq!(top_docs.len(), 2);
/// }
/// ///
/// Ok(()) /// Ok(())
/// } /// }
/// ``` /// ```
#[allow(clippy::type_complexity)]
#[derive(Default)]
pub struct MultiCollector<'a> { pub struct MultiCollector<'a> {
collectors: Vec<&'a mut Collector>, collector_wrappers:
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
} }
impl<'a> MultiCollector<'a> { impl<'a> MultiCollector<'a> {
/// Constructor /// Create a new `MultiCollector`
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector { pub fn new() -> Self {
MultiCollector { collectors } Default::default()
}
/// Add a new collector to our `MultiCollector`.
pub fn add_collector<'b: 'a, TCollector: Collector + 'b>(
&mut self,
collector: TCollector,
) -> FruitHandle<TCollector::Fruit> {
let pos = self.collector_wrappers.len();
self.collector_wrappers
.push(Box::new(CollectorWrapper(collector)));
FruitHandle {
pos,
_phantom: PhantomData,
}
} }
} }
impl<'a> Collector for MultiCollector<'a> { impl<'a> Collector for MultiCollector<'a> {
fn set_segment( type Fruit = MultiFruit;
&mut self, type Child = MultiCollectorChild;
fn for_segment(
&self,
segment_local_id: SegmentLocalId, segment_local_id: SegmentLocalId,
segment: &SegmentReader, segment: &SegmentReader,
) -> Result<()> { ) -> Result<MultiCollectorChild> {
for collector in &mut self.collectors { let children = self
collector.set_segment(segment_local_id, segment)?; .collector_wrappers
} .iter()
Ok(()) .map(|collector_wrapper| collector_wrapper.for_segment(segment_local_id, segment))
.collect::<Result<Vec<_>>>()?;
Ok(MultiCollectorChild { children })
} }
fn requires_scoring(&self) -> bool {
self.collector_wrappers
.iter()
.map(Deref::deref)
.any(Collector::requires_scoring)
}
fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
let mut segment_fruits_list: Vec<Vec<Box<Fruit>>> = (0..self.collector_wrappers.len())
.map(|_| Vec::with_capacity(segments_multifruits.len()))
.collect::<Vec<_>>();
for segment_multifruit in segments_multifruits {
for (idx, segment_fruit_opt) in segment_multifruit.sub_fruits.into_iter().enumerate() {
if let Some(segment_fruit) = segment_fruit_opt {
segment_fruits_list[idx].push(segment_fruit);
}
}
}
let sub_fruits = self
.collector_wrappers
.iter()
.zip(segment_fruits_list)
.map(|(child_collector, segment_fruits)| {
Ok(Some(child_collector.merge_fruits(segment_fruits)?))
})
.collect::<Result<_>>()?;
Ok(MultiFruit { sub_fruits })
}
}
pub struct MultiCollectorChild {
children: Vec<Box<BoxableSegmentCollector>>,
}
impl SegmentCollector for MultiCollectorChild {
type Fruit = MultiFruit;
fn collect(&mut self, doc: DocId, score: Score) { fn collect(&mut self, doc: DocId, score: Score) {
for collector in &mut self.collectors { for child in &mut self.children {
collector.collect(doc, score); child.collect(doc, score);
} }
} }
fn requires_scoring(&self) -> bool {
self.collectors fn harvest(self) -> MultiFruit {
.iter() MultiFruit {
.any(|collector| collector.requires_scoring()) sub_fruits: self
.children
.into_iter()
.map(|child| Some(child.harvest()))
.collect(),
}
} }
} }
@@ -100,20 +257,41 @@ impl<'a> Collector for MultiCollector<'a> {
mod tests { mod tests {
use super::*; use super::*;
use collector::{Collector, CountCollector, TopScoreCollector}; use collector::{Count, TopDocs};
use query::TermQuery;
use schema::IndexRecordOption;
use schema::{Schema, TEXT};
use Index;
use Term;
#[test] #[test]
fn test_multi_collector() { fn test_multi_collector() {
let mut top_collector = TopScoreCollector::with_limit(2); let mut schema_builder = Schema::builder();
let mut count_collector = CountCollector::default(); let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{ {
let mut collectors = let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
MultiCollector::from(vec![&mut top_collector, &mut count_collector]); index_writer.add_document(doc!(text=>"abc"));
collectors.collect(1, 0.2); index_writer.add_document(doc!(text=>"abc abc abc"));
collectors.collect(2, 0.1); index_writer.add_document(doc!(text=>"abc abc"));
collectors.collect(3, 0.5); index_writer.commit().unwrap();
index_writer.add_document(doc!(text=>""));
index_writer.add_document(doc!(text=>"abc abc abc abc"));
index_writer.add_document(doc!(text=>"abc"));
index_writer.commit().unwrap();
} }
assert_eq!(count_collector.count(), 3); let searcher = index.reader().unwrap().searcher();
assert!(top_collector.at_capacity()); let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic);
let mut collectors = MultiCollector::new();
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
let count_handler = collectors.add_collector(Count);
let mut multifruits = searcher.search(&query, &mut collectors).unwrap();
assert_eq!(count_handler.extract(&mut multifruits), 5);
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
} }
} }

208
src/collector/tests.rs Normal file
View File

@@ -0,0 +1,208 @@
use super::*;
use core::SegmentReader;
use fastfield::BytesFastFieldReader;
use fastfield::FastFieldReader;
use schema::Field;
use DocAddress;
use DocId;
use Score;
use SegmentLocalId;
/// Stores all of the doc ids.
/// This collector is only used for tests.
///
/// NOTE(review): this comment used to say the collector is unusable in
/// practice because it does not store the segment ordinals. The segment
/// collector in this file does record the segment id inside each
/// `DocAddress`, so that remark looks stale — confirm before relying on it.
pub struct TestCollector;
/// Per-segment child created by `TestCollector::for_segment`.
pub struct TestSegmentCollector {
// Local id of the segment; combined with each doc id to build a `DocAddress`.
segment_id: SegmentLocalId,
// Documents and scores collected so far for this segment.
fruit: TestFruit,
}
/// Result produced by `TestCollector`: the collected documents and their scores.
#[derive(Default)]
pub struct TestFruit {
// Addresses of the collected documents.
docs: Vec<DocAddress>,
// Scores of the collected documents; the collectors in this file push to
// `docs` and `scores` together.
scores: Vec<Score>,
}
impl TestFruit {
    /// Returns the complete list of matching documents as a slice.
    pub fn docs(&self) -> &[DocAddress] {
        self.docs.as_slice()
    }

    /// Returns the recorded scores as a slice.
    pub fn scores(&self) -> &[Score] {
        self.scores.as_slice()
    }
}
impl Collector for TestCollector {
    type Fruit = TestFruit;
    type Child = TestSegmentCollector;

    /// Builds an empty segment collector that remembers `segment_id`.
    ///
    /// The segment reader itself is not used.
    fn for_segment(
        &self,
        segment_id: SegmentLocalId,
        _reader: &SegmentReader,
    ) -> Result<TestSegmentCollector> {
        let fruit = TestFruit::default();
        Ok(TestSegmentCollector { segment_id, fruit })
    }

    /// This collector always asks for scores.
    fn requires_scoring(&self) -> bool {
        true
    }

    /// Concatenates the per-segment fruits into a single fruit.
    ///
    /// Fruits are first ordered by the segment ordinal of their first
    /// document (an empty fruit is keyed as `0`), then their documents and
    /// scores are appended in that order.
    fn merge_fruits(&self, mut children: Vec<TestFruit>) -> Result<TestFruit> {
        children.sort_by_key(|fruit| {
            fruit
                .docs()
                .first()
                .map_or(0, |first_doc| first_doc.segment_ord())
        });
        let mut merged = TestFruit::default();
        for TestFruit { docs, scores } in children {
            merged.docs.extend(docs);
            merged.scores.extend(scores);
        }
        Ok(merged)
    }
}
impl SegmentCollector for TestSegmentCollector {
type Fruit = TestFruit;
fn collect(&mut self, doc: DocId, score: Score) {
self.fruit.docs.push(DocAddress(self.segment_id, doc));
self.fruit.scores.push(score);
}
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
self.fruit
}
}
/// Collects, in order, the `u64` fast field value of every collected
/// document.
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
// Field whose `u64` fast field reader is requested for each segment.
field: Field,
}
/// Per-segment child created by `FastFieldTestCollector::for_segment`.
pub struct FastFieldSegmentCollector {
// Fast field values read so far, in collection order.
vals: Vec<u64>,
// Fast field reader for the segment being collected.
reader: FastFieldReader<u64>,
}
impl FastFieldTestCollector {
pub fn for_field(field: Field) -> FastFieldTestCollector {
FastFieldTestCollector { field }
}
}
impl Collector for FastFieldTestCollector {
    type Fruit = Vec<u64>;
    type Child = FastFieldSegmentCollector;

    /// Opens the `u64` fast field reader of the segment for the configured
    /// field.
    ///
    /// # Panics
    /// Panics if no `u64` fast field reader is available for the field.
    fn for_segment(
        &self,
        _: SegmentLocalId,
        segment_reader: &SegmentReader,
    ) -> Result<FastFieldSegmentCollector> {
        let maybe_reader = segment_reader.fast_fields().u64(self.field);
        let reader = maybe_reader.expect("Requested field is not a fast field.");
        let vals = Vec::new();
        Ok(FastFieldSegmentCollector { vals, reader })
    }

    /// Scores are ignored by this collector.
    fn requires_scoring(&self) -> bool {
        false
    }

    /// Concatenates the per-segment value lists, in the order given.
    fn merge_fruits(&self, children: Vec<Vec<u64>>) -> Result<Vec<u64>> {
        let mut merged = Vec::new();
        for segment_vals in children {
            merged.extend(segment_vals);
        }
        Ok(merged)
    }
}
impl SegmentCollector for FastFieldSegmentCollector {
type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get(doc);
self.vals.push(val);
}
fn harvest(self) -> Vec<u64> {
self.vals
}
}
/// Collects, in order, the fast field bytes of every collected document
/// into a single byte buffer.
///
/// This collector is mainly useful for tests.
pub struct BytesFastFieldTestCollector {
// Field whose bytes fast field reader is requested for each segment.
field: Field,
}
/// Per-segment child created by `BytesFastFieldTestCollector::for_segment`.
pub struct BytesFastFieldSegmentCollector {
// Bytes read so far; each document's bytes are appended back to back.
vals: Vec<u8>,
// Bytes fast field reader for the segment being collected.
reader: BytesFastFieldReader,
}
impl BytesFastFieldTestCollector {
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
BytesFastFieldTestCollector { field }
}
}
impl Collector for BytesFastFieldTestCollector {
type Fruit = Vec<u8>;
type Child = BytesFastFieldSegmentCollector;
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &SegmentReader,
) -> Result<BytesFastFieldSegmentCollector> {
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader: segment_reader
.fast_fields()
.bytes(self.field)
.expect("Field is not a bytes fast field."),
})
}
fn requires_scoring(&self) -> bool {
false
}
fn merge_fruits(&self, children: Vec<Vec<u8>>) -> Result<Vec<u8>> {
Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
}
}
impl SegmentCollector for BytesFastFieldSegmentCollector {
type Fruit = Vec<u8>;
fn collect(&mut self, doc: u32, _score: f32) {
let data = self.reader.get_bytes(doc);
self.vals.extend(data);
}
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
self.vals
}
}

View File

@@ -1,56 +1,59 @@
use serde::export::PhantomData;
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::BinaryHeap; use std::collections::BinaryHeap;
use DocAddress; use DocAddress;
use DocId; use DocId;
use Result;
use SegmentLocalId; use SegmentLocalId;
use SegmentReader;
/// Contains a feature (field, score, etc.) of a document along with the document address. /// Contains a feature (field, score, etc.) of a document along with the document address.
/// ///
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the /// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
/// default Rust heap is a max heap, whereas a min heap is needed. /// default Rust heap is a max heap, whereas a min heap is needed.
#[derive(Clone, Copy)] ///
pub struct ComparableDoc<T> { /// WARNING: equality is not what you would expect here.
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
/// is equal. This should be perfectly fine for this usage, but let's make sure this
/// struct is never public.
struct ComparableDoc<T, D> {
feature: T, feature: T,
doc_address: DocAddress, doc: D,
} }
impl<T: PartialOrd> PartialOrd for ComparableDoc<T> { impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> { fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other)) Some(self.cmp(other))
} }
} }
impl<T: PartialOrd> Ord for ComparableDoc<T> { impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
#[inline] #[inline]
fn cmp(&self, other: &Self) -> Ordering { fn cmp(&self, other: &Self) -> Ordering {
other other
.feature .feature
.partial_cmp(&self.feature) .partial_cmp(&self.feature)
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address)) .unwrap_or_else(|| Ordering::Equal)
} }
} }
impl<T: PartialOrd> PartialEq for ComparableDoc<T> { impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal self.cmp(other) == Ordering::Equal
} }
} }
impl<T: PartialOrd> Eq for ComparableDoc<T> {} impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}
/// The Top Collector keeps track of the K documents pub(crate) struct TopCollector<T> {
/// sorted by type `T`.
///
/// The implementation is based on a `BinaryHeap`.
/// The theorical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
pub struct TopCollector<T> {
limit: usize, limit: usize,
heap: BinaryHeap<ComparableDoc<T>>, _marker: PhantomData<T>,
segment_id: u32,
} }
impl<T: PartialOrd + Clone> TopCollector<T> { impl<T> TopCollector<T>
where
T: PartialOrd + Clone,
{
/// Creates a top collector, with a number of documents equal to "limit". /// Creates a top collector, with a number of documents equal to "limit".
/// ///
/// # Panics /// # Panics
@@ -61,127 +64,156 @@ impl<T: PartialOrd + Clone> TopCollector<T> {
} }
TopCollector { TopCollector {
limit, limit,
heap: BinaryHeap::with_capacity(limit), _marker: PhantomData,
segment_id: 0,
} }
} }
/// Returns K best documents sorted in decreasing order. pub fn limit(&self) -> usize {
/// self.limit
/// Calling this method triggers the sort.
/// The result of the sort is not cached.
pub fn docs(&self) -> Vec<DocAddress> {
self.top_docs()
.into_iter()
.map(|(_feature, doc)| doc)
.collect()
} }
/// Returns K best FeatureDocuments sorted in decreasing order. pub fn merge_fruits(
/// &self,
/// Calling this method triggers the sort. children: Vec<Vec<(T, DocAddress)>>,
/// The result of the sort is not cached. ) -> Result<Vec<(T, DocAddress)>> {
pub fn top_docs(&self) -> Vec<(T, DocAddress)> { if self.limit == 0 {
let mut feature_docs: Vec<ComparableDoc<T>> = self.heap.iter().cloned().collect(); return Ok(Vec::new());
feature_docs.sort(); }
feature_docs let mut top_collector = BinaryHeap::new();
for child_fruit in children {
for (feature, doc) in child_fruit {
if top_collector.len() < self.limit {
top_collector.push(ComparableDoc { feature, doc });
} else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
}
}
}
Ok(top_collector
.into_sorted_vec()
.into_iter() .into_iter()
.map( .map(|cdoc| (cdoc.feature, cdoc.doc))
|ComparableDoc { .collect())
feature, }
doc_address,
}| (feature, doc_address), pub(crate) fn for_segment<F: PartialOrd>(
).collect() &self,
segment_id: SegmentLocalId,
_: &SegmentReader,
) -> Result<TopSegmentCollector<F>> {
Ok(TopSegmentCollector::new(segment_id, self.limit))
}
}
/// The Top Collector keeps track of the K documents
/// sorted by type `T`.
///
/// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
pub(crate) struct TopSegmentCollector<T> {
limit: usize,
heap: BinaryHeap<ComparableDoc<T, DocId>>,
segment_id: u32,
}
impl<T: PartialOrd> TopSegmentCollector<T> {
fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
TopSegmentCollector {
limit,
heap: BinaryHeap::with_capacity(limit),
segment_id,
}
}
}
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
pub fn harvest(self) -> Vec<(T, DocAddress)> {
let segment_id = self.segment_id;
self.heap
.into_sorted_vec()
.into_iter()
.map(|comparable_doc| {
(
comparable_doc.feature,
DocAddress(segment_id, comparable_doc.doc),
)
})
.collect()
} }
/// Return true iff at least K documents have gone through /// Return true iff at least K documents have gone through
/// the collector. /// the collector.
#[inline] #[inline(always)]
pub fn at_capacity(&self) -> bool { pub(crate) fn at_capacity(&self) -> bool {
self.heap.len() >= self.limit self.heap.len() >= self.limit
} }
/// Sets the segment local ID for the collector
pub fn set_segment_id(&mut self, segment_id: SegmentLocalId) {
self.segment_id = segment_id;
}
/// Collects a document scored by the given feature /// Collects a document scored by the given feature
/// ///
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it /// It collects documents until it has reached the max capacity. Once it reaches capacity, it
/// will compare the lowest scoring item with the given one and keep whichever is greater. /// will compare the lowest scoring item with the given one and keep whichever is greater.
#[inline(always)]
pub fn collect(&mut self, doc: DocId, feature: T) { pub fn collect(&mut self, doc: DocId, feature: T) {
if self.at_capacity() { if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden. // It's ok to unwrap as long as a limit of 0 is forbidden.
let limit_doc: ComparableDoc<T> = self if let Some(limit_feature) = self.heap.peek().map(|head| head.feature.clone()) {
.heap if limit_feature < feature {
.peek() if let Some(mut head) = self.heap.peek_mut() {
.expect("Top collector with size 0 is forbidden") head.feature = feature;
.clone(); head.doc = doc;
if limit_doc.feature < feature { }
let mut mut_head = self }
.heap
.peek_mut()
.expect("Top collector with size 0 is forbidden");
mut_head.feature = feature;
mut_head.doc_address = DocAddress(self.segment_id, doc);
} }
} else { } else {
let wrapped_doc = ComparableDoc { // we have not reached capacity yet, so we can just push the
feature, // element.
doc_address: DocAddress(self.segment_id, doc), self.heap.push(ComparableDoc { feature, doc });
};
self.heap.push(wrapped_doc);
} }
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::{TopCollector, TopSegmentCollector};
use DocId; use DocAddress;
use Score; use Score;
#[test] #[test]
fn test_top_collector_not_at_capacity() { fn test_top_collector_not_at_capacity() {
let mut top_collector = TopCollector::with_limit(4); let mut top_collector = TopSegmentCollector::new(0, 4);
top_collector.collect(1, 0.8); top_collector.collect(1, 0.8);
top_collector.collect(3, 0.2); top_collector.collect(3, 0.2);
top_collector.collect(5, 0.3); top_collector.collect(5, 0.3);
assert!(!top_collector.at_capacity()); assert_eq!(
let score_docs: Vec<(Score, DocId)> = top_collector top_collector.harvest(),
.top_docs() vec![
.into_iter() (0.8, DocAddress(0, 1)),
.map(|(score, doc_address)| (score, doc_address.doc())) (0.3, DocAddress(0, 5)),
.collect(); (0.2, DocAddress(0, 3))
assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]); ]
);
} }
#[test] #[test]
fn test_top_collector_at_capacity() { fn test_top_collector_at_capacity() {
let mut top_collector = TopCollector::with_limit(4); let mut top_collector = TopSegmentCollector::new(0, 4);
top_collector.collect(1, 0.8); top_collector.collect(1, 0.8);
top_collector.collect(3, 0.2); top_collector.collect(3, 0.2);
top_collector.collect(5, 0.3); top_collector.collect(5, 0.3);
top_collector.collect(7, 0.9); top_collector.collect(7, 0.9);
top_collector.collect(9, -0.2); top_collector.collect(9, -0.2);
assert!(top_collector.at_capacity()); assert_eq!(
{ top_collector.harvest(),
let score_docs: Vec<(Score, DocId)> = top_collector vec![
.top_docs() (0.9, DocAddress(0, 7)),
.into_iter() (0.8, DocAddress(0, 1)),
.map(|(score, doc_address)| (score, doc_address.doc())) (0.3, DocAddress(0, 5)),
.collect(); (0.2, DocAddress(0, 3))
assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]); ]
} );
{
let docs: Vec<DocId> = top_collector
.docs()
.into_iter()
.map(|doc_address| doc_address.doc())
.collect();
assert_eq!(docs, vec![7, 1, 5, 3]);
}
} }
#[test] #[test]
@@ -189,5 +221,4 @@ mod tests {
fn test_top_0() { fn test_top_0() {
let _collector: TopCollector<Score> = TopCollector::with_limit(0); let _collector: TopCollector<Score> = TopCollector::with_limit(0);
} }
} }

View File

@@ -1,13 +1,16 @@
use super::Collector; use super::Collector;
use collector::top_collector::TopCollector; use collector::top_collector::TopCollector;
use collector::top_collector::TopSegmentCollector;
use collector::SegmentCollector;
use fastfield::FastFieldReader; use fastfield::FastFieldReader;
use fastfield::FastValue; use fastfield::FastValue;
use schema::Field; use schema::Field;
use std::marker::PhantomData;
use DocAddress; use DocAddress;
use DocId;
use Result; use Result;
use Score; use SegmentLocalId;
use SegmentReader; use SegmentReader;
use TantivyError;
/// The Top Field Collector keeps track of the K documents /// The Top Field Collector keeps track of the K documents
/// sorted by a fast field in the index /// sorted by a fast field in the index
@@ -19,136 +22,151 @@ use SegmentReader;
/// ```rust /// ```rust
/// #[macro_use] /// #[macro_use]
/// extern crate tantivy; /// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT, FAST}; /// # use tantivy::schema::{Schema, Field, FAST, TEXT};
/// use tantivy::{Index, Result, DocId}; /// # use tantivy::{Index, Result, DocAddress};
/// use tantivy::collector::TopFieldCollector; /// # use tantivy::query::{Query, QueryParser};
/// use tantivy::query::QueryParser; /// use tantivy::Searcher;
/// use tantivy::collector::TopDocs;
/// ///
/// # fn main() { example().unwrap(); } /// # fn main() -> tantivy::Result<()> {
/// fn example() -> Result<()> { /// # let mut schema_builder = Schema::builder();
/// let mut schema_builder = SchemaBuilder::new(); /// # let title = schema_builder.add_text_field("title", TEXT);
/// let title = schema_builder.add_text_field("title", TEXT); /// # let rating = schema_builder.add_u64_field("rating", FAST);
/// let rating = schema_builder.add_u64_field("rating", FAST); /// # let schema = schema_builder.build();
/// let schema = schema_builder.build(); /// # let index = Index::create_in_ram(schema);
/// let index = Index::create_in_ram(schema); /// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// { /// # index_writer.add_document(doc!(
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; /// # title => "The Name of the Wind",
/// index_writer.add_document(doc!( /// # rating => 92u64,
/// title => "The Name of the Wind", /// # ));
/// rating => 92u64, /// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// )); /// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
/// index_writer.add_document(doc!( /// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
/// title => "The Diary of Muadib", /// # index_writer.commit()?;
/// rating => 97u64, /// # let reader = index.reader()?;
/// )); /// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
/// index_writer.add_document(doc!( /// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
/// title => "A Dairy Cow", /// # assert_eq!(top_docs,
/// rating => 63u64, /// # vec![(97u64, DocAddress(0u32, 1)),
/// )); /// # (80u64, DocAddress(0u32, 3))]);
/// index_writer.add_document(doc!( /// # Ok(())
/// title => "The Diary of a Young Girl", /// # }
/// rating => 80u64, /// #
/// )); /// /// Searches the document matching the given query, and
/// index_writer.commit().unwrap(); /// /// collects the top 10 documents, order by the `field`
/// } /// /// given in argument.
/// ///
/// /// `field` is required to be a FAST field.
/// fn docs_sorted_by_rating(searcher: &Searcher,
/// query: &Query,
/// sort_by_field: Field)
/// -> Result<Vec<(u64, DocAddress)>> {
/// ///
/// index.load_searchers()?; /// // This is where we build our collector!
/// let searcher = index.searcher(); /// let top_docs_by_rating = TopDocs::with_limit(2).order_by_field(sort_by_field);
/// ///
/// { /// // ... and here is our documents. Not this is a simple vec.
/// let mut top_collector = TopFieldCollector::with_limit(rating, 2); /// // The `u64` in the pair is the value of our fast field for each documents.
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// searcher.search(query, &top_docs_by_rating)
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut top_collector).unwrap();
///
/// let score_docs: Vec<(u64, DocId)> = top_collector
/// .top_docs()
/// .into_iter()
/// .map(|(field, doc_address)| (field, doc_address.doc()))
/// .collect();
///
/// assert_eq!(score_docs, vec![(97u64, 1), (80, 3)]);
/// }
///
/// Ok(())
/// } /// }
/// ``` /// ```
pub struct TopFieldCollector<T: FastValue> { pub struct TopDocsByField<T> {
field: Field,
collector: TopCollector<T>, collector: TopCollector<T>,
fast_field: Option<FastFieldReader<T>>, field: Field,
} }
impl<T: FastValue + PartialOrd + Clone> TopFieldCollector<T> { impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
/// Creates a top field collector, with a number of documents equal to "limit". /// Creates a top field collector, with a number of documents equal to "limit".
/// ///
/// The given field name must be a fast field, otherwise the collector have an error while /// The given field name must be a fast field, otherwise the collector have an error while
/// collecting results. /// collecting results.
/// ///
/// This constructor is crate-private. Clients are supposed to
/// build a `TopDocsByField` object using the `TopDocs` API.
///
/// e.g.:
/// `TopDocs::with_limit(2).order_by_field(sort_by_field)`
///
/// # Panics /// # Panics
/// The method panics if limit is 0 /// The method panics if limit is 0
pub fn with_limit(field: Field, limit: usize) -> Self { pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
TopFieldCollector { TopDocsByField {
field,
collector: TopCollector::with_limit(limit), collector: TopCollector::with_limit(limit),
fast_field: None, field,
} }
} }
/// Returns K best documents sorted the given field name in decreasing order.
///
/// Calling this method triggers the sort.
/// The result of the sort is not cached.
pub fn docs(&self) -> Vec<DocAddress> {
self.collector.docs()
}
/// Returns K best FieldDocuments sorted in decreasing order.
///
/// Calling this method triggers the sort.
/// The result of the sort is not cached.
pub fn top_docs(&self) -> Vec<(T, DocAddress)> {
self.collector.top_docs()
}
/// Return true iff at least K documents have gone through
/// the collector.
#[inline]
pub fn at_capacity(&self) -> bool {
self.collector.at_capacity()
}
} }
impl<T: FastValue + PartialOrd + Clone> Collector for TopFieldCollector<T> { impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
fn set_segment(&mut self, segment_id: u32, segment: &SegmentReader) -> Result<()> { type Fruit = Vec<(T, DocAddress)>;
self.collector.set_segment_id(segment_id);
self.fast_field = Some(segment.fast_field_reader(self.field)?);
Ok(())
}
fn collect(&mut self, doc: DocId, _score: Score) { type Child = TopFieldSegmentCollector<T>;
let field_value = self
.fast_field fn for_segment(
.as_ref() &self,
.expect("collect() was called before set_segment. This should never happen.") segment_local_id: SegmentLocalId,
.get(doc); reader: &SegmentReader,
self.collector.collect(doc, field_value); ) -> Result<TopFieldSegmentCollector<T>> {
let collector = self.collector.for_segment(segment_local_id, reader)?;
let reader = reader.fast_fields().u64(self.field).ok_or_else(|| {
let field_name = reader.schema().get_field_name(self.field);
TantivyError::SchemaError(format!("Failed to find fast field reader {:?}", field_name))
})?;
Ok(TopFieldSegmentCollector {
collector,
reader,
_type: PhantomData,
})
} }
fn requires_scoring(&self) -> bool { fn requires_scoring(&self) -> bool {
false false
} }
fn merge_fruits(
&self,
segment_fruits: Vec<Vec<(T, DocAddress)>>,
) -> Result<Vec<(T, DocAddress)>> {
self.collector.merge_fruits(segment_fruits)
}
}
pub struct TopFieldSegmentCollector<T> {
collector: TopSegmentCollector<u64>,
reader: FastFieldReader<u64>,
_type: PhantomData<T>,
}
impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
for TopFieldSegmentCollector<T>
{
type Fruit = Vec<(T, DocAddress)>;
fn collect(&mut self, doc: u32, _score: f32) {
let field_value = self.reader.get(doc);
self.collector.collect(doc, field_value);
}
fn harvest(self) -> Vec<(T, DocAddress)> {
self.collector
.harvest()
.into_iter()
.map(|(val, doc_address)| (T::from_u64(val), doc_address))
.collect()
}
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::TopDocsByField;
use collector::Collector;
use collector::TopDocs;
use query::Query; use query::Query;
use query::QueryParser; use query::QueryParser;
use schema::Field; use schema::Field;
use schema::IntOptions; use schema::IntOptions;
use schema::Schema; use schema::{Schema, FAST, TEXT};
use schema::{SchemaBuilder, FAST, TEXT}; use DocAddress;
use Index; use Index;
use IndexWriter; use IndexWriter;
use TantivyError; use TantivyError;
@@ -158,7 +176,7 @@ mod tests {
#[test] #[test]
fn test_top_collector_not_at_capacity() { fn test_top_collector_not_at_capacity() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field(TITLE, TEXT); let title = schema_builder.add_text_field(TITLE, TEXT);
let size = schema_builder.add_u64_field(SIZE, FAST); let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -176,24 +194,24 @@ mod tests {
size => 16u64, size => 16u64,
)); ));
}); });
let searcher = index.searcher(); let searcher = index.reader().unwrap().searcher();
let mut top_collector = TopFieldCollector::with_limit(size, 4); let top_collector = TopDocs::with_limit(4).order_by_field(size);
searcher.search(&*query, &mut top_collector).unwrap(); let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
assert!(!top_collector.at_capacity()); assert_eq!(
top_docs,
let score_docs: Vec<(u64, DocId)> = top_collector vec![
.top_docs() (64, DocAddress(0, 1)),
.into_iter() (16, DocAddress(0, 2)),
.map(|(field, doc_address)| (field, doc_address.doc())) (12, DocAddress(0, 0))
.collect(); ]
assert_eq!(score_docs, vec![(64, 1), (16, 2), (12, 0)]); );
} }
#[test] #[test]
#[should_panic] #[should_panic]
fn test_field_does_not_exist() { fn test_field_does_not_exist() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field(TITLE, TEXT); let title = schema_builder.add_text_field(TITLE, TEXT);
let size = schema_builder.add_u64_field(SIZE, FAST); let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -203,15 +221,17 @@ mod tests {
size => 12u64, size => 12u64,
)); ));
}); });
let searcher = index.searcher(); let searcher = index.reader().unwrap().searcher();
let segment = searcher.segment_reader(0); let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
let mut top_collector: TopFieldCollector<u64> = TopFieldCollector::with_limit(Field(2), 4); let segment_reader = searcher.segment_reader(0u32);
let _ = top_collector.set_segment(0, segment); top_collector
.for_segment(0, segment_reader)
.expect("should panic");
} }
#[test] #[test]
fn test_field_not_fast_field() { fn test_field_not_fast_field() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field(TITLE, TEXT); let title = schema_builder.add_text_field(TITLE, TEXT);
let size = schema_builder.add_u64_field(SIZE, IntOptions::default()); let size = schema_builder.add_u64_field(SIZE, IntOptions::default());
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -221,28 +241,18 @@ mod tests {
size => 12u64, size => 12u64,
)); ));
}); });
let searcher = index.searcher(); let searcher = index.reader().unwrap().searcher();
let segment = searcher.segment_reader(0); let segment = searcher.segment_reader(0);
let mut top_collector: TopFieldCollector<u64> = TopFieldCollector::with_limit(size, 4); let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
assert_matches!( assert_matches!(
top_collector.set_segment(0, segment), top_collector
Err(TantivyError::FastFieldError(_)) .for_segment(0, segment)
.map(|_| ())
.unwrap_err(),
TantivyError::SchemaError(_)
); );
} }
#[test]
#[should_panic]
fn test_collect_before_set_segment() {
let mut top_collector: TopFieldCollector<u64> = TopFieldCollector::with_limit(Field(0), 4);
top_collector.collect(0, 0f32);
}
#[test]
#[should_panic]
fn test_top_0() {
let _: TopFieldCollector<u64> = TopFieldCollector::with_limit(Field(0), 0);
}
fn index( fn index(
query: &str, query: &str,
query_field: Field, query_field: Field,
@@ -254,8 +264,6 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
doc_adder(&mut index_writer); doc_adder(&mut index_writer);
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap();
let query_parser = QueryParser::for_index(&index, vec![query_field]); let query_parser = QueryParser::for_index(&index, vec![query_field]);
let query = query_parser.parse_query(query).unwrap(); let query = query_parser.parse_query(query).unwrap();
(index, query) (index, query)

View File

@@ -1,5 +1,10 @@
use super::Collector; use super::Collector;
use collector::top_collector::TopCollector; use collector::top_collector::TopCollector;
use collector::top_collector::TopSegmentCollector;
use collector::SegmentCollector;
use collector::TopDocsByField;
use fastfield::FastValue;
use schema::Field;
use DocAddress; use DocAddress;
use DocId; use DocId;
use Result; use Result;
@@ -17,14 +22,15 @@ use SegmentReader;
/// ```rust /// ```rust
/// #[macro_use] /// #[macro_use]
/// extern crate tantivy; /// extern crate tantivy;
/// use tantivy::schema::{SchemaBuilder, TEXT}; /// use tantivy::DocAddress;
/// use tantivy::{Index, Result, DocId, Score}; /// use tantivy::schema::{Schema, TEXT};
/// use tantivy::collector::TopScoreCollector; /// use tantivy::{Index, Result};
/// use tantivy::collector::TopDocs;
/// use tantivy::query::QueryParser; /// use tantivy::query::QueryParser;
/// ///
/// # fn main() { example().unwrap(); } /// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> { /// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build(); /// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema); /// let index = Index::create_in_ram(schema);
@@ -45,143 +51,153 @@ use SegmentReader;
/// index_writer.commit().unwrap(); /// index_writer.commit().unwrap();
/// } /// }
/// ///
/// index.load_searchers()?; /// let reader = index.reader()?;
/// let searcher = index.searcher(); /// let searcher = reader.searcher();
/// ///
/// { /// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let mut top_collector = TopScoreCollector::with_limit(2); /// let query = query_parser.parse_query("diary")?;
/// let query_parser = QueryParser::for_index(&index, vec![title]); /// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
/// let query = query_parser.parse_query("diary")?;
/// searcher.search(&*query, &mut top_collector).unwrap();
/// ///
/// let score_docs: Vec<(Score, DocId)> = top_collector /// assert_eq!(&top_docs[0], &(0.7261542, DocAddress(0, 1)));
/// .top_docs() /// assert_eq!(&top_docs[1], &(0.6099695, DocAddress(0, 3)));
/// .into_iter()
/// .map(|(score, doc_address)| (score, doc_address.doc()))
/// .collect();
///
/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]);
/// }
/// ///
/// Ok(()) /// Ok(())
/// } /// }
/// ``` /// ```
pub struct TopScoreCollector { pub struct TopDocs(TopCollector<Score>);
collector: TopCollector<Score>,
}
impl TopScoreCollector { impl TopDocs {
/// Creates a top score collector, with a number of documents equal to "limit". /// Creates a top score collector, with a number of documents equal to "limit".
/// ///
/// # Panics /// # Panics
/// The method panics if limit is 0 /// The method panics if limit is 0
pub fn with_limit(limit: usize) -> TopScoreCollector { pub fn with_limit(limit: usize) -> TopDocs {
TopScoreCollector { TopDocs(TopCollector::with_limit(limit))
collector: TopCollector::with_limit(limit),
}
} }
/// Returns K best scored documents sorted in decreasing order. /// Set top-K to rank documents by a given fast field.
/// ///
/// Calling this method triggers the sort. /// (By default, `TopDocs` collects the top-K documents sorted by
/// The result of the sort is not cached. /// the similarity score.)
pub fn docs(&self) -> Vec<DocAddress> { pub fn order_by_field<T: PartialOrd + FastValue + Clone>(
self.collector.docs() self,
} field: Field,
) -> TopDocsByField<T> {
/// Returns K best ScoredDocuments sorted in decreasing order. TopDocsByField::new(field, self.0.limit())
///
/// Calling this method triggers the sort.
/// The result of the sort is not cached.
pub fn top_docs(&self) -> Vec<(Score, DocAddress)> {
self.collector.top_docs()
}
/// Returns K best ScoredDocuments sorted in decreasing order.
///
/// Calling this method triggers the sort.
/// The result of the sort is not cached.
#[deprecated]
pub fn score_docs(&self) -> Vec<(Score, DocAddress)> {
self.collector.top_docs()
}
/// Return true iff at least K documents have gone through
/// the collector.
#[inline]
pub fn at_capacity(&self) -> bool {
self.collector.at_capacity()
} }
} }
impl Collector for TopScoreCollector { impl Collector for TopDocs {
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> { type Fruit = Vec<(Score, DocAddress)>;
self.collector.set_segment_id(segment_id);
Ok(())
}
fn collect(&mut self, doc: DocId, score: Score) { type Child = TopScoreSegmentCollector;
self.collector.collect(doc, score);
fn for_segment(
&self,
segment_local_id: SegmentLocalId,
reader: &SegmentReader,
) -> Result<Self::Child> {
let collector = self.0.for_segment(segment_local_id, reader)?;
Ok(TopScoreSegmentCollector(collector))
} }
fn requires_scoring(&self) -> bool { fn requires_scoring(&self) -> bool {
true true
} }
/// Merges the per-segment results into the final fruit by delegating to the
/// wrapped collector (`self.0`).
fn merge_fruits(&self, child_fruits: Vec<Vec<(Score, DocAddress)>>) -> Result<Self::Fruit> {
self.0.merge_fruits(child_fruits)
}
}
/// Segment Collector associated to `TopDocs`.
///
/// Thin newtype around a `TopSegmentCollector<Score>`.
pub struct TopScoreSegmentCollector(TopSegmentCollector<Score>);
impl SegmentCollector for TopScoreSegmentCollector {
type Fruit = Vec<(Score, DocAddress)>;
/// Forwards the `(doc, score)` pair to the wrapped segment collector.
fn collect(&mut self, doc: DocId, score: Score) {
self.0.collect(doc, score)
}
/// Consumes the collector and returns whatever the wrapped segment collector harvested.
fn harvest(self) -> Vec<(Score, DocAddress)> {
self.0.harvest()
}
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::TopDocs;
use collector::Collector; use query::QueryParser;
use DocId; use schema::Schema;
use schema::TEXT;
use DocAddress;
use Index;
use Score; use Score;
/// Test fixture: builds an in-RAM index with a single text field named `"text"`
/// and commits three documents to it.
fn make_index() -> Index {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
// The writer lives in its own scope so it is dropped before the index is returned.
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
index_writer.add_document(doc!(text_field=>"I like Droopy"));
assert!(index_writer.commit().is_ok());
}
index
}
#[test] #[test]
fn test_top_collector_not_at_capacity() { fn test_top_collector_not_at_capacity() {
let mut top_collector = TopScoreCollector::with_limit(4); let index = make_index();
top_collector.collect(1, 0.8); let field = index.schema().get_field("text").unwrap();
top_collector.collect(3, 0.2); let query_parser = QueryParser::for_index(&index, vec![field]);
top_collector.collect(5, 0.3); let text_query = query_parser.parse_query("droopy tax").unwrap();
assert!(!top_collector.at_capacity()); let score_docs: Vec<(Score, DocAddress)> = index
let score_docs: Vec<(Score, DocId)> = top_collector .reader()
.top_docs() .unwrap()
.into_iter() .searcher()
.map(|(score, doc_address)| (score, doc_address.doc())) .search(&text_query, &TopDocs::with_limit(4))
.collect(); .unwrap();
assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]); assert_eq!(
score_docs,
vec![
(0.81221175, DocAddress(0u32, 1)),
(0.5376842, DocAddress(0u32, 2)),
(0.48527452, DocAddress(0, 0))
]
);
} }
#[test] #[test]
fn test_top_collector_at_capacity() { fn test_top_collector_at_capacity() {
let mut top_collector = TopScoreCollector::with_limit(4); let index = make_index();
top_collector.collect(1, 0.8); let field = index.schema().get_field("text").unwrap();
top_collector.collect(3, 0.2); let query_parser = QueryParser::for_index(&index, vec![field]);
top_collector.collect(5, 0.3); let text_query = query_parser.parse_query("droopy tax").unwrap();
top_collector.collect(7, 0.9); let score_docs: Vec<(Score, DocAddress)> = index
top_collector.collect(9, -0.2); .reader()
assert!(top_collector.at_capacity()); .unwrap()
{ .searcher()
let score_docs: Vec<(Score, DocId)> = top_collector .search(&text_query, &TopDocs::with_limit(2))
.top_docs() .unwrap();
.into_iter() assert_eq!(
.map(|(score, doc_address)| (score, doc_address.doc())) score_docs,
.collect(); vec![
assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]); (0.81221175, DocAddress(0u32, 1)),
} (0.5376842, DocAddress(0u32, 2)),
{ ]
let docs: Vec<DocId> = top_collector );
.docs()
.into_iter()
.map(|doc_address| doc_address.doc())
.collect();
assert_eq!(docs, vec![7, 1, 5, 3]);
}
} }
#[test] #[test]
#[should_panic] #[should_panic]
fn test_top_0() { fn test_top_0() {
TopScoreCollector::with_limit(0); TopDocs::with_limit(0);
} }
} }

View File

@@ -1,9 +1,6 @@
use common::serialize::BinarySerializable; use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use std::io; use std::io;
use std::io::Write;
use std::mem;
use std::ops::Deref; use std::ops::Deref;
use std::ptr;
pub(crate) struct BitPacker { pub(crate) struct BitPacker {
mini_buffer: u64, mini_buffer: u64,
@@ -18,7 +15,7 @@ impl BitPacker {
} }
} }
pub fn write<TWrite: Write>( pub fn write<TWrite: io::Write>(
&mut self, &mut self,
val: u64, val: u64,
num_bits: u8, num_bits: u8,
@@ -28,14 +25,14 @@ impl BitPacker {
let num_bits = num_bits as usize; let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 { if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32); self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
self.mini_buffer.serialize(output)?; output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32); self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64; self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else { } else {
self.mini_buffer |= val_u64 << self.mini_buffer_written; self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits; self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 { if self.mini_buffer_written == 64 {
self.mini_buffer.serialize(output)?; output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer_written = 0; self.mini_buffer_written = 0;
self.mini_buffer = 0u64; self.mini_buffer = 0u64;
} }
@@ -43,17 +40,18 @@ impl BitPacker {
Ok(()) Ok(())
} }
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> { pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 { if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8; let num_bytes = (self.mini_buffer_written + 7) / 8;
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) }; let mut arr: [u8; 8] = [0u8; 8];
LittleEndian::write_u64(&mut arr, self.mini_buffer);
output.write_all(&arr[..num_bytes])?; output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0; self.mini_buffer_written = 0;
} }
Ok(()) Ok(())
} }
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> { pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?; self.flush(output)?;
// Padding the write file to simplify reads. // Padding the write file to simplify reads.
output.write_all(&[0u8; 7])?; output.write_all(&[0u8; 7])?;
@@ -66,7 +64,7 @@ pub struct BitUnpacker<Data>
where where
Data: Deref<Target = [u8]>, Data: Deref<Target = [u8]>,
{ {
num_bits: usize, num_bits: u64,
mask: u64, mask: u64,
data: Data, data: Data,
} }
@@ -82,13 +80,13 @@ where
(1u64 << num_bits) - 1u64 (1u64 << num_bits) - 1u64
}; };
BitUnpacker { BitUnpacker {
num_bits: num_bits as usize, num_bits: u64::from(num_bits),
mask, mask,
data, data,
} }
} }
pub fn get(&self, idx: usize) -> u64 { pub fn get(&self, idx: u64) -> u64 {
if self.num_bits == 0 { if self.num_bits == 0 {
return 0u64; return 0u64;
} }
@@ -99,42 +97,13 @@ where
let addr = addr_in_bits >> 3; let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7; let bit_shift = addr_in_bits & 7;
debug_assert!( debug_assert!(
addr + 8 <= data.len(), addr + 8 <= data.len() as u64,
"The fast field field should have been padded with 7 bytes." "The fast field field should have been padded with 7 bytes."
); );
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))] let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask val_shifted & mask
} }
/// Reads a range of values from the fast field.
///
/// The range of values read is from
/// `[start..start + output.len()[`
pub fn get_range(&self, start: u32, output: &mut [u64]) {
if self.num_bits == 0 {
for val in output.iter_mut() {
*val = 0u64;
}
} else {
let data: &[u8] = &*self.data;
let num_bits = self.num_bits;
let mask = self.mask;
let mut addr_in_bits = (start as usize) * num_bits;
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;
}
}
}
} }
#[cfg(test)] #[cfg(test)]
@@ -160,7 +129,7 @@ mod test {
fn test_bitpacker_util(len: usize, num_bits: u8) { fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits); let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() { for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i), *val); assert_eq!(bitunpacker.get(i as u64), *val);
} }
} }
@@ -172,17 +141,4 @@ mod test {
test_bitpacker_util(6, 14); test_bitpacker_util(6, 14);
test_bitpacker_util(1000, 14); test_bitpacker_util(1000, 14);
} }
#[test]
fn test_bitpacker_range() {
let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
let buffer_len = 100;
let mut buffer = vec![0u64; buffer_len];
for start in vec![0, 10, 20, 100, 1_000] {
bitunpacker.get_range(start as u32, &mut buffer[..]);
for i in 0..buffer_len {
assert_eq!(buffer[i], vals[start + i]);
}
}
}
} }

View File

@@ -4,8 +4,8 @@ use common::VInt;
use directory::ReadOnlySource; use directory::ReadOnlySource;
use directory::WritePtr; use directory::WritePtr;
use schema::Field; use schema::Field;
use space_usage::PerFieldSpaceUsage;
use space_usage::FieldUsage; use space_usage::FieldUsage;
use space_usage::PerFieldSpaceUsage;
use std::collections::HashMap; use std::collections::HashMap;
use std::io::Write; use std::io::Write;
use std::io::{self, Read}; use std::io::{self, Read};
@@ -39,7 +39,7 @@ impl BinarySerializable for FileAddr {
/// A `CompositeWrite` is used to write a `CompositeFile`. /// A `CompositeWrite` is used to write a `CompositeFile`.
pub struct CompositeWrite<W = WritePtr> { pub struct CompositeWrite<W = WritePtr> {
write: CountingWriter<W>, write: CountingWriter<W>,
offsets: HashMap<FileAddr, usize>, offsets: HashMap<FileAddr, u64>,
} }
impl<W: Write> CompositeWrite<W> { impl<W: Write> CompositeWrite<W> {
@@ -172,7 +172,8 @@ impl CompositeFile {
pub fn space_usage(&self) -> PerFieldSpaceUsage { pub fn space_usage(&self) -> PerFieldSpaceUsage {
let mut fields = HashMap::new(); let mut fields = HashMap::new();
for (&field_addr, &(start, end)) in self.offsets_index.iter() { for (&field_addr, &(start, end)) in self.offsets_index.iter() {
fields.entry(field_addr.field) fields
.entry(field_addr.field)
.or_insert_with(|| FieldUsage::empty(field_addr.field)) .or_insert_with(|| FieldUsage::empty(field_addr.field))
.add_field_idx(field_addr.idx, end - start); .add_field_idx(field_addr.idx, end - start);
} }

View File

@@ -3,7 +3,7 @@ use std::io::Write;
pub struct CountingWriter<W> { pub struct CountingWriter<W> {
underlying: W, underlying: W,
written_bytes: usize, written_bytes: u64,
} }
impl<W: Write> CountingWriter<W> { impl<W: Write> CountingWriter<W> {
@@ -14,11 +14,11 @@ impl<W: Write> CountingWriter<W> {
} }
} }
pub fn written_bytes(&self) -> usize { pub fn written_bytes(&self) -> u64 {
self.written_bytes self.written_bytes
} }
pub fn finish(mut self) -> io::Result<(W, usize)> { pub fn finish(mut self) -> io::Result<(W, u64)> {
self.flush()?; self.flush()?;
Ok((self.underlying, self.written_bytes)) Ok((self.underlying, self.written_bytes))
} }
@@ -27,10 +27,16 @@ impl<W: Write> CountingWriter<W> {
impl<W: Write> Write for CountingWriter<W> { impl<W: Write> Write for CountingWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> { fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
let written_size = self.underlying.write(buf)?; let written_size = self.underlying.write(buf)?;
self.written_bytes += written_size; self.written_bytes += written_size as u64;
Ok(written_size) Ok(written_size)
} }
// Forwards to the underlying writer's `write_all` and, only once it succeeded,
// adds the whole buffer length to the byte counter. If the underlying call
// fails, the counter is left untouched.
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
self.underlying.write_all(buf)?;
self.written_bytes += buf.len() as u64;
Ok(())
}
fn flush(&mut self) -> io::Result<()> { fn flush(&mut self) -> io::Result<()> {
self.underlying.flush() self.underlying.flush()
} }
@@ -48,8 +54,8 @@ mod test {
let mut counting_writer = CountingWriter::wrap(buffer); let mut counting_writer = CountingWriter::wrap(buffer);
let bytes = (0u8..10u8).collect::<Vec<u8>>(); let bytes = (0u8..10u8).collect::<Vec<u8>>();
counting_writer.write_all(&bytes).unwrap(); counting_writer.write_all(&bytes).unwrap();
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap(); let (w, len): (Vec<u8>, u64) = counting_writer.finish().unwrap();
assert_eq!(len, 10); assert_eq!(len, 10u64);
assert_eq!(w.len(), 10); assert_eq!(w.len(), 10);
} }
} }

View File

@@ -10,10 +10,13 @@ pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::counting_writer::CountingWriter; pub use self::counting_writer::CountingWriter;
pub use self::serialize::{BinarySerializable, FixedSize}; pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::VInt; pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
pub use byteorder::LittleEndian as Endianness; pub use byteorder::LittleEndian as Endianness;
use std::io; /// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments whose max doc is `MAX_DOC_LIMIT` (`1 << 31`) or more.
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
/// Computes the number of bits that will be used for bitpacking. /// Computes the number of bits that will be used for bitpacking.
/// ///
@@ -52,11 +55,6 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
(n > 0) && (n & (n - 1) == 0) (n > 0) && (n & (n - 1) == 0)
} }
/// Create a default io error given a string.
pub(crate) fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
/// Has length trait /// Has length trait
pub trait HasLen { pub trait HasLen {
/// Return length /// Return length
@@ -134,4 +132,11 @@ pub(crate) mod test {
assert_eq!(compute_num_bits(256), 9u8); assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8); assert_eq!(compute_num_bits(5_000_000_000), 33u8);
} }
#[test]
fn test_max_doc() {
// this is the first time I write a unit test for a constant.
// Pins `MAX_DOC_LIMIT` as the smallest `u32` whose cast to `i32` is negative:
// the value just below it still casts to a non-negative `i32`.
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0);
}
} }

View File

@@ -1,4 +1,5 @@
use super::BinarySerializable; use super::BinarySerializable;
use byteorder::{ByteOrder, LittleEndian};
use std::io; use std::io;
use std::io::Read; use std::io::Read;
use std::io::Write; use std::io::Write;
@@ -9,6 +10,100 @@ pub struct VInt(pub u64);
const STOP_BIT: u8 = 128; const STOP_BIT: u8 = 128;
/// Encodes a `u32` as a variable-length integer packed into a `u64`.
///
/// Returns `(payload, num_bytes)`. The `num_bytes` (between 1 and 5)
/// lowest-order bytes of `payload`, read in little-endian order, form the
/// serialized vint: each byte carries 7 bits of `val`, least-significant
/// group first, and the last byte has its high bit (the stop bit) set.
pub fn serialize_vint_u32(val: u32) -> (u64, usize) {
    // Smallest value requiring 2, 3, 4 and 5 bytes respectively.
    const START_2: u64 = 1 << 7;
    const START_3: u64 = 1 << 14;
    const START_4: u64 = 1 << 21;
    const START_5: u64 = 1 << 28;

    // Largest value fitting in 1, 2, 3 and 4 bytes respectively.
    const STOP_1: u64 = START_2 - 1;
    const STOP_2: u64 = START_3 - 1;
    const STOP_3: u64 = START_4 - 1;
    const STOP_4: u64 = START_5 - 1;

    // `MASK_k` selects the k-th group of 7 bits of the input value.
    const MASK_1: u64 = 127;
    const MASK_2: u64 = MASK_1 << 7;
    const MASK_3: u64 = MASK_2 << 7;
    const MASK_4: u64 = MASK_3 << 7;
    const MASK_5: u64 = MASK_4 << 7;

    // High bit marking the final byte, as a `u64` so it can be shifted into place.
    const STOP_BIT: u64 = 128u64;

    let val = u64::from(val);
    // The k-th 7-bit group is shifted left by (k - 1) extra bits so that it lands
    // in the low 7 bits of the k-th output byte.
    match val {
        0..=STOP_1 => (val | STOP_BIT, 1),
        START_2..=STOP_2 => (
            (val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
            2,
        ),
        START_3..=STOP_3 => (
            (val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
            3,
        ),
        START_4..=STOP_4 => (
            (val & MASK_1)
                | ((val & MASK_2) << 1)
                | ((val & MASK_3) << 2)
                | ((val & MASK_4) << 3)
                | (STOP_BIT << (8 * 3)),
            4,
        ),
        _ => (
            (val & MASK_1)
                | ((val & MASK_2) << 1)
                | ((val & MASK_3) << 2)
                | ((val & MASK_4) << 3)
                | ((val & MASK_5) << 4)
                | (STOP_BIT << (8 * 4)),
            5,
        ),
    }
}
/// Computes how many bytes the serialized vint `u32` at the start of `data` spans.
///
/// Only the first 5 bytes of `data` are inspected; the vint ends at the first
/// of them that is `>= STOP_BIT`.
///
/// # Panics
/// If none of the first 5 bytes of `data` is `>= STOP_BIT` (this includes an
/// empty buffer).
fn vint_len(data: &[u8]) -> usize {
    let final_byte_pos = data.iter().take(5).position(|&byte| byte >= STOP_BIT);
    match final_byte_pos {
        Some(pos) => pos + 1,
        None => panic!("Corrupted data. Invalid VInt 32"),
    }
}
/// Decodes the vint `u32` found at the start of `data` and advances `data`
/// past the bytes that were consumed.
///
/// # Panics
///
/// If `data` does not start with a valid vint payload.
pub fn read_u32_vint(data: &mut &[u8]) -> u32 {
    // Copy the inner slice reference out so the pieces keep the buffer's lifetime.
    let bytes: &[u8] = *data;
    let (payload, remainder) = bytes.split_at(vint_len(bytes));
    // Each byte contributes its 7 low bits, least-significant group first.
    let value = payload
        .iter()
        .enumerate()
        .fold(0u32, |acc, (idx, &byte)| {
            acc | (u32::from(byte & 127u8) << (7 * idx))
        });
    *data = remainder;
    value
}
/// Serializes `val` as a vint and writes the resulting bytes to `writer`.
///
/// Any error returned by `writer.write_all` is passed through.
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
    let (packed, vint_num_bytes) = serialize_vint_u32(val);
    // Lay the packed value out in little-endian order; only its first
    // `vint_num_bytes` bytes belong to the vint.
    let mut scratch = [0u8; 8];
    LittleEndian::write_u64(&mut scratch, packed);
    let payload = &scratch[..vint_num_bytes];
    writer.write_all(payload)
}
impl VInt { impl VInt {
pub fn val(&self) -> u64 { pub fn val(&self) -> u64 {
self.0 self.0
@@ -24,7 +119,7 @@ impl VInt {
output.extend(&buffer[0..num_bytes]); output.extend(&buffer[0..num_bytes]);
} }
fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize { pub fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
let mut remaining = self.0; let mut remaining = self.0;
for (i, b) in buffer.iter_mut().enumerate() { for (i, b) in buffer.iter_mut().enumerate() {
let next_byte: u8 = (remaining % 128u64) as u8; let next_byte: u8 = (remaining % 128u64) as u8;
@@ -64,7 +159,7 @@ impl BinarySerializable for VInt {
return Err(io::Error::new( return Err(io::Error::new(
io::ErrorKind::InvalidData, io::ErrorKind::InvalidData,
"Reach end of buffer while reading VInt", "Reach end of buffer while reading VInt",
)) ));
} }
} }
} }
@@ -74,7 +169,9 @@ impl BinarySerializable for VInt {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::serialize_vint_u32;
use super::VInt; use super::VInt;
use byteorder::{ByteOrder, LittleEndian};
use common::BinarySerializable; use common::BinarySerializable;
fn aux_test_vint(val: u64) { fn aux_test_vint(val: u64) {
@@ -108,4 +205,28 @@ mod tests {
} }
aux_test_vint(10); aux_test_vint(10);
} }
/// Checks that `serialize_vint_u32` yields the same length and the same bytes
/// as `VInt::serialize_into` for `val`.
fn aux_test_serialize_vint_u32(val: u32) {
    let (packed, packed_len) = serialize_vint_u32(val);
    let mut actual = [0u8; 10];
    LittleEndian::write_u64(&mut actual, packed);

    let mut expected = [0u8; 10];
    let expected_len = VInt(u64::from(val)).serialize_into(&mut expected);

    assert_eq!(packed_len, expected_len, "len wrong for val {}", val);
    assert_eq!(
        &expected[..packed_len],
        &actual[..packed_len],
        "array wrong for {}",
        val
    );
}
/// Compares `serialize_vint_u32` against the reference `VInt` encoding on small
/// values, around every encoded-length boundary, and on `u32::max_value()`.
#[test]
fn test_vint_u32() {
    aux_test_serialize_vint_u32(0);
    aux_test_serialize_vint_u32(1);
    aux_test_serialize_vint_u32(5);
    // 128^1 .. 128^4 are the values at which the encoding grows to 2, 3, 4 and
    // 5 bytes; check just below, at, and just above each of them.
    for i in 1..5 {
        let power_of_128 = 1u32 << (7 * i);
        aux_test_serialize_vint_u32(power_of_128 - 1u32);
        aux_test_serialize_vint_u32(power_of_128);
        aux_test_serialize_vint_u32(power_of_128 + 1u32);
    }
    aux_test_serialize_vint_u32(u32::max_value());
}
} }

136
src/core/executor.rs Normal file
View File

@@ -0,0 +1,136 @@
use crossbeam::channel;
use scoped_pool::{Pool, ThreadConfig};
use Result;
/// Search executor, abstracting over whether search requests are run on a
/// single thread or dispatched to a thread pool.
///
/// We don't expose the underlying thread pool directly here for several reasons.
///
/// First, dependency hell. It is not a good idea to expose the
/// API of a dependency, knowing it might conflict with a different version
/// used by the client. Second, we may stop using that thread pool crate in the future.
pub enum Executor {
/// Tasks are run one after the other in the calling thread.
SingleThread,
/// Tasks are dispatched to the wrapped thread pool.
ThreadPool(Pool),
}
impl Executor {
/// Creates an Executor that performs all tasks in the caller thread.
pub fn single_thread() -> Executor {
Executor::SingleThread
}
/// Creates an Executor that dispatches the tasks in a thread pool.
///
/// The pool is built with `num_threads` and a thread configuration carrying
/// `prefix` (presumably used as the thread name prefix; confirm against the
/// pool crate).
pub fn multi_thread(num_threads: usize, prefix: &'static str) -> Executor {
let thread_config = ThreadConfig::new().prefix(prefix);
let pool = Pool::with_thread_config(num_threads, thread_config);
Executor::ThreadPool(pool)
}
/// Applies `f` to every item of `args` and returns the results in the same
/// order as the inputs.
///
/// With `SingleThread` the items are processed lazily in the calling thread and
/// collection stops at the first error. With `ThreadPool` one task is submitted
/// per item, and the first error found while draining the result channel is returned.
///
/// Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task
/// will propagate to the caller.
pub fn map<
A: Send,
R: Send,
AIterator: Iterator<Item = A>,
F: Sized + Sync + Fn(A) -> Result<R>,
>(
&self,
f: F,
args: AIterator,
) -> Result<Vec<R>> {
match self {
Executor::SingleThread => args.map(f).collect::<Result<_>>(),
Executor::ThreadPool(pool) => {
// Tag every argument with its position so that the input order can be restored later.
let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
let num_fruits = args_with_indices.len();
let fruit_receiver = {
let (fruit_sender, fruit_receiver) = channel::unbounded();
pool.scoped(|scope| {
for arg_with_idx in args_with_indices {
scope.execute(|| {
let (idx, arg) = arg_with_idx;
let fruit = f(arg);
if let Err(err) = fruit_sender.send((idx, fruit)) {
error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
}
});
}
});
fruit_receiver
// This ends the scope of fruit_sender.
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
// This is lame, but safe.
// Results are pushed along with their original position, then sorted by that
// position to get back to the input order.
let mut results_with_position = Vec::with_capacity(num_fruits);
for (pos, fruit_res) in fruit_receiver {
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
}
results_with_position.sort_by_key(|(pos, _)| *pos);
// Every submitted task must have reported exactly one result.
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
}
}
}
}
#[cfg(test)]
mod tests {
    use super::Executor;

    /// A panic raised by the task reaches the caller, message included, when
    /// everything runs in the calling thread.
    #[test]
    #[should_panic(expected = "panic should propagate")]
    fn test_panic_propagates_single_thread() {
        let executor = Executor::single_thread();
        let args = vec![0].into_iter();
        let _result: Vec<usize> = executor
            .map(
                |_| {
                    panic!("panic should propagate");
                },
                args,
            )
            .unwrap();
    }

    /// A panic raised in a pool thread also makes the caller panic.
    #[test]
    #[should_panic] //< unfortunately the panic message is not propagated
    fn test_panic_propagates_multi_thread() {
        let executor = Executor::multi_thread(1, "search-test");
        let args = vec![0].into_iter();
        let _result: Vec<usize> = executor
            .map(
                |_| {
                    panic!("panic should propagate");
                },
                args,
            )
            .unwrap();
    }

    /// The single thread executor maps every item and keeps the input order.
    #[test]
    fn test_map_singlethread() {
        let executor = Executor::single_thread();
        let doubled: Vec<usize> = executor.map(|i| Ok(i * 2), 0..1_000).unwrap();
        assert_eq!(doubled.len(), 1_000);
        for (i, &value) in doubled.iter().enumerate() {
            assert_eq!(value, i * 2);
        }
    }

    /// The thread pool executor maps every item and keeps the input order.
    #[test]
    fn test_map_multithread() {
        let executor = Executor::multi_thread(3, "search-test");
        let doubled: Vec<usize> = executor.map(|i| Ok(i * 2), 0..10).unwrap();
        assert_eq!(doubled.len(), 10);
        for (i, &value) in doubled.iter().enumerate() {
            assert_eq!(value, i * 2);
        }
    }
}

View File

@@ -1,31 +1,31 @@
use super::pool::LeasedItem;
use super::pool::Pool;
use super::segment::create_segment; use super::segment::create_segment;
use super::segment::Segment; use super::segment::Segment;
use core::searcher::Searcher; use core::Executor;
use core::IndexMeta; use core::IndexMeta;
use core::SegmentId; use core::SegmentId;
use core::SegmentMeta; use core::SegmentMeta;
use core::SegmentReader;
use core::META_FILEPATH; use core::META_FILEPATH;
use directory::ManagedDirectory; use directory::ManagedDirectory;
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
use directory::MmapDirectory; use directory::MmapDirectory;
use directory::INDEX_WRITER_LOCK;
use directory::{Directory, RAMDirectory}; use directory::{Directory, RAMDirectory};
use error::DataCorruption;
use error::TantivyError; use error::TantivyError;
use indexer::index_writer::open_index_writer; use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN; use indexer::index_writer::HEAP_SIZE_MIN;
use indexer::segment_updater::save_new_metas; use indexer::segment_updater::save_new_metas;
use indexer::LockType;
use num_cpus; use num_cpus;
use reader::IndexReader;
use reader::IndexReaderBuilder;
use schema::Field; use schema::Field;
use schema::FieldType; use schema::FieldType;
use schema::Schema; use schema::Schema;
use serde_json; use serde_json;
use std::borrow::BorrowMut; use std::borrow::BorrowMut;
use std::fmt; use std::fmt;
#[cfg(feature = "mmap")]
use std::path::Path; use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc; use std::sync::Arc;
use tokenizer::BoxedTokenizer; use tokenizer::BoxedTokenizer;
use tokenizer::TokenizerManager; use tokenizer::TokenizerManager;
@@ -36,19 +36,53 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?; let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data); let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string) serde_json::from_str(&meta_string)
.map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone())) .map_err(|e| {
DataCorruption::new(
META_FILEPATH.clone(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
} }
/// Search Index /// Search Index
#[derive(Clone)]
pub struct Index { pub struct Index {
directory: ManagedDirectory, directory: ManagedDirectory,
schema: Schema, schema: Schema,
num_searchers: Arc<AtomicUsize>, executor: Arc<Executor>,
searcher_pool: Arc<Pool<Searcher>>,
tokenizers: TokenizerManager, tokenizers: TokenizerManager,
} }
impl Index { impl Index {
/// Examines the directory to see if it contains an index.
///
/// Returns whether `META_FILEPATH` exists in `dir`.
pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
dir.exists(&META_FILEPATH)
}
/// Accessor to the search executor.
///
/// This executor is used by default when calling `searcher.search(...)`
/// to perform search on the individual segments.
///
/// By default the executor is single-threaded, and simply runs in the calling thread.
pub fn search_executor(&self) -> &Executor {
self.executor.as_ref()
}
/// Swaps the search executor of this index for a multi-threaded one
/// built with `num_threads` threads.
pub fn set_multithread_executor(&mut self, num_threads: usize) {
    let thread_pool = Executor::multi_thread(num_threads, "thrd-tantivy-search-");
    self.executor = Arc::new(thread_pool);
}
/// Replace the default single thread search executor pool
/// by a thread pool whose number of threads is the value
/// returned by `num_cpus::get()`.
pub fn set_default_multithread_executor(&mut self) {
    self.set_multithread_executor(num_cpus::get());
}
/// Creates a new index using the `RAMDirectory`. /// Creates a new index using the `RAMDirectory`.
/// ///
/// The index will be allocated in anonymous memory. /// The index will be allocated in anonymous memory.
@@ -65,9 +99,29 @@ impl Index {
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> { pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?; let mmap_directory = MmapDirectory::open(directory_path)?;
if Index::exists(&mmap_directory) {
return Err(TantivyError::IndexAlreadyExists);
}
Index::create(mmap_directory, schema) Index::create(mmap_directory, schema)
} }
/// Opens the index found in `dir`, or creates one if `dir` holds none.
///
/// # Errors
/// When an index already exists and its schema differs from `schema`,
/// a `TantivyError::SchemaError` is returned. Errors raised while opening
/// or creating the index are returned as is.
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
    // Nothing there yet: build a brand new index.
    if !Index::exists(&dir) {
        return Index::create(dir, schema);
    }
    let existing_index = Index::open(dir)?;
    if existing_index.schema() != schema {
        return Err(TantivyError::SchemaError(
            "An index exists but the schema does not match.".to_string(),
        ));
    }
    Ok(existing_index)
}
/// Creates a new index in a temp directory. /// Creates a new index in a temp directory.
/// ///
/// The index will use the `MMapDirectory` in a newly created directory. /// The index will use the `MMapDirectory` in a newly created directory.
@@ -84,13 +138,15 @@ impl Index {
/// Creates a new index given an implementation of the trait `Directory` /// Creates a new index given an implementation of the trait `Directory`
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> { pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
let directory = ManagedDirectory::new(dir)?; let directory = ManagedDirectory::wrap(dir)?;
Index::from_directory(directory, schema) Index::from_directory(directory, schema)
} }
/// Create a new index from a directory. /// Create a new index from a directory.
///
/// This will overwrite existing meta.json
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> { fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?; save_new_metas(schema.clone(), directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema); let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas) Index::create_from_metas(directory, &metas)
} }
@@ -98,15 +154,12 @@ impl Index {
/// Creates a new index given a directory and an `IndexMeta`. /// Creates a new index given a directory and an `IndexMeta`.
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> { fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
let schema = metas.schema.clone(); let schema = metas.schema.clone();
let n_cpus = num_cpus::get();
let index = Index { let index = Index {
directory, directory,
schema, schema,
num_searchers: Arc::new(AtomicUsize::new(n_cpus)),
searcher_pool: Arc::new(Pool::new()),
tokenizers: TokenizerManager::default(), tokenizers: TokenizerManager::default(),
executor: Arc::new(Executor::single_thread()),
}; };
index.load_searchers()?;
Ok(index) Ok(index)
} }
@@ -136,6 +189,22 @@ impl Index {
} }
} }
/// Create a default `IndexReader` for the given index.
///
/// This is a shorthand for `self.reader_builder().try_into()`.
///
/// See [`Index.reader_builder()`](#method.reader_builder).
pub fn reader(&self) -> Result<IndexReader> {
self.reader_builder().try_into()
}
/// Create an `IndexReaderBuilder` for the given index.
///
/// Most projects should create at most one reader for a given index.
/// This method is typically called only once per `Index` instance,
/// over the lifetime of most programs.
///
/// The builder is handed a clone of this `Index`.
pub fn reader_builder(&self) -> IndexReaderBuilder {
IndexReaderBuilder::new(self.clone())
}
/// Opens a new directory from an index path. /// Opens a new directory from an index path.
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> { pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
@@ -145,7 +214,7 @@ impl Index {
/// Open the index using the provided directory /// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> Result<Index> { pub fn open<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::new(directory)?; let directory = ManagedDirectory::wrap(directory)?;
let metas = load_metas(&directory)?; let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas) Index::create_from_metas(directory, &metas)
} }
@@ -171,7 +240,8 @@ impl Index {
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`. /// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
/// ///
/// # Errors /// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IOError`.
///
/// # Panics /// # Panics
/// If the heap size per thread is too small, panics. /// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads( pub fn writer_with_num_threads(
@@ -179,7 +249,21 @@ impl Index {
num_threads: usize, num_threads: usize,
overall_heap_size_in_bytes: usize, overall_heap_size_in_bytes: usize,
) -> Result<IndexWriter> { ) -> Result<IndexWriter> {
let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?; let directory_lock = self
.directory
.acquire_lock(&INDEX_WRITER_LOCK)
.map_err(|err| {
TantivyError::LockFailure(
err,
Some(
"Failed to acquire index lock. If you are using\
a regular directory, this means there is already an \
`IndexWriter` working on this `Directory`, in this process \
or in a different process."
.to_string(),
),
)
})?;
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads; let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
open_index_writer( open_index_writer(
self, self,
@@ -256,56 +340,9 @@ impl Index {
Ok(self Ok(self
.searchable_segment_metas()? .searchable_segment_metas()?
.iter() .iter()
.map(|segment_meta| segment_meta.id()) .map(SegmentMeta::id)
.collect()) .collect())
} }
/// Sets the number of searchers to use
///
/// Only works after the next call to `load_searchers`
pub fn set_num_searchers(&mut self, num_searchers: usize) {
self.num_searchers.store(num_searchers, Ordering::Release);
}
/// Update searchers so that they reflect the state of the last
/// `.commit()`.
///
/// If indexing happens in the same process as searching,
/// you most likely want to call `.load_searchers()` right after each
/// successful call to `.commit()`.
///
/// If indexing and searching happen in different processes, the way to
/// get the freshest `index` at all time, is to watch `meta.json` and
/// call `load_searchers` whenever a changes happen.
pub fn load_searchers(&self) -> Result<()> {
let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;
let searchable_segments = self.searchable_segments()?;
let segment_readers: Vec<SegmentReader> = searchable_segments
.iter()
.map(SegmentReader::open)
.collect::<Result<_>>()?;
let schema = self.schema();
let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
let searchers = (0..num_searchers)
.map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
.collect();
self.searcher_pool.publish_new_generation(searchers);
Ok(())
}
/// Returns a searcher
///
/// This method should be called every single time a search
/// query is performed.
/// The searchers are taken from a pool of `num_searchers` searchers.
/// If no searcher is available
/// this may block.
///
/// The same searcher must be used for a given query, as it ensures
/// the use of a consistent segment set.
pub fn searcher(&self) -> LeasedItem<Searcher> {
self.searcher_pool.acquire()
}
} }
impl fmt::Debug for Index { impl fmt::Debug for Index {
@@ -314,27 +351,22 @@ impl fmt::Debug for Index {
} }
} }
impl Clone for Index {
fn clone(&self) -> Index {
Index {
directory: self.directory.clone(),
schema: self.schema.clone(),
num_searchers: Arc::clone(&self.num_searchers),
searcher_pool: Arc::clone(&self.searcher_pool),
tokenizers: self.tokenizers.clone(),
}
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use schema::{SchemaBuilder, INT_INDEXED, TEXT}; use directory::RAMDirectory;
use schema::Field;
use schema::{Schema, INDEXED, TEXT};
use std::thread;
use std::time::Duration;
use Index; use Index;
use IndexReader;
use IndexWriter;
use ReloadPolicy;
#[test] #[test]
fn test_indexer_for_field() { fn test_indexer_for_field() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED); let num_likes_field = schema_builder.add_u64_field("num_likes", INDEXED);
let body_field = schema_builder.add_text_field("body", TEXT); let body_field = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -345,4 +377,164 @@ mod tests {
); );
} }
// `Index::exists` is false for a fresh `RAMDirectory` and true once
// `Index::create` has succeeded on a clone of it.
#[test]
fn test_index_exists() {
let directory = RAMDirectory::create();
assert!(!Index::exists(&directory));
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
}
// `Index::open_or_create` on a directory without an index succeeds,
// and the directory contains an index afterwards.
#[test]
fn open_or_create_should_create() {
let directory = RAMDirectory::create();
assert!(!Index::exists(&directory));
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
}
// `Index::open_or_create` succeeds on a directory that already holds an index
// created with the same schema.
#[test]
fn open_or_create_should_open() {
let directory = RAMDirectory::create();
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
}
// `Index::create` succeeds on a directory that already holds an index,
// even when called with a different (here empty) schema.
#[test]
fn create_should_wipeoff_existing() {
let directory = RAMDirectory::create();
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
assert!(Index::create(directory.clone(), Schema::builder().build()).is_ok());
}
// `Index::open_or_create` accepts an existing index when the schema matches,
// and reports a `SchemaError` when it is called with a different schema.
#[test]
fn open_or_create_exists_but_schema_does_not_match() {
let directory = RAMDirectory::create();
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
assert!(Index::exists(&directory));
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
// An empty schema differs from the one the index was created with.
let err = Index::open_or_create(directory, Schema::builder().build());
assert_eq!(
format!("{:?}", err.unwrap_err()),
"SchemaError(\"An index exists but the schema does not match.\")"
);
}
/// Builds the minimal schema shared by the tests of this module:
/// a single indexed `u64` field named `"num_likes"`.
fn throw_away_schema() -> Schema {
    let mut builder = Schema::builder();
    // The returned field handle is not needed here; tests look the field up by name.
    let _ = builder.add_u64_field("num_likes", INDEXED);
    builder.build()
}
// Exercises a reader built with `ReloadPolicy::OnCommit` on an in-RAM index:
// the reader starts with 0 documents, then the shared helper checks that
// committed documents become visible through it.
#[test]
fn test_index_on_commit_reload_policy() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
// Tests that are only compiled when the `mmap` feature is enabled.
#[cfg(feature = "mmap")]
mod mmap_specific {
use super::*;
use std::path::PathBuf;
use tempdir::TempDir;
// `ReloadPolicy::OnCommit` with an index created in a temporary directory;
// the reader and the writer are obtained from the same `Index`.
#[test]
fn test_index_on_commit_reload_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// An initial (empty) commit is made before the reader is built.
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
// `ReloadPolicy::Manual`: a committed document must stay invisible to the
// reader until `reader.reload()` is called explicitly.
#[test]
fn test_index_manual_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_from_tempdir(schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
// Wait half a second so that an unexpected automatic reload would have a chance to show up.
thread::sleep(Duration::from_millis(500));
assert_eq!(reader.searcher().num_docs(), 0);
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 1);
}
// `ReloadPolicy::OnCommit` where the writer and the reader come from two
// distinct `Index` instances opened on the same directory path.
#[test]
fn test_index_on_commit_reload_policy_different_directories() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
let reader = read_index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
}
// Shared body of the `OnCommit` reload tests.
//
// Adds and commits one document at a time through `writer`, then polls
// `reader` until the new document count is visible. `reader` is expected
// to see 0 documents on entry.
fn test_index_on_commit_reload_policy_aux(
field: Field,
writer: &mut IndexWriter,
reader: &IndexReader,
) {
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
// Poll for the first document: up to 100 attempts, sleeping 100ms after each miss.
let mut count = 0;
for _ in 0..100 {
count = reader.searcher().num_docs();
if count > 0 {
break;
}
thread::sleep(Duration::from_millis(100));
}
assert_eq!(count, 1);
writer.add_document(doc!(field=>2u64));
writer.commit().unwrap();
// Poll for the second document: only up to 10 attempts this time,
// still sleeping 100ms after each miss.
let mut count = 0;
for _ in 0..10 {
count = reader.searcher().num_docs();
if count > 1 {
break;
}
thread::sleep(Duration::from_millis(100));
}
assert_eq!(count, 2);
}
} }

View File

@@ -2,6 +2,7 @@ use core::SegmentMeta;
use schema::Schema; use schema::Schema;
use serde_json; use serde_json;
use std::fmt; use std::fmt;
use Opstamp;
/// Meta information about the `Index`. /// Meta information about the `Index`.
/// ///
@@ -15,7 +16,7 @@ use std::fmt;
pub struct IndexMeta { pub struct IndexMeta {
pub segments: Vec<SegmentMeta>, pub segments: Vec<SegmentMeta>,
pub schema: Schema, pub schema: Schema,
pub opstamp: u64, pub opstamp: Opstamp,
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub payload: Option<String>, pub payload: Option<String>,
} }
@@ -46,13 +47,13 @@ impl fmt::Debug for IndexMeta {
mod tests { mod tests {
use super::IndexMeta; use super::IndexMeta;
use schema::{SchemaBuilder, TEXT}; use schema::{Schema, TEXT};
use serde_json; use serde_json;
#[test] #[test]
fn test_serialize_metas() { fn test_serialize_metas() {
let schema = { let schema = {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
schema_builder.add_text_field("text", TEXT); schema_builder.add_text_field("text", TEXT);
schema_builder.build() schema_builder.build()
}; };

View File

@@ -32,10 +32,7 @@ pub struct InvertedIndexReader {
} }
impl InvertedIndexReader { impl InvertedIndexReader {
#[cfg_attr( #[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symetry
feature = "cargo-clippy",
allow(clippy::needless_pass_by_value)
)] // for symetry
pub(crate) fn new( pub(crate) fn new(
termdict: TermDictionary, termdict: TermDictionary,
postings_source: ReadOnlySource, postings_source: ReadOnlySource,

View File

@@ -1,7 +1,7 @@
mod executor;
pub mod index; pub mod index;
mod index_meta; mod index_meta;
mod inverted_index_reader; mod inverted_index_reader;
mod pool;
pub mod searcher; pub mod searcher;
mod segment; mod segment;
mod segment_component; mod segment_component;
@@ -9,6 +9,7 @@ mod segment_id;
mod segment_meta; mod segment_meta;
mod segment_reader; mod segment_reader;
pub use self::executor::Executor;
pub use self::index::Index; pub use self::index::Index;
pub use self::index_meta::IndexMeta; pub use self::index_meta::IndexMeta;
pub use self::inverted_index_reader::InvertedIndexReader; pub use self::inverted_index_reader::InvertedIndexReader;
@@ -23,6 +24,7 @@ pub use self::segment_reader::SegmentReader;
use std::path::PathBuf; use std::path::PathBuf;
lazy_static! { lazy_static! {
/// The meta file contains all the information about the list of segments and the schema /// The meta file contains all the information about the list of segments and the schema
/// of the index. /// of the index.
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json"); pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");

View File

@@ -1,18 +1,43 @@
use collector::Collector; use collector::Collector;
use collector::SegmentCollector;
use core::Executor;
use core::InvertedIndexReader; use core::InvertedIndexReader;
use core::SegmentReader; use core::SegmentReader;
use query::Query; use query::Query;
use query::Scorer;
use query::Weight;
use schema::Document; use schema::Document;
use schema::Schema; use schema::Schema;
use schema::{Field, Term}; use schema::{Field, Term};
use space_usage::SearcherSpaceUsage; use space_usage::SearcherSpaceUsage;
use std::fmt; use std::fmt;
use std::sync::Arc; use std::sync::Arc;
use store::StoreReader;
use termdict::TermMerger; use termdict::TermMerger;
use DocAddress; use DocAddress;
use Index; use Index;
use Result; use Result;
/// Collects the documents of a single segment matched by `weight`.
///
/// A scorer is created from `weight` for `segment_reader`, and every
/// `(doc, score)` pair it yields is pushed into the segment collector
/// obtained from `collector`. When the segment has a delete bitset,
/// documents flagged as deleted are not collected.
///
/// Returns the fruit harvested from the segment collector, or the error
/// raised while creating the scorer or the segment collector.
fn collect_segment<C: Collector>(
    collector: &C,
    weight: &Weight,
    segment_ord: u32,
    segment_reader: &SegmentReader,
) -> Result<C::Fruit> {
    let mut scorer = weight.scorer(segment_reader)?;
    let mut segment_collector = collector.for_segment(segment_ord, segment_reader)?;
    match segment_reader.delete_bitset() {
        Some(delete_bitset) => {
            // Skip the documents that have been deleted from this segment.
            scorer.for_each(&mut |doc, score| {
                if !delete_bitset.is_deleted(doc) {
                    segment_collector.collect(doc, score);
                }
            });
        }
        None => {
            scorer.for_each(&mut |doc, score| segment_collector.collect(doc, score));
        }
    }
    Ok(segment_collector.harvest())
}
/// Holds a list of `SegmentReader`s ready for search. /// Holds a list of `SegmentReader`s ready for search.
/// ///
/// It guarantees that the `Segment` will not be removed before /// It guarantees that the `Segment` will not be removed before
@@ -22,6 +47,7 @@ pub struct Searcher {
schema: Schema, schema: Schema,
index: Index, index: Index,
segment_readers: Vec<SegmentReader>, segment_readers: Vec<SegmentReader>,
store_readers: Vec<StoreReader>,
} }
impl Searcher { impl Searcher {
@@ -31,10 +57,15 @@ impl Searcher {
index: Index, index: Index,
segment_readers: Vec<SegmentReader>, segment_readers: Vec<SegmentReader>,
) -> Searcher { ) -> Searcher {
let store_readers = segment_readers
.iter()
.map(SegmentReader::get_store_reader)
.collect();
Searcher { Searcher {
schema, schema,
index, index,
segment_readers, segment_readers,
store_readers,
} }
} }
@@ -49,8 +80,8 @@ impl Searcher {
/// the request to the right `Segment`. /// the request to the right `Segment`.
pub fn doc(&self, doc_address: DocAddress) -> Result<Document> { pub fn doc(&self, doc_address: DocAddress) -> Result<Document> {
let DocAddress(segment_local_id, doc_id) = doc_address; let DocAddress(segment_local_id, doc_id) = doc_address;
let segment_reader = &self.segment_readers[segment_local_id as usize]; let store_reader = &self.store_readers[segment_local_id as usize];
segment_reader.doc(doc_id) store_reader.get(doc_id)
} }
/// Access the schema associated to the index of this searcher. /// Access the schema associated to the index of this searcher.
@@ -73,7 +104,8 @@ impl Searcher {
.iter() .iter()
.map(|segment_reader| { .map(|segment_reader| {
u64::from(segment_reader.inverted_index(term.field()).doc_freq(term)) u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
}).sum::<u64>() })
.sum::<u64>()
} }
/// Return the list of segment readers /// Return the list of segment readers
@@ -86,9 +118,58 @@ impl Searcher {
&self.segment_readers[segment_ord as usize] &self.segment_readers[segment_ord as usize]
} }
/// Runs a query on the segment readers wrapped by the searcher /// Runs a query on the segment readers wrapped by the searcher.
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> { ///
query.search(self, collector) /// Search works as follows :
///
/// First the weight object associated to the query is created.
///
/// Then, the query loops over the segments and for each segment :
/// - setup the collector and informs it that the segment being processed has changed.
/// - creates a SegmentCollector for collecting documents associated to the segment
/// - creates a `Scorer` object associated for this segment
/// - iterate through the matched documents and push them to the segment collector.
///
/// Finally, the Collector merges each of the child collectors into itself for result usability
/// by the caller.
pub fn search<C: Collector>(&self, query: &Query, collector: &C) -> Result<C::Fruit> {
    // Delegate to the executor configured on the index this searcher belongs to.
    self.search_with_executor(query, collector, self.index.search_executor())
}
/// Same as [`search(...)`](#method.search) but multithreaded.
///
/// The current implementation is rather naive :
/// multithreading is done by splitting the search into as many tasks
/// as there are segments.
///
/// It is powerless at making search faster if your index consists in
/// one large segment.
///
/// Also, keep in mind that multithreading a single query on several
/// threads will not improve your throughput. It can actually
/// hurt it. It will however, decrease the average response time.
pub fn search_with_executor<C: Collector>(
&self,
query: &Query,
collector: &C,
executor: &Executor,
) -> Result<C::Fruit> {
let scoring_enabled = collector.requires_scoring();
let weight = query.weight(self, scoring_enabled)?;
let segment_readers = self.segment_readers();
let fruits = executor.map(
|(segment_ord, segment_reader)| {
collect_segment(
collector,
weight.as_ref(),
segment_ord as u32,
segment_reader,
)
},
segment_readers.iter().enumerate(),
)?;
collector.merge_fruits(fruits)
} }
/// Return the field searcher associated to a `Field`. /// Return the field searcher associated to a `Field`.
@@ -137,7 +218,7 @@ impl fmt::Debug for Searcher {
let segment_ids = self let segment_ids = self
.segment_readers .segment_readers
.iter() .iter()
.map(|segment_reader| segment_reader.segment_id()) .map(SegmentReader::segment_id)
.collect::<Vec<_>>(); .collect::<Vec<_>>();
write!(f, "Searcher({:?})", segment_ids) write!(f, "Searcher({:?})", segment_ids)
} }

View File

@@ -10,6 +10,7 @@ use schema::Schema;
use std::fmt; use std::fmt;
use std::path::PathBuf; use std::path::PathBuf;
use std::result; use std::result;
use Opstamp;
use Result; use Result;
/// A segment is a piece of the index. /// A segment is a piece of the index.
@@ -50,7 +51,7 @@ impl Segment {
} }
#[doc(hidden)] #[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> Segment { pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
Segment { Segment {
index: self.index, index: self.index,
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp), meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),

View File

@@ -41,6 +41,6 @@ impl SegmentComponent {
SegmentComponent::STORE, SegmentComponent::STORE,
SegmentComponent::DELETE, SegmentComponent::DELETE,
]; ];
SEGMENT_COMPONENTS.into_iter() SEGMENT_COMPONENTS.iter()
} }
} }

View File

@@ -19,7 +19,7 @@ pub struct SegmentId(Uuid);
#[cfg(test)] #[cfg(test)]
lazy_static! { lazy_static! {
static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default(); static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default();
static ref EMPTY_ARR: [u8; 8] = [0u8; 8]; static ref ZERO_ARRAY: [u8; 8] = [0u8; 8];
} }
// During tests, we generate the segment id in a autoincrement manner // During tests, we generate the segment id in a autoincrement manner
@@ -30,7 +30,7 @@ lazy_static! {
#[cfg(test)] #[cfg(test)]
fn create_uuid() -> Uuid { fn create_uuid() -> Uuid {
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst); let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*EMPTY_ARR).unwrap() Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*ZERO_ARRAY).unwrap()
} }
#[cfg(not(test))] #[cfg(not(test))]

View File

@@ -5,6 +5,7 @@ use serde;
use std::collections::HashSet; use std::collections::HashSet;
use std::fmt; use std::fmt;
use std::path::PathBuf; use std::path::PathBuf;
use Opstamp;
lazy_static! { lazy_static! {
static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() }; static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() };
@@ -13,7 +14,7 @@ lazy_static! {
#[derive(Clone, Debug, Serialize, Deserialize)] #[derive(Clone, Debug, Serialize, Deserialize)]
struct DeleteMeta { struct DeleteMeta {
num_deleted_docs: u32, num_deleted_docs: u32,
opstamp: u64, opstamp: Opstamp,
} }
/// `SegmentMeta` contains simple meta information about a segment. /// `SegmentMeta` contains simple meta information about a segment.
@@ -136,9 +137,9 @@ impl SegmentMeta {
self.max_doc() - self.num_deleted_docs() self.max_doc() - self.num_deleted_docs()
} }
/// Returns the opstamp of the last delete operation /// Returns the `Opstamp` of the last delete operation
/// taken in account in this segment. /// taken in account in this segment.
pub fn delete_opstamp(&self) -> Option<u64> { pub fn delete_opstamp(&self) -> Option<Opstamp> {
self.tracked self.tracked
.deletes .deletes
.as_ref() .as_ref()
@@ -152,7 +153,7 @@ impl SegmentMeta {
} }
#[doc(hidden)] #[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> SegmentMeta { pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
let delete_meta = DeleteMeta { let delete_meta = DeleteMeta {
num_deleted_docs, num_deleted_docs,
opstamp, opstamp,

View File

@@ -4,15 +4,11 @@ use core::InvertedIndexReader;
use core::Segment; use core::Segment;
use core::SegmentComponent; use core::SegmentComponent;
use core::SegmentId; use core::SegmentId;
use error::TantivyError; use directory::ReadOnlySource;
use fastfield::DeleteBitSet; use fastfield::DeleteBitSet;
use fastfield::FacetReader; use fastfield::FacetReader;
use fastfield::FastFieldReader; use fastfield::FastFieldReaders;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
use fieldnorm::FieldNormReader; use fieldnorm::FieldNormReader;
use schema::Cardinality;
use schema::Document;
use schema::Field; use schema::Field;
use schema::FieldType; use schema::FieldType;
use schema::Schema; use schema::Schema;
@@ -51,10 +47,10 @@ pub struct SegmentReader {
postings_composite: CompositeFile, postings_composite: CompositeFile,
positions_composite: CompositeFile, positions_composite: CompositeFile,
positions_idx_composite: CompositeFile, positions_idx_composite: CompositeFile,
fast_fields_composite: CompositeFile, fast_fields_readers: Arc<FastFieldReaders>,
fieldnorms_composite: CompositeFile, fieldnorms_composite: CompositeFile,
store_reader: StoreReader, store_source: ReadOnlySource,
delete_bitset_opt: Option<DeleteBitSet>, delete_bitset_opt: Option<DeleteBitSet>,
schema: Schema, schema: Schema,
} }
@@ -105,100 +101,27 @@ impl SegmentReader {
/// ///
/// # Panics /// # Panics
/// May panic if the index is corrupted. /// May panic if the index is corrupted.
pub fn fast_field_reader<Item: FastValue>( pub fn fast_fields(&self) -> &FastFieldReaders {
&self, &self.fast_fields_readers
field: Field,
) -> fastfield::Result<FastFieldReader<Item>> {
let field_entry = self.schema.get_field_entry(field);
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
{
self.fast_fields_composite
.open_read(field)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)
} else {
Err(FastFieldNotAvailableError::new(field_entry))
}
}
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
&self,
field: Field,
idx: usize,
) -> fastfield::Result<FastFieldReader<Item>> {
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
Ok(FastFieldReader::open(ff_source))
} else {
let field_entry = self.schema.get_field_entry(field);
Err(FastFieldNotAvailableError::new(field_entry))
}
}
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
/// May panick if the field is not a multivalued fastfield of the type `Item`.
pub fn multi_fast_field_reader<Item: FastValue>(
&self,
field: Field,
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
let field_entry = self.schema.get_field_entry(field);
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
{
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
} else {
Err(FastFieldNotAvailableError::new(field_entry))
}
}
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
let field_entry = self.schema.get_field_entry(field);
match *field_entry.field_type() {
FieldType::Bytes => {}
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
}
let idx_reader = self
.fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let values = self
.fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
Ok(BytesFastFieldReader::open(idx_reader, values))
} }
/// Accessor to the `FacetReader` associated to a given `Field`. /// Accessor to the `FacetReader` associated to a given `Field`.
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> { pub fn facet_reader(&self, field: Field) -> Option<FacetReader> {
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
if field_entry.field_type() != &FieldType::HierarchicalFacet { if field_entry.field_type() != &FieldType::HierarchicalFacet {
return Err(TantivyError::InvalidArgument(format!( return None;
"The field {:?} is not a \
hierarchical facet.",
field_entry
)));
} }
let term_ords_reader = self.multi_fast_field_reader(field)?; let term_ords_reader = self.fast_fields().u64s(field)?;
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| { let termdict_source = self.termdict_composite.open_read(field)?;
TantivyError::InvalidArgument(format!(
"The field \"{}\" is a hierarchical \
but this segment does not seem to have the field term \
dictionary.",
field_entry.name()
))
})?;
let termdict = TermDictionary::from_source(&termdict_source); let termdict = TermDictionary::from_source(&termdict_source);
let facet_reader = FacetReader::new(term_ords_reader, termdict); let facet_reader = FacetReader::new(term_ords_reader, termdict);
Ok(facet_reader) Some(facet_reader)
} }
/// Accessor to the segment's `Field norms`'s reader. /// Accessor to the segment's `Field norms`'s reader.
/// ///
/// Field norms are the length (in tokens) of the fields. /// Field norms are the length (in tokens) of the fields.
/// It is used in the computation of the [TfIdf] /// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
/// (https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
/// ///
/// They are simply stored as a fast field, serialized in /// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment. /// the `.fieldnorm` file of the segment.
@@ -216,8 +139,8 @@ impl SegmentReader {
} }
/// Accessor to the segment's `StoreReader`. /// Accessor to the segment's `StoreReader`.
pub fn get_store_reader(&self) -> &StoreReader { pub fn get_store_reader(&self) -> StoreReader {
&self.store_reader StoreReader::from_source(self.store_source.clone())
} }
/// Open a new segment for reading. /// Open a new segment for reading.
@@ -226,7 +149,6 @@ impl SegmentReader {
let termdict_composite = CompositeFile::open(&termdict_source)?; let termdict_composite = CompositeFile::open(&termdict_source)?;
let store_source = segment.open_read(SegmentComponent::STORE)?; let store_source = segment.open_read(SegmentComponent::STORE)?;
let store_reader = StoreReader::from_source(store_source);
fail_point!("SegmentReader::open#middle"); fail_point!("SegmentReader::open#middle");
@@ -249,8 +171,12 @@ impl SegmentReader {
} }
}; };
let schema = segment.schema();
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?; let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?; let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_field_readers =
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?; let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?; let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
@@ -262,17 +188,16 @@ impl SegmentReader {
None None
}; };
let schema = segment.schema();
Ok(SegmentReader { Ok(SegmentReader {
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())), inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
max_doc: segment.meta().max_doc(), max_doc: segment.meta().max_doc(),
num_docs: segment.meta().num_docs(), num_docs: segment.meta().num_docs(),
termdict_composite, termdict_composite,
postings_composite, postings_composite,
fast_fields_composite, fast_fields_readers: fast_field_readers,
fieldnorms_composite, fieldnorms_composite,
segment_id: segment.id(), segment_id: segment.id(),
store_reader, store_source,
delete_bitset_opt, delete_bitset_opt,
positions_composite, positions_composite,
positions_idx_composite, positions_idx_composite,
@@ -351,14 +276,6 @@ impl SegmentReader {
inv_idx_reader inv_idx_reader
} }
/// Returns the document (or to be accurate, its stored field)
/// bearing the given doc id.
/// This method is slow and should seldom be called from
/// within a collector.
pub fn doc(&self, doc_id: DocId) -> Result<Document> {
self.store_reader.get(doc_id)
}
/// Returns the segment id /// Returns the segment id
pub fn segment_id(&self) -> SegmentId { pub fn segment_id(&self) -> SegmentId {
self.segment_id self.segment_id
@@ -391,10 +308,13 @@ impl SegmentReader {
self.postings_composite.space_usage(), self.postings_composite.space_usage(),
self.positions_composite.space_usage(), self.positions_composite.space_usage(),
self.positions_idx_composite.space_usage(), self.positions_idx_composite.space_usage(),
self.fast_fields_composite.space_usage(), self.fast_fields_readers.space_usage(),
self.fieldnorms_composite.space_usage(), self.fieldnorms_composite.space_usage(),
self.store_reader.space_usage(), self.get_store_reader().space_usage(),
self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0), self.delete_bitset_opt
.as_ref()
.map(DeleteBitSet::space_usage)
.unwrap_or(0),
) )
} }
} }
@@ -454,12 +374,12 @@ impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use core::Index; use core::Index;
use schema::{SchemaBuilder, Term, STORED, TEXT}; use schema::{Schema, Term, STORED, TEXT};
use DocId; use DocId;
#[test] #[test]
fn test_alive_docs_iterator() { fn test_alive_docs_iterator() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
schema_builder.add_text_field("name", TEXT | STORED); schema_builder.add_text_field("name", TEXT | STORED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
@@ -484,9 +404,7 @@ mod test {
// ok, now we should have a deleted doc // ok, now we should have a deleted doc
index_writer2.commit().unwrap(); index_writer2.commit().unwrap();
} }
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect(); let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect();
assert_eq!(vec![0u32, 2u32], docs); assert_eq!(vec![0u32, 2u32], docs);
} }

View File

@@ -1,11 +1,104 @@
use directory::directory_lock::Lock;
use directory::error::LockError;
use directory::error::{DeleteError, OpenReadError, OpenWriteError}; use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use directory::WatchCallback;
use directory::WatchHandle;
use directory::{ReadOnlySource, WritePtr}; use directory::{ReadOnlySource, WritePtr};
use std::fmt; use std::fmt;
use std::io; use std::io;
use std::io::Write;
use std::marker::Send; use std::marker::Send;
use std::marker::Sync; use std::marker::Sync;
use std::path::Path; use std::path::Path;
use std::path::PathBuf;
use std::result; use std::result;
use std::thread;
use std::time::Duration;
/// Retry policy used when a lock cannot be acquired right away.
///
/// The logic is deliberately simple: we retry at most `num_retries` times,
/// sleeping `wait_in_ms` milliseconds before each new attempt. Both values
/// depend on the type of lock.
struct RetryPolicy {
    /// Number of retries still allowed.
    num_retries: usize,
    /// Pause before each retry, in milliseconds.
    wait_in_ms: u64,
}

impl RetryPolicy {
    /// Policy that gives up after the very first failed attempt.
    fn no_retry() -> RetryPolicy {
        RetryPolicy {
            num_retries: 0,
            wait_in_ms: 0,
        }
    }

    /// Consumes one retry.
    ///
    /// Returns `false` immediately if no retry is left. Otherwise
    /// decrements the retry counter, sleeps for `wait_in_ms` milliseconds
    /// and returns `true`.
    fn wait_and_retry(&mut self) -> bool {
        if self.num_retries == 0 {
            return false;
        }
        self.num_retries -= 1;
        thread::sleep(Duration::from_millis(self.wait_in_ms));
        true
    }
}
/// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](struct.LockType.html)
/// NOTE(review): this file imports `directory::directory_lock::Lock`;
/// the `LockType` link above may be stale — confirm.
///
/// It is transparently associated to a lock file, that gets deleted
/// on `Drop`. The lock is released automatically on `Drop`.
///
/// Internally it only owns a boxed guard object: releasing the lock is
/// whatever happens when that guard is dropped.
pub struct DirectoryLock(Box<Drop + Send + Sync + 'static>);
/// Guard backing a `DirectoryLock` created by `try_acquire_lock`.
///
/// Its `Drop` implementation deletes `path` from `directory`.
struct DirectoryLockGuard {
// Directory in which the lock file was created.
directory: Box<Directory>,
// Path of the lock file to delete when the guard is dropped.
path: PathBuf,
}
/// Any boxed guard that is `Drop + Send + Sync + 'static` can be turned
/// into a `DirectoryLock`; the lock simply takes ownership of the box.
impl<T> From<Box<T>> for DirectoryLock
where
    T: Drop + Send + Sync + 'static,
{
    fn from(underlying: Box<T>) -> Self {
        DirectoryLock(underlying)
    }
}
/// Dropping the guard deletes the lock file.
impl Drop for DirectoryLockGuard {
    fn drop(&mut self) {
        let deletion = self.directory.delete(self.path.as_path());
        // `drop` cannot return an error: a failed deletion is only logged.
        if let Err(e) = deletion {
            error!("Failed to remove the lock file. {:?}", e);
        }
    }
}
/// Outcome of a single failed attempt at acquiring a lock
/// (see `try_acquire_lock`).
enum TryAcquireLockError {
/// The lock file already exists
/// (mapped from `OpenWriteError::FileAlreadyExists`).
FileExists,
/// Creating or flushing the lock file failed with an `io::Error`.
IOError(io::Error),
}
/// Makes a single attempt at acquiring the lock materialized by `filepath`.
///
/// The lock is taken by creating the file with `open_write`:
/// - if the file already exists, `TryAcquireLockError::FileExists` is returned;
/// - any other failure (on creation or on flush) is returned as
///   `TryAcquireLockError::IOError`.
///
/// On success, the returned `DirectoryLock` owns a guard that deletes the
/// file from a clone of `directory` when dropped.
fn try_acquire_lock(
    filepath: &Path,
    directory: &mut Directory,
) -> Result<DirectoryLock, TryAcquireLockError> {
    let mut write = match directory.open_write(filepath) {
        Ok(write) => write,
        Err(OpenWriteError::FileAlreadyExists(_)) => {
            return Err(TryAcquireLockError::FileExists);
        }
        Err(OpenWriteError::IOError(io_error)) => {
            return Err(TryAcquireLockError::IOError(io_error.into()));
        }
    };
    if let Err(io_error) = write.flush() {
        return Err(TryAcquireLockError::IOError(io_error));
    }
    let guard = DirectoryLockGuard {
        directory: directory.box_clone(),
        path: filepath.to_owned(),
    };
    Ok(DirectoryLock::from(Box::new(guard)))
}
/// Returns the retry policy matching the blocking behavior of a lock.
///
/// Non-blocking locks never retry. Blocking locks retry up to 100 times,
/// waiting 100 milliseconds before each retry.
fn retry_policy(is_blocking: bool) -> RetryPolicy {
    if !is_blocking {
        return RetryPolicy::no_retry();
    }
    RetryPolicy {
        num_retries: 100,
        wait_in_ms: 100,
    }
}
/// Write-once read many (WORM) abstraction for where /// Write-once read many (WORM) abstraction for where
/// tantivy's data should be stored. /// tantivy's data should be stored.
@@ -73,6 +166,45 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// ///
/// The file may or may not previously exist. /// The file may or may not previously exist.
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>; fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
/// Acquires the given `lock` in this directory.
///
/// Whether the call waits for a busy lock depends on `lock.is_blocking`,
/// which selects the retry policy:
/// - an I/O failure is returned right away as `LockError::IOError`;
/// - if the lock file already exists, we wait and retry for as long as
///   the retry policy allows it, then give up with `LockError::LockBusy`.
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
    let mut directory_clone = self.box_clone();
    let mut policy = retry_policy(lock.is_blocking);
    loop {
        let attempt = try_acquire_lock(&lock.filepath, &mut *directory_clone);
        match attempt {
            Ok(directory_lock) => return Ok(directory_lock),
            Err(TryAcquireLockError::IOError(io_error)) => {
                return Err(LockError::IOError(io_error));
            }
            Err(TryAcquireLockError::FileExists) => {
                // Someone else holds the lock: sleep and go for another
                // round, unless we ran out of retries.
                let may_retry = policy.wait_and_retry();
                if !may_retry {
                    return Err(LockError::LockBusy);
                }
            }
        }
    }
}
/// Registers a callback that will be called whenever a change to the `meta.json` file
/// made through the `atomic_write` API is detected.
///
/// The behavior when using `.watch()` on a file written using `.open_write(...)` is,
/// on the other hand, undefined.
///
/// The file will be watched for the lifetime of the returned `WatchHandle`. The caller is
/// required to keep it.
/// It does not override previous callbacks. When the file is modified, all callbacks that
/// are registered (and whose `WatchHandle` is still alive) are triggered.
///
/// Internally, tantivy only uses this API to detect new commits to implement the
/// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
/// `OnCommit` `ReloadPolicy` from working properly.
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle;
} }
/// DirectoryClone /// DirectoryClone

View File

@@ -0,0 +1,56 @@
use std::path::PathBuf;
/// A directory lock.
///
/// A lock is associated to a specific path and to a blocking
/// behavior (see the `is_blocking` field).
/// Tantivy itself uses only two locks but client applications
/// can use the directory facility to define their own locks.
/// - [INDEX_WRITER_LOCK](./struct.INDEX_WRITER_LOCK.html)
/// - [META_LOCK](./struct.META_LOCK.html)
///
/// Check out the documentation of these locks for more information.
///
#[derive(Debug)]
pub struct Lock {
/// The lock needs to be associated with its own file `path`.
/// Depending on the platform, the lock might rely on the creation
/// and deletion of this filepath.
pub filepath: PathBuf,
/// `is_blocking` describes whether acquiring the lock is meant
/// to be a blocking operation or a non-blocking one.
///
/// Acquiring a blocking lock blocks until the lock is
/// available.
/// Acquiring a non-blocking lock returns rapidly, either successfully
/// or with an error signifying that someone is already holding
/// the lock.
pub is_blocking: bool,
}
lazy_static! {
/// Only one process should be able to write tantivy's index at a time.
/// This lock file, when present, is in charge of preventing other processes
/// from opening an IndexWriter.
///
/// If the process is killed and this file remains, it is safe to remove it manually.
///
/// Failing to acquire this lock usually means a misuse of tantivy's API,
/// (creating more than one instance of the `IndexWriter`), or a spurious
/// lock file remaining after a crash. In the latter case, removing the file after
/// checking that no process using tantivy is running is safe.
///
/// This lock is non-blocking.
pub static ref INDEX_WRITER_LOCK: Lock = Lock {
filepath: PathBuf::from(".tantivy-writer.lock"),
is_blocking: false
};
/// The meta lock file is here to protect the segment files being opened by
/// `IndexReader::reload()` from being garbage collected.
/// It makes it possible for another process to safely consume
/// our index in-writing. Ideally, we may have preferred `RWLock` semantics
/// here, but it is difficult to achieve on Windows.
///
/// Opening segment readers is a very fast process.
///
/// This lock is blocking.
pub static ref META_LOCK: Lock = Lock {
filepath: PathBuf::from(".tantivy-meta.lock"),
is_blocking: true
};
}

View File

@@ -3,6 +3,22 @@ use std::fmt;
use std::io; use std::io;
use std::path::PathBuf; use std::path::PathBuf;
/// Error while trying to acquire a directory lock.
#[derive(Debug, Fail)]
pub enum LockError {
/// Failed to acquire a lock as it is already held by another
/// client.
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
#[fail(
display = "Could not acquire lock as it is already held, possibly by a different process."
)]
LockBusy,
/// Trying to acquire a lock failed with an `io::Error`.
#[fail(display = "Failed to acquire the lock due to an io:Error.")]
IOError(io::Error),
}
/// General IO error with an optional path to the offending file. /// General IO error with an optional path to the offending file.
#[derive(Debug)] #[derive(Debug)]
pub struct IOError { pub struct IOError {
@@ -10,6 +26,12 @@ pub struct IOError {
err: io::Error, err: io::Error,
} }
impl Into<io::Error> for IOError {
fn into(self) -> io::Error {
self.err
}
}
impl fmt::Display for IOError { impl fmt::Display for IOError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self.path { match self.path {
@@ -51,6 +73,14 @@ pub enum OpenDirectoryError {
DoesNotExist(PathBuf), DoesNotExist(PathBuf),
/// The path exists but is not a directory. /// The path exists but is not a directory.
NotADirectory(PathBuf), NotADirectory(PathBuf),
/// IoError
IoError(io::Error),
}
/// Wraps an `io::Error` into the `OpenDirectoryError::IoError` variant.
impl From<io::Error> for OpenDirectoryError {
fn from(io_err: io::Error) -> Self {
OpenDirectoryError::IoError(io_err)
}
} }
impl fmt::Display for OpenDirectoryError { impl fmt::Display for OpenDirectoryError {
@@ -62,6 +92,11 @@ impl fmt::Display for OpenDirectoryError {
OpenDirectoryError::NotADirectory(ref path) => { OpenDirectoryError::NotADirectory(ref path) => {
write!(f, "the path '{:?}' exists but is not a directory", path) write!(f, "the path '{:?}' exists but is not a directory", path)
} }
OpenDirectoryError::IoError(ref err) => write!(
f,
"IOError while trying to open/create the directory. {:?}",
err
),
} }
} }
} }

View File

@@ -1,8 +1,11 @@
use core::MANAGED_FILEPATH; use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError}; use directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
use directory::DirectoryLock;
use directory::Lock;
use directory::META_LOCK;
use directory::{ReadOnlySource, WritePtr}; use directory::{ReadOnlySource, WritePtr};
use error::TantivyError; use directory::{WatchCallback, WatchHandle};
use indexer::LockType; use error::DataCorruption;
use serde_json; use serde_json;
use std::collections::HashSet; use std::collections::HashSet;
use std::io; use std::io;
@@ -59,12 +62,17 @@ fn save_managed_paths(
impl ManagedDirectory { impl ManagedDirectory {
/// Wraps a directory as managed directory. /// Wraps a directory as managed directory.
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> { pub fn wrap<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) { match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => { Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data); let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json) let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?; .map_err(|e| {
DataCorruption::new(
MANAGED_FILEPATH.clone(),
format!("Managed file cannot be deserialized: {:?}. ", e),
)
})?;
Ok(ManagedDirectory { Ok(ManagedDirectory {
directory: Box::new(directory), directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation { meta_informations: Arc::new(RwLock::new(MetaInformation {
@@ -87,6 +95,9 @@ impl ManagedDirectory {
/// ///
/// * `living_files` - List of files that are still used by the index. /// * `living_files` - List of files that are still used by the index.
/// ///
/// The use of a callback ensures that the list of living_files is computed
/// while we hold the lock on meta.
///
/// This method does not panick nor returns errors. /// This method does not panick nor returns errors.
/// If a file cannot be deleted (for permission reasons for instance) /// If a file cannot be deleted (for permission reasons for instance)
/// an error is simply logged, and the file remains in the list of managed /// an error is simply logged, and the file remains in the list of managed
@@ -117,7 +128,7 @@ impl ManagedDirectory {
// 2) writer change meta.json (for instance after a merge or a commit) // 2) writer change meta.json (for instance after a merge or a commit)
// 3) gc kicks in. // 3) gc kicks in.
// 4) gc removes a file that was useful for process B, before process B opened it. // 4) gc removes a file that was useful for process B, before process B opened it.
if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) { if let Ok(_meta_lock) = self.acquire_lock(&META_LOCK) {
let living_files = get_living_files(); let living_files = get_living_files();
for managed_path in &meta_informations_rlock.managed_paths { for managed_path in &meta_informations_rlock.managed_paths {
if !living_files.contains(managed_path) { if !living_files.contains(managed_path) {
@@ -227,6 +238,14 @@ impl Directory for ManagedDirectory {
fn exists(&self, path: &Path) -> bool { fn exists(&self, path: &Path) -> bool {
self.directory.exists(path) self.directory.exists(path)
} }
// Locking is entirely delegated to the wrapped directory.
fn acquire_lock(&self, lock: &Lock) -> result::Result<DirectoryLock, LockError> {
self.directory.acquire_lock(lock)
}
// Watching is entirely delegated to the wrapped directory.
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
self.directory.watch(watch_callback)
}
} }
impl Clone for ManagedDirectory { impl Clone for ManagedDirectory {
@@ -241,95 +260,98 @@ impl Clone for ManagedDirectory {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*;
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
use directory::MmapDirectory; mod mmap_specific {
use std::io::Write;
use std::path::Path;
use tempdir::TempDir;
lazy_static! { use super::super::*;
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test"); use std::path::Path;
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2"); use tempdir::TempDir;
}
#[test] lazy_static! {
#[cfg(feature = "mmap")] static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
fn test_managed_directory() { static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
let tempdir = TempDir::new("index").unwrap(); }
let tempdir_path = PathBuf::from(tempdir.path());
{ use directory::MmapDirectory;
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); use std::io::Write;
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
#[test]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
{ {
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap(); let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
write_file.flush().unwrap(); let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
}
{
managed_directory
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
.unwrap();
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
} }
{ {
managed_directory let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8]) let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
.unwrap(); {
} assert!(managed_directory.exists(*TEST_PATH1));
{ assert!(!managed_directory.exists(*TEST_PATH2));
assert!(managed_directory.exists(*TEST_PATH1)); }
assert!(managed_directory.exists(*TEST_PATH2)); {
} let living_files: HashSet<PathBuf> = HashSet::new();
{ managed_directory.garbage_collect(|| living_files);
let living_files: HashSet<PathBuf> = }
[TEST_PATH1.to_owned()].into_iter().cloned().collect(); {
managed_directory.garbage_collect(|| living_files); assert!(!managed_directory.exists(*TEST_PATH1));
} assert!(!managed_directory.exists(*TEST_PATH2));
{ }
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
} }
} }
{
#[test]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap(); let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{ managed_directory
assert!(managed_directory.exists(*TEST_PATH1)); .atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
assert!(!managed_directory.exists(*TEST_PATH2)); .unwrap();
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
}
#[test]
#[cfg(feature = "mmap ")]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail the file as it is mmapped.
assert!(managed_directory.exists(*TEST_PATH1)); assert!(managed_directory.exists(*TEST_PATH1));
// unmap should happen here.
drop(_mmap_read); let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
// The file should still be in the list of managed file and managed_directory.garbage_collect(|| living_files.clone());
// eventually be deleted once mmap is released. if cfg!(target_os = "windows") {
managed_directory.garbage_collect(|| living_files); // On Windows, gc should try and fail the file as it is mmapped.
assert!(!managed_directory.exists(*TEST_PATH1)); assert!(managed_directory.exists(*TEST_PATH1));
} else { // unmap should happen here.
assert!(!managed_directory.exists(*TEST_PATH1)); drop(_mmap_read);
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
}
} }
} }
} }

View File

@@ -1,12 +1,24 @@
extern crate fs2;
extern crate notify;
use self::fs2::FileExt;
use self::notify::RawEvent;
use self::notify::RecursiveMode;
use self::notify::Watcher;
use atomicwrites; use atomicwrites;
use common::make_io_err; use core::META_FILEPATH;
use directory::error::LockError;
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError}; use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use directory::shared_vec_slice::SharedVecSlice; use directory::read_only_source::BoxedData;
use directory::Directory; use directory::Directory;
use directory::DirectoryLock;
use directory::Lock;
use directory::ReadOnlySource; use directory::ReadOnlySource;
use directory::WatchCallback;
use directory::WatchCallbackList;
use directory::WatchHandle;
use directory::WritePtr; use directory::WritePtr;
use fst::raw::MmapReadOnly; use memmap::Mmap;
use std::collections::hash_map::Entry as HashMapEntry;
use std::collections::HashMap; use std::collections::HashMap;
use std::convert::From; use std::convert::From;
use std::fmt; use std::fmt;
@@ -16,14 +28,22 @@ use std::io::{self, Seek, SeekFrom};
use std::io::{BufWriter, Read, Write}; use std::io::{BufWriter, Read, Write};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::result; use std::result;
use std::sync::mpsc::{channel, Receiver, Sender};
use std::sync::Arc; use std::sync::Arc;
use std::sync::Mutex;
use std::sync::RwLock; use std::sync::RwLock;
use std::sync::Weak;
use std::thread;
use tempdir::TempDir; use tempdir::TempDir;
/// Builds an `io::Error` of kind `Other` carrying `msg` as its payload.
pub(crate) fn make_io_err(msg: String) -> io::Error {
    let kind = io::ErrorKind::Other;
    io::Error::new(kind, msg)
}
/// Returns None iff the file exists, can be read, but is empty (and hence /// Returns None iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped). /// cannot be mmapped)
/// fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
let file = File::open(full_path).map_err(|e| { let file = File::open(full_path).map_err(|e| {
if e.kind() == io::ErrorKind::NotFound { if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_owned()) OpenReadError::FileDoesNotExist(full_path.to_owned())
@@ -42,7 +62,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
return Ok(None); return Ok(None);
} }
unsafe { unsafe {
MmapReadOnly::open(&file) memmap::Mmap::map(&file)
.map(Some) .map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e))) .map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
} }
@@ -65,7 +85,7 @@ pub struct CacheInfo {
struct MmapCache { struct MmapCache {
counters: CacheCounters, counters: CacheCounters,
cache: HashMap<PathBuf, MmapReadOnly>, cache: HashMap<PathBuf, Weak<BoxedData>>,
} }
impl Default for MmapCache { impl Default for MmapCache {
@@ -78,12 +98,7 @@ impl Default for MmapCache {
} }
impl MmapCache { impl MmapCache {
/// Removes a `MmapReadOnly` entry from the mmap cache. fn get_info(&self) -> CacheInfo {
fn discard_from_cache(&mut self, full_path: &Path) -> bool {
self.cache.remove(full_path).is_some()
}
fn get_info(&mut self) -> CacheInfo {
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect(); let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
CacheInfo { CacheInfo {
counters: self.counters.clone(), counters: self.counters.clone(),
@@ -91,57 +106,178 @@ impl MmapCache {
} }
} }
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<MmapReadOnly>, OpenReadError> { fn remove_weak_ref(&mut self) {
Ok(match self.cache.entry(full_path.to_owned()) { let keys_to_remove: Vec<PathBuf> = self
HashMapEntry::Occupied(occupied_entry) => { .cache
let mmap = occupied_entry.get(); .iter()
.filter(|(_, mmap_weakref)| mmap_weakref.upgrade().is_none())
.map(|(key, _)| key.clone())
.collect();
for key in keys_to_remove {
self.cache.remove(&key);
}
}
// Returns None if the file exists but as a len of 0 (and hence is not mmappable).
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<Arc<BoxedData>>, OpenReadError> {
if let Some(mmap_weak) = self.cache.get(full_path) {
if let Some(mmap_arc) = mmap_weak.upgrade() {
self.counters.hit += 1; self.counters.hit += 1;
Some(mmap.clone()) return Ok(Some(mmap_arc));
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss += 1;
if let Some(mmap) = open_mmap(full_path)? {
vacant_entry.insert(mmap.clone());
Some(mmap)
} else {
None
}
} }
}
self.cache.remove(full_path);
self.counters.miss += 1;
Ok(if let Some(mmap) = open_mmap(full_path)? {
let mmap_arc: Arc<BoxedData> = Arc::new(Box::new(mmap));
let mmap_weak = Arc::downgrade(&mmap_arc);
self.cache.insert(full_path.to_owned(), mmap_weak);
Some(mmap_arc)
} else {
None
}) })
} }
} }
/// Shared state behind a `WatcherWrapper`.
struct InnerWatcherWrapper {
    /// The `notify` watcher. It is never accessed after construction and is
    /// only stored here, behind a `Mutex`.
    _watcher: Mutex<notify::RecommendedWatcher>,
    /// List of the callbacks registered through `WatcherWrapper::watch`.
    watcher_router: WatchCallbackList,
}

impl InnerWatcherWrapper {
    /// Starts watching `path` recursively.
    ///
    /// Returns the wrapper together with the receiving end of the channel
    /// on which the raw file system events are delivered.
    pub fn new(path: &Path) -> Result<(Self, Receiver<notify::RawEvent>), notify::Error> {
        let (event_sender, event_receiver): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
        // The raw watcher pushes its events into `event_sender`.
        let mut raw_watcher = notify::raw_watcher(event_sender)?;
        raw_watcher.watch(path, RecursiveMode::Recursive)?;
        let wrapper = InnerWatcherWrapper {
            _watcher: Mutex::new(raw_watcher),
            watcher_router: Default::default(),
        };
        Ok((wrapper, event_receiver))
    }
}
#[derive(Clone)]
pub(crate) struct WatcherWrapper {
inner: Arc<InnerWatcherWrapper>,
}
impl WatcherWrapper {
pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
let (inner, watcher_recv) = InnerWatcherWrapper::new(path).map_err(|err| match err {
notify::Error::PathNotFound => OpenDirectoryError::DoesNotExist(path.to_owned()),
_ => {
panic!("Unknown error while starting watching directory {:?}", path);
}
})?;
let watcher_wrapper = WatcherWrapper {
inner: Arc::new(inner),
};
let watcher_wrapper_clone = watcher_wrapper.clone();
thread::Builder::new()
.name("meta-file-watch-thread".to_string())
.spawn(move || {
loop {
match watcher_recv.recv().map(|evt| evt.path) {
Ok(Some(changed_path)) => {
// ... Actually subject to false positive.
// We might want to be more accurate than this at one point.
if let Some(filename) = changed_path.file_name() {
if filename == *META_FILEPATH {
watcher_wrapper_clone.inner.watcher_router.broadcast();
}
}
}
Ok(None) => {
// not an event we are interested in.
}
Err(_e) => {
// the watch send channel was dropped
break;
}
}
}
})
.expect("Failed to spawn thread to watch meta.json");
Ok(watcher_wrapper)
}
pub fn watch(&mut self, watch_callback: WatchCallback) -> WatchHandle {
self.inner.watcher_router.subscribe(watch_callback)
}
}
/// Directory storing data in files, read via mmap. /// Directory storing data in files, read via mmap.
/// ///
/// The Mmap object are cached to limit the /// The Mmap object are cached to limit the
/// system calls. /// system calls.
///
/// In the `MmapDirectory`, locks are implemented using the `fs2` crate definition of locks.
///
/// On MacOS & linux, it relies on `flock` (aka `BSD Lock`). These locks solve most of the
/// problems related to POSIX Locks, but their contract may not be respected on `NFS`
/// depending on the implementation.
///
/// On Windows the semantics are again different.
#[derive(Clone)] #[derive(Clone)]
pub struct MmapDirectory { pub struct MmapDirectory {
inner: Arc<MmapDirectoryInner>,
}
struct MmapDirectoryInner {
root_path: PathBuf, root_path: PathBuf,
mmap_cache: Arc<RwLock<MmapCache>>, mmap_cache: RwLock<MmapCache>,
_temp_directory: Arc<Option<TempDir>>, _temp_directory: Option<TempDir>,
watcher: RwLock<WatcherWrapper>,
}
impl MmapDirectoryInner {
/// Builds the shared state of a directory rooted at `root_path`.
///
/// A `WatcherWrapper` is created on `root_path` first; if that fails the
/// `OpenDirectoryError` is returned as is. `temp_directory` is stored in
/// the returned value, and the mmap cache starts with its `Default` value.
fn new(
    root_path: PathBuf,
    temp_directory: Option<TempDir>,
) -> Result<MmapDirectoryInner, OpenDirectoryError> {
    let watcher = RwLock::new(WatcherWrapper::new(&root_path)?);
    Ok(MmapDirectoryInner {
        root_path,
        mmap_cache: Default::default(),
        _temp_directory: temp_directory,
        watcher,
    })
}
/// Registers `watch_callback` on the wrapped watcher.
///
/// Takes the write lock on `watcher` for the duration of the call and
/// panics if that lock is poisoned.
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
    self.watcher.write().unwrap().watch(watch_callback)
}
} }
impl fmt::Debug for MmapDirectory { impl fmt::Debug for MmapDirectory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "MmapDirectory({:?})", self.root_path) write!(f, "MmapDirectory({:?})", self.inner.root_path)
} }
} }
impl MmapDirectory { impl MmapDirectory {
/// Creates a `MmapDirectory` rooted at `root_path`.
///
/// The inner state is built by `MmapDirectoryInner::new` and wrapped in an
/// `Arc`; any error from that constructor is returned unchanged.
fn new(
    root_path: PathBuf,
    temp_directory: Option<TempDir>,
) -> Result<MmapDirectory, OpenDirectoryError> {
    MmapDirectoryInner::new(root_path, temp_directory).map(|inner| MmapDirectory {
        inner: Arc::new(inner),
    })
}
/// Creates a new MmapDirectory in a temporary directory. /// Creates a new MmapDirectory in a temporary directory.
/// ///
/// This is mostly useful to test the MmapDirectory itself. /// This is mostly useful to test the MmapDirectory itself.
/// For your unit tests, prefer the RAMDirectory. /// For your unit tests, prefer the RAMDirectory.
pub fn create_from_tempdir() -> io::Result<MmapDirectory> { pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
let tempdir = TempDir::new("index")?; let tempdir = TempDir::new("index").map_err(OpenDirectoryError::IoError)?;
let tempdir_path = PathBuf::from(tempdir.path()); let tempdir_path = PathBuf::from(tempdir.path());
let directory = MmapDirectory { MmapDirectory::new(tempdir_path, Some(tempdir))
root_path: tempdir_path,
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(Some(tempdir)),
};
Ok(directory)
} }
/// Opens a MmapDirectory in a directory. /// Opens a MmapDirectory in a directory.
@@ -159,18 +295,14 @@ impl MmapDirectory {
directory_path, directory_path,
))) )))
} else { } else {
Ok(MmapDirectory { Ok(MmapDirectory::new(PathBuf::from(directory_path), None)?)
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None),
})
} }
} }
/// Joins a relative_path to the directory `root_path` /// Joins a relative_path to the directory `root_path`
/// to create a proper complete `filepath`. /// to create a proper complete `filepath`.
fn resolve_path(&self, relative_path: &Path) -> PathBuf { fn resolve_path(&self, relative_path: &Path) -> PathBuf {
self.root_path.join(relative_path) self.inner.root_path.join(relative_path)
} }
/// Sync the root directory. /// Sync the root directory.
@@ -195,7 +327,7 @@ impl MmapDirectory {
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS); .custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
} }
let fd = open_opts.open(&self.root_path)?; let fd = open_opts.open(&self.inner.root_path)?;
fd.sync_all()?; fd.sync_all()?;
Ok(()) Ok(())
} }
@@ -205,14 +337,35 @@ impl MmapDirectory {
/// ///
/// The `MmapDirectory` embeds a `MmapDirectory` /// The `MmapDirectory` embeds a `MmapDirectory`
/// to avoid multiplying the `mmap` system calls. /// to avoid multiplying the `mmap` system calls.
pub fn get_cache_info(&mut self) -> CacheInfo { pub fn get_cache_info(&self) -> CacheInfo {
self.mmap_cache self.inner
.mmap_cache
.write() .write()
.expect("mmap cache lock is poisoned")
.remove_weak_ref();
self.inner
.mmap_cache
.read()
.expect("Mmap cache lock is poisoned.") .expect("Mmap cache lock is poisoned.")
.get_info() .get_info()
} }
} }
/// We rely on fs2 for file locking. On Linux & MacOS this
/// uses BSD locks (`flock`). The lock is actually released when
/// the `File` object is dropped and its associated file descriptor
/// is closed.
struct ReleaseLockFile {
// Handle on the lock file. It is never read: it is only kept so that
// the file stays open for as long as this guard exists.
_file: File,
// Path of the lock, reported in the debug log emitted on drop.
path: PathBuf,
}
impl Drop for ReleaseLockFile {
// Only logs the release; nothing is unlocked explicitly here.
fn drop(&mut self) {
debug!("Releasing lock {:?}", self.path);
}
}
/// This Write wraps a File, but has the specificity of /// This Write wraps a File, but has the specificity of
/// call `sync_all` on flush. /// call `sync_all` on flush.
struct SafeFileWriter(File); struct SafeFileWriter(File);
@@ -245,7 +398,7 @@ impl Directory for MmapDirectory {
debug!("Open Read {:?}", path); debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache.write().map_err(|_| { let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
let msg = format!( let msg = format!(
"Failed to acquired write lock \ "Failed to acquired write lock \
on mmap cache while reading {:?}", on mmap cache while reading {:?}",
@@ -253,11 +406,34 @@ impl Directory for MmapDirectory {
); );
IOError::with_path(path.to_owned(), make_io_err(msg)) IOError::with_path(path.to_owned(), make_io_err(msg))
})?; })?;
Ok(mmap_cache Ok(mmap_cache
.get_mmap(&full_path)? .get_mmap(&full_path)?
.map(ReadOnlySource::Mmap) .map(ReadOnlySource::from)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty()))) .unwrap_or_else(ReadOnlySource::empty))
}
/// Deletes the file at `path` (resolved against the directory root),
/// then syncs the directory.
///
/// Returns `DeleteError::FileDoesNotExist` when the removal fails with
/// `NotFound`. Any other I/O error, from the removal or from the
/// directory sync, is wrapped together with `path`.
/// The mmap cache is not touched by this method.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
    debug!("Deleting file {:?}", path);
    let full_path = self.resolve_path(path);
    if let Err(remove_err) = fs::remove_file(&full_path) {
        return if remove_err.kind() == io::ErrorKind::NotFound {
            Err(DeleteError::FileDoesNotExist(path.to_owned()))
        } else {
            Err(IOError::with_path(path.to_owned(), remove_err).into())
        };
    }
    self.sync_directory()
        .map_err(|sync_err| IOError::with_path(path.to_owned(), sync_err).into())
}
fn exists(&self, path: &Path) -> bool {
let full_path = self.resolve_path(path);
full_path.exists()
} }
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> { fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
@@ -290,44 +466,6 @@ impl Directory for MmapDirectory {
Ok(BufWriter::new(Box::new(writer))) Ok(BufWriter::new(Box::new(writer)))
} }
/// Any entry associated to the path in the mmap will be
/// removed before the file is deleted.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while deleting {:?}",
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
mmap_cache.discard_from_cache(path);
// Removing the entry in the MMap cache.
// The munmap will appear on Drop,
// when the last reference is gone.
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => self
.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
} else {
Err(IOError::with_path(path.to_owned(), e).into())
}
}
}
}
fn exists(&self, path: &Path) -> bool {
let full_path = self.resolve_path(path);
full_path.exists()
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> { fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
let full_path = self.resolve_path(path); let full_path = self.resolve_path(path);
let mut buffer = Vec::new(); let mut buffer = Vec::new();
@@ -354,6 +492,30 @@ impl Directory for MmapDirectory {
meta_file.write(|f| f.write_all(data))?; meta_file.write(|f| f.write_all(data))?;
Ok(()) Ok(())
} }
/// Acquires an exclusive file lock on `lock.filepath` (resolved against
/// the directory root), creating the lock file if needed.
///
/// When `lock.is_blocking` is set, the call waits for the lock and
/// reports a failure as `LockError::IOError`. Otherwise a single attempt
/// is made and any failure is reported as `LockError::LockBusy`.
/// Failing to open the lock file is reported as `LockError::IOError`.
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
    let lock_path = self.resolve_path(&lock.filepath);
    // We make sure that the file exists.
    let mut open_options = OpenOptions::new();
    open_options
        .write(true)
        .create(true); //< if the file does not exist yet, create it.
    let lock_file: File = open_options.open(&lock_path).map_err(LockError::IOError)?;
    let lock_outcome = if lock.is_blocking {
        lock_file.lock_exclusive().map_err(LockError::IOError)
    } else {
        lock_file
            .try_lock_exclusive()
            .map_err(|_| LockError::LockBusy)
    };
    lock_outcome?;
    // dropping the file handle will release the lock.
    Ok(DirectoryLock::from(Box::new(ReleaseLockFile {
        path: lock.filepath.clone(),
        _file: lock_file,
    })))
}
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
self.inner.watch(watch_callback)
}
} }
#[cfg(test)] #[cfg(test)]
@@ -363,6 +525,18 @@ mod tests {
// The following tests are specific to the MmapDirectory // The following tests are specific to the MmapDirectory
use super::*; use super::*;
use schema::{Schema, SchemaBuilder, TEXT};
use std::fs;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::thread;
use std::time::Duration;
use Index;
use ReloadPolicy;
/// Opening a path that does not exist must be reported as an error.
#[test]
fn test_open_non_existant_path() {
    let open_result = MmapDirectory::open(PathBuf::from("./nowhere"));
    assert!(open_result.is_err());
}
#[test] #[test]
fn test_open_empty() { fn test_open_empty() {
@@ -382,7 +556,7 @@ mod tests {
#[test] #[test]
fn test_cache() { fn test_cache() {
let content = "abc".as_bytes(); let content = b"abc";
// here we test if the cache releases // here we test if the cache releases
// mmaps correctly. // mmaps correctly.
@@ -398,26 +572,104 @@ mod tests {
w.flush().unwrap(); w.flush().unwrap();
} }
} }
{
for (i, path) in paths.iter().enumerate() { let mut keep = vec![];
let _r = mmap_directory.open_read(path).unwrap(); for (i, path) in paths.iter().enumerate() {
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1); keep.push(mmap_directory.open_read(path).unwrap());
} assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
for path in paths.iter() { }
let _r = mmap_directory.open_read(path).unwrap(); assert_eq!(mmap_directory.get_cache_info().counters.hit, 0);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths); assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
} assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
for (i, path) in paths.iter().enumerate() { for path in paths.iter() {
mmap_directory.delete(path).unwrap(); let _r = mmap_directory.open_read(path).unwrap();
assert_eq!( assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
mmap_directory.get_cache_info().mmapped.len(),
num_paths - i - 1
);
}
} }
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10); assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10); assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
drop(keep);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 1);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
for path in &paths {
mmap_directory.delete(path).unwrap();
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
for path in paths.iter() {
assert!(mmap_directory.open_read(path).is_err());
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 30);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0); assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
} }
/// Subscribes a counting callback on a `WatcherWrapper` watching a fresh
/// temporary directory, then writes a file into that directory.
///
/// NOTE(review): the counter is only checked before the write; nothing is
/// asserted afterwards, so this test only verifies that subscribing and
/// writing run without panicking.
#[test]
fn test_watch_wrapper() {
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let tmp_dir: TempDir = tempdir::TempDir::new("test_watch_wrapper").unwrap();
let tmp_dirpath = tmp_dir.path().to_owned();
let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath).unwrap();
let tmp_file = tmp_dirpath.join("coucou");
// The handle is bound to a named variable so that it lives until the end
// of the test.
let _handle = watch_wrapper.watch(Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
}));
// No file has been written yet, so the callback must not have run.
assert_eq!(counter.load(Ordering::SeqCst), 0);
// NOTE(review): the file is named "coucou"; the watcher thread shown
// earlier in this file only broadcasts when the changed file name equals
// META_FILEPATH, so presumably the callback is never expected to fire
// here -- confirm.
fs::write(&tmp_file, b"whateverwilldo").unwrap();
// `Duration::new(0, 1_000)` is 0 seconds plus 1_000 nanoseconds.
thread::sleep(Duration::new(0, 1_000u32));
}
/// Checks that the mmap cache reports no mmapped file once the index, its
/// writer and its reader have been dropped.
#[test]
fn test_mmap_released() {
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let mut schema_builder: SchemaBuilder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
// The index, writer and reader live in this scope only, so they are all
// dropped before the final assertion.
{
// The directory is cloned so that `mmap_directory` stays usable for the
// cache inspections below.
let index = Index::create(mmap_directory.clone(), schema).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// 16 commits of 10 documents each.
for _num_commits in 0..16 {
for _ in 0..10 {
index_writer.add_document(doc!(text_field=>"abc"));
}
index_writer.commit().unwrap();
}
// Manual reload policy: the reader only changes on the explicit
// `reload()` calls below.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
// 30 more single-document commits, reloading the reader after each one.
for _ in 0..30 {
index_writer.add_document(doc!(text_field=>"abc"));
index_writer.commit().unwrap();
reader.reload().unwrap();
}
index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
let num_segments = reader.searcher().segment_readers().len();
assert_eq!(num_segments, 4);
// NOTE(review): the factor 7 is presumably the number of files mmapped
// per segment -- confirm.
assert_eq!(
num_segments * 7,
mmap_directory.get_cache_info().mmapped.len()
);
}
// Everything that could hold a mmap has been dropped at this point.
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
}
} }

View File

@@ -8,19 +8,23 @@ WORM directory abstraction.
mod mmap_directory; mod mmap_directory;
mod directory; mod directory;
mod directory_lock;
mod managed_directory; mod managed_directory;
mod ram_directory; mod ram_directory;
mod read_only_source; mod read_only_source;
mod shared_vec_slice; mod watch_event_router;
/// Errors specific to the directory module. /// Errors specific to the directory module.
pub mod error; pub mod error;
use std::io::{BufWriter, Seek, Write}; pub use self::directory::DirectoryLock;
pub use self::directory::{Directory, DirectoryClone}; pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
pub use self::ram_directory::RAMDirectory; pub use self::ram_directory::RAMDirectory;
pub use self::read_only_source::ReadOnlySource; pub use self::read_only_source::ReadOnlySource;
pub(crate) use self::watch_event_router::WatchCallbackList;
pub use self::watch_event_router::{WatchCallback, WatchHandle};
use std::io::{BufWriter, Seek, Write};
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
pub use self::mmap_directory::MmapDirectory; pub use self::mmap_directory::MmapDirectory;
@@ -38,128 +42,4 @@ impl<T: Seek + Write> SeekableWrite for T {}
pub type WritePtr = BufWriter<Box<SeekableWrite>>; pub type WritePtr = BufWriter<Box<SeekableWrite>>;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests;
use super::*;
use std::io::{Seek, SeekFrom, Write};
use std::path::Path;
lazy_static! {
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
}
#[test]
fn test_ram_directory() {
let mut ram_directory = RAMDirectory::create();
test_directory(&mut ram_directory);
}
#[test]
#[cfg(feature = "mmap")]
fn test_mmap_directory() {
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
test_directory(&mut mmap_directory);
}
#[test]
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
let mut ram_directory = RAMDirectory::create();
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
assert!(write_file.write_all(&[4]).is_ok());
}
fn test_simple(directory: &mut Directory) {
{
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
write_file.write_all(&[4]).unwrap();
write_file.write_all(&[3]).unwrap();
write_file.write_all(&[7, 3, 5]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
}
assert!(directory.delete(*TEST_PATH).is_ok());
assert!(!directory.exists(*TEST_PATH));
}
fn test_seek(directory: &mut Directory) {
{
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
write_file.seek(SeekFrom::Start(0)).unwrap();
write_file.write_all(&[3, 1]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
fn test_rewrite_forbidden(directory: &mut Directory) {
{
directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
}
{
assert!(directory.open_write(*TEST_PATH).is_err());
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
fn test_write_create_the_file(directory: &mut Directory) {
{
assert!(directory.open_read(*TEST_PATH).is_err());
let _w = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
assert!(directory.open_read(*TEST_PATH).is_ok());
assert!(directory.delete(*TEST_PATH).is_ok());
}
}
fn test_directory_delete(directory: &mut Directory) {
assert!(directory.open_read(*TEST_PATH).is_err());
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[1, 2, 3, 4]).unwrap();
write_file.flush().unwrap();
{
let read_handle = directory.open_read(*TEST_PATH).unwrap();
{
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
// Mapped files can't be deleted on Windows
if !cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
}
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
}
}
if cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
}
assert!(directory.open_read(*TEST_PATH).is_err());
assert!(directory.delete(*TEST_PATH).is_err());
}
fn test_directory(directory: &mut Directory) {
test_simple(directory);
test_seek(directory);
test_rewrite_forbidden(directory);
test_write_create_the_file(directory);
test_directory_delete(directory);
}
}

View File

@@ -1,8 +1,8 @@
use super::shared_vec_slice::SharedVecSlice; use core::META_FILEPATH;
use common::make_io_err; use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError}; use directory::WatchCallbackList;
use directory::WritePtr; use directory::WritePtr;
use directory::{Directory, ReadOnlySource}; use directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
use std::collections::HashMap; use std::collections::HashMap;
use std::fmt; use std::fmt;
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write}; use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
@@ -22,13 +22,13 @@ use std::sync::{Arc, RwLock};
/// ///
struct VecWriter { struct VecWriter {
path: PathBuf, path: PathBuf,
shared_directory: InnerDirectory, shared_directory: RAMDirectory,
data: Cursor<Vec<u8>>, data: Cursor<Vec<u8>>,
is_flushed: bool, is_flushed: bool,
} }
impl VecWriter { impl VecWriter {
fn new(path_buf: PathBuf, shared_directory: InnerDirectory) -> VecWriter { fn new(path_buf: PathBuf, shared_directory: RAMDirectory) -> VecWriter {
VecWriter { VecWriter {
path: path_buf, path: path_buf,
data: Cursor::new(Vec::new()), data: Cursor::new(Vec::new()),
@@ -64,73 +64,44 @@ impl Write for VecWriter {
fn flush(&mut self) -> io::Result<()> { fn flush(&mut self) -> io::Result<()> {
self.is_flushed = true; self.is_flushed = true;
self.shared_directory let mut fs = self.shared_directory.fs.write().unwrap();
.write(self.path.clone(), self.data.get_ref())?; fs.write(self.path.clone(), self.data.get_ref());
Ok(()) Ok(())
} }
} }
#[derive(Clone)] #[derive(Default)]
struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, Arc<Vec<u8>>>>>); struct InnerDirectory {
fs: HashMap<PathBuf, ReadOnlySource>,
watch_router: WatchCallbackList,
}
impl InnerDirectory { impl InnerDirectory {
fn new() -> InnerDirectory { fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
InnerDirectory(Arc::new(RwLock::new(HashMap::new()))) let data = ReadOnlySource::new(Vec::from(data));
} self.fs.insert(path, data).is_some()
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
let mut map = self.0.write().map_err(|_| {
make_io_err(format!(
"Failed to lock the directory, when trying to write {:?}",
path
))
})?;
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
Ok(prev_value.is_some())
} }
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> { fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
self.0 self.fs
.read() .get(path)
.map_err(|_| { .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
let msg = format!( .map(Clone::clone)
"Failed to acquire read lock for the \
directory when trying to read {:?}",
path
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
}).and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(Arc::clone)
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data)))
})
} }
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
self.0 match self.fs.remove(path) {
.write() Some(_) => Ok(()),
.map_err(|_| { None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
let msg = format!( }
"Failed to acquire write lock for the \
directory when trying to delete {:?}",
path
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
}).and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
} }
fn exists(&self, path: &Path) -> bool { fn exists(&self, path: &Path) -> bool {
self.0 self.fs.contains_key(path)
.read() }
.expect("Failed to get read lock directory.")
.contains_key(path) fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
self.watch_router.subscribe(watch_handle)
} }
} }
@@ -145,33 +116,36 @@ impl fmt::Debug for RAMDirectory {
/// It is mainly meant for unit testing. /// It is mainly meant for unit testing.
/// Writes are only made visible upon flushing. /// Writes are only made visible upon flushing.
/// ///
#[derive(Clone)] #[derive(Clone, Default)]
pub struct RAMDirectory { pub struct RAMDirectory {
fs: InnerDirectory, fs: Arc<RwLock<InnerDirectory>>,
} }
impl RAMDirectory { impl RAMDirectory {
/// Constructor /// Constructor
pub fn create() -> RAMDirectory { pub fn create() -> RAMDirectory {
RAMDirectory { Self::default()
fs: InnerDirectory::new(),
}
} }
} }
impl Directory for RAMDirectory { impl Directory for RAMDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> { fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
self.fs.open_read(path) self.fs.read().unwrap().open_read(path)
}
/// Removes `path` from the in-memory file map.
///
/// Takes the write lock on the inner directory and panics if that lock
/// is poisoned. The result of the inner `delete` is returned unchanged.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
    let mut inner_directory = self.fs.write().unwrap();
    inner_directory.delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.fs.read().unwrap().exists(path)
} }
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> { fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
let mut fs = self.fs.write().unwrap();
let path_buf = PathBuf::from(path); let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); let vec_writer = VecWriter::new(path_buf.clone(), self.clone());
let exists = fs.write(path_buf.clone(), &[]);
let exists = self
.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
// force the creation of the file to mimic the MMap directory. // force the creation of the file to mimic the MMap directory.
if exists { if exists {
Err(OpenWriteError::FileAlreadyExists(path_buf)) Err(OpenWriteError::FileAlreadyExists(path_buf))
@@ -180,17 +154,8 @@ impl Directory for RAMDirectory {
} }
} }
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.fs.delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.fs.exists(path)
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> { fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
let read = self.open_read(path)?; Ok(self.open_read(path)?.as_slice().to_owned())
Ok(read.as_slice().to_owned())
} }
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
@@ -199,10 +164,20 @@ impl Directory for RAMDirectory {
msg.unwrap_or("Undefined".to_string()) msg.unwrap_or("Undefined".to_string())
))); )));
let path_buf = PathBuf::from(path); let path_buf = PathBuf::from(path);
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
self.fs.write(path_buf, &Vec::new())?; // Reserve the path to prevent calls to .write() to succeed.
self.fs.write().unwrap().write(path_buf.clone(), &[]);
let mut vec_writer = VecWriter::new(path_buf.clone(), self.clone());
vec_writer.write_all(data)?; vec_writer.write_all(data)?;
vec_writer.flush()?; vec_writer.flush()?;
if path == Path::new(&*META_FILEPATH) {
self.fs.write().unwrap().watch_router.broadcast();
}
Ok(()) Ok(())
} }
/// Registers `watch_callback` on the inner directory.
///
/// Takes the write lock on the inner directory and panics if that lock
/// is poisoned.
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
    let mut inner_directory = self.fs.write().unwrap();
    inner_directory.watch(watch_callback)
}
} }

View File

@@ -1,9 +1,9 @@
use super::shared_vec_slice::SharedVecSlice;
use common::HasLen; use common::HasLen;
#[cfg(feature = "mmap")]
use fst::raw::MmapReadOnly;
use stable_deref_trait::{CloneStableDeref, StableDeref}; use stable_deref_trait::{CloneStableDeref, StableDeref};
use std::ops::Deref; use std::ops::Deref;
use std::sync::Arc;
pub type BoxedData = Box<Deref<Target = [u8]> + Send + Sync + 'static>;
/// Read object that represents files in tantivy. /// Read object that represents files in tantivy.
/// ///
@@ -11,12 +11,10 @@ use std::ops::Deref;
/// the data in the form of a constant read-only `&[u8]`. /// the data in the form of a constant read-only `&[u8]`.
/// Whatever happens to the directory file, the data /// Whatever happens to the directory file, the data
/// hold by this object should never be altered or destroyed. /// hold by this object should never be altered or destroyed.
pub enum ReadOnlySource { pub struct ReadOnlySource {
/// Mmap source of data data: Arc<BoxedData>,
#[cfg(feature = "mmap")] start: usize,
Mmap(MmapReadOnly), stop: usize,
/// Wrapping a `Vec<u8>`
Anonymous(SharedVecSlice),
} }
unsafe impl StableDeref for ReadOnlySource {} unsafe impl StableDeref for ReadOnlySource {}
@@ -30,19 +28,38 @@ impl Deref for ReadOnlySource {
} }
} }
impl From<Arc<BoxedData>> for ReadOnlySource {
    /// Wraps already shared data into a `ReadOnlySource` spanning all of it
    /// (`start` is 0, `stop` is the data length).
    fn from(data: Arc<BoxedData>) -> Self {
        let stop = data.len();
        ReadOnlySource {
            data,
            start: 0,
            stop,
        }
    }
}
impl ReadOnlySource { impl ReadOnlySource {
pub(crate) fn new<D>(data: D) -> ReadOnlySource
where
D: Deref<Target = [u8]> + Send + Sync + 'static,
{
let len = data.len();
ReadOnlySource {
data: Arc::new(Box::new(data)),
start: 0,
stop: len,
}
}
/// Creates an empty ReadOnlySource /// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource { pub fn empty() -> ReadOnlySource {
ReadOnlySource::Anonymous(SharedVecSlice::empty()) ReadOnlySource::new(&[][..])
} }
/// Returns the data underlying the ReadOnlySource object. /// Returns the data underlying the ReadOnlySource object.
pub fn as_slice(&self) -> &[u8] { pub fn as_slice(&self) -> &[u8] {
match *self { &self.data[self.start..self.stop]
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
}
} }
/// Splits into 2 `ReadOnlySource`, at the offset given /// Splits into 2 `ReadOnlySource`, at the offset given
@@ -63,22 +80,18 @@ impl ReadOnlySource {
/// worth of data in anonymous memory, and only a /// worth of data in anonymous memory, and only a
/// 1KB slice is remaining, the whole `500MBs` /// 1KB slice is remaining, the whole `500MBs`
/// are retained in memory. /// are retained in memory.
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource { pub fn slice(&self, start: usize, stop: usize) -> ReadOnlySource {
assert!( assert!(
from_offset <= to_offset, start <= stop,
"Requested negative slice [{}..{}]", "Requested negative slice [{}..{}]",
from_offset, start,
to_offset stop
); );
match *self { assert!(stop <= self.len());
#[cfg(feature = "mmap")] ReadOnlySource {
ReadOnlySource::Mmap(ref mmap_read_only) => { data: self.data.clone(),
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset); start: self.start + start,
ReadOnlySource::Mmap(sliced_mmap) stop: self.start + stop,
}
ReadOnlySource::Anonymous(ref shared_vec) => {
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
}
} }
} }
@@ -87,8 +100,7 @@ impl ReadOnlySource {
/// ///
/// Equivalent to `.slice(from_offset, self.len())` /// Equivalent to `.slice(from_offset, self.len())`
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource { pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
let len = self.len(); self.slice(from_offset, self.len())
self.slice(from_offset, len)
} }
/// Like `.slice(...)` but enforcing only the `to` /// Like `.slice(...)` but enforcing only the `to`
@@ -102,19 +114,18 @@ impl ReadOnlySource {
impl HasLen for ReadOnlySource { impl HasLen for ReadOnlySource {
fn len(&self) -> usize { fn len(&self) -> usize {
self.as_slice().len() self.stop - self.start
} }
} }
impl Clone for ReadOnlySource { impl Clone for ReadOnlySource {
fn clone(&self) -> Self { fn clone(&self) -> Self {
self.slice(0, self.len()) self.slice_from(0)
} }
} }
impl From<Vec<u8>> for ReadOnlySource { impl From<Vec<u8>> for ReadOnlySource {
fn from(data: Vec<u8>) -> ReadOnlySource { fn from(data: Vec<u8>) -> ReadOnlySource {
let shared_data = SharedVecSlice::from(data); ReadOnlySource::new(data)
ReadOnlySource::Anonymous(shared_data)
} }
} }

View File

@@ -1,41 +0,0 @@
use std::sync::Arc;
/// Cheaply clonable window over a shared, immutable byte vector.
///
/// Cloning or slicing only clones the `Arc`; the bytes themselves are
/// never copied.
#[derive(Clone)]
pub struct SharedVecSlice {
    /// Underlying buffer, shared by every slice derived from it.
    pub data: Arc<Vec<u8>>,
    /// Offset in `data` of the first byte of this slice.
    pub start: usize,
    /// Number of bytes covered by this slice.
    pub len: usize,
}

impl SharedVecSlice {
    /// Creates a slice over an empty buffer.
    pub fn empty() -> SharedVecSlice {
        SharedVecSlice::new(Arc::new(Vec::new()))
    }

    /// Creates a slice covering the whole of `data`.
    pub fn new(data: Arc<Vec<u8>>) -> SharedVecSlice {
        let data_len = data.len();
        SharedVecSlice {
            data,
            start: 0,
            len: data_len,
        }
    }

    /// Returns the bytes covered by this slice.
    pub fn as_slice(&self) -> &[u8] {
        &self.data[self.start..self.start + self.len]
    }

    /// Returns the sub-slice `[from_offset..to_offset]`, with both offsets
    /// expressed relative to the start of this slice.
    ///
    /// # Panics
    ///
    /// Panics if `from_offset > to_offset` or if `to_offset` is greater
    /// than the length of this slice.
    pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice {
        assert!(
            from_offset <= to_offset,
            "Requested negative slice [{}..{}]",
            from_offset,
            to_offset
        );
        // Without this check a sub-slice could extend past the end of its
        // parent and expose bytes of the shared buffer outside of it.
        assert!(
            to_offset <= self.len,
            "Requested slice [{}..{}] exceeds the slice length {}",
            from_offset,
            to_offset,
            self.len
        );
        SharedVecSlice {
            data: Arc::clone(&self.data),
            start: self.start + from_offset,
            len: to_offset - from_offset,
        }
    }
}

impl From<Vec<u8>> for SharedVecSlice {
    /// Takes ownership of `data` and creates a slice covering all of it.
    fn from(data: Vec<u8>) -> SharedVecSlice {
        SharedVecSlice::new(Arc::new(data))
    }
}

222
src/directory/tests.rs Normal file
View File

@@ -0,0 +1,222 @@
use super::*;
use std::io::{Seek, SeekFrom, Write};
use std::mem;
use std::path::{Path, PathBuf};
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread;
use std::time;
use std::time::Duration;
// Relative path used as the single test file by the directory tests below.
lazy_static! {
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
}
/// Runs the generic directory test-suite against a fresh `RAMDirectory`.
#[test]
fn test_ram_directory() {
    test_directory(&mut RAMDirectory::create());
}
/// Runs the generic directory test-suite against a `MmapDirectory`
/// created in a temporary directory.
#[test]
#[cfg(feature = "mmap")]
fn test_mmap_directory() {
    test_directory(&mut MmapDirectory::create_from_tempdir().unwrap());
}
/// Writes to a `RAMDirectory` file without ever calling `flush`.
///
/// The test is expected to panic (`#[should_panic]`).
/// NOTE(review): the write itself is asserted to succeed, so the panic
/// presumably comes from dropping the unflushed writer at the end of the
/// function -- confirm against the writer's `Drop` implementation.
#[test]
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
let mut ram_directory = RAMDirectory::create();
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
assert!(write_file.write_all(&[4]).is_ok());
}
/// Writes a file in several chunks, reads it back, then deletes it.
///
/// The file must exist as soon as it is opened for writing, and must no
/// longer exist once it has been deleted.
fn test_simple(directory: &mut Directory) {
    let mut writer = directory.open_write(*TEST_PATH).unwrap();
    assert!(directory.exists(*TEST_PATH));
    let chunks: [&[u8]; 3] = [&[4], &[3], &[7, 3, 5]];
    for chunk in chunks.iter() {
        writer.write_all(chunk).unwrap();
    }
    writer.flush().unwrap();
    // The writer is released before the file is read back.
    drop(writer);

    let reader = directory.open_read(*TEST_PATH).unwrap();
    assert_eq!(&*reader, &[4u8, 3u8, 7u8, 3u8, 5u8]);
    // The read handle is released before the file is deleted.
    drop(reader);

    assert!(directory.delete(*TEST_PATH).is_ok());
    assert!(!directory.exists(*TEST_PATH));
}
// Checks that a write handle supports seeking: five bytes are written, the handle
// seeks back to the start and overwrites the first two bytes. Reading the file
// back must show the overwritten prefix followed by the untouched remaining bytes.
// The file is deleted at the end.
fn test_seek(directory: &mut Directory) {
{
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
write_file.seek(SeekFrom::Start(0)).unwrap();
write_file.write_all(&[3, 1]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
// Checks that `open_write` refuses to open a path that already exists:
// the first call creates the file, the second call on the same path must return
// an error. The file is deleted at the end.
fn test_rewrite_forbidden(directory: &mut Directory) {
{
directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
}
{
assert!(directory.open_write(*TEST_PATH).is_err());
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
// Checks that `open_write` alone creates the file: before the call `open_read`
// fails; right after it (nothing written yet) the file exists and can be opened
// for reading. The file is deleted at the end.
fn test_write_create_the_file(directory: &mut Directory) {
{
assert!(directory.open_read(*TEST_PATH).is_err());
let _w = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
assert!(directory.open_read(*TEST_PATH).is_ok());
assert!(directory.delete(*TEST_PATH).is_ok());
}
}
// Exercises `delete`:
// - on non-Windows platforms, a file can be deleted while a read handle is open,
//   and the handle must still expose the original content afterwards;
// - deleting a path that was never created must fail;
// - once the file is deleted, both `open_read` and a second `delete` must fail.
fn test_directory_delete(directory: &mut Directory) {
assert!(directory.open_read(*TEST_PATH).is_err());
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[1, 2, 3, 4]).unwrap();
write_file.flush().unwrap();
{
let read_handle = directory.open_read(*TEST_PATH).unwrap();
{
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
// Mapped files can't be deleted on Windows
if !cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
}
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
}
}
// On Windows the deletion is performed here instead, after the read handle
// has gone out of scope.
if cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
}
assert!(directory.open_read(*TEST_PATH).is_err());
assert!(directory.delete(*TEST_PATH).is_err());
}
// Shared test-suite run against a `Directory` implementation.
//
// The sub-tests run sequentially on the same directory and reuse the same paths
// (`TEST_PATH`, `a.lock`, `meta.json`), so their order and their clean-up matter.
fn test_directory(directory: &mut Directory) {
test_simple(directory);
test_seek(directory);
test_rewrite_forbidden(directory);
test_write_create_the_file(directory);
test_directory_delete(directory);
test_lock_non_blocking(directory);
test_lock_blocking(directory);
test_watch(directory);
}
// Checks the `watch` mechanism of a directory using a callback that increments
// a shared counter:
// 1. a write to `meta.json` before any watcher is registered must not change the counter;
// 2. once the callback is registered, each of 10 `atomic_write` calls must increase
//    the counter by exactly one;
// 3. after the watch handle is dropped, a further write must leave the counter at 10.
fn test_watch(directory: &mut Directory) {
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let watch_callback = Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
});
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data")
.is_ok());
// NOTE(review): `Duration::new(0, 10_000)` is 10 microseconds (the second
// argument is in nanoseconds) — confirm that 10ms was not intended.
thread::sleep(Duration::new(0, 10_000));
assert_eq!(0, counter.load(Ordering::SeqCst));
let watch_handle = directory.watch(watch_callback);
for i in 0..10 {
assert_eq!(i, counter.load(Ordering::SeqCst));
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data_2")
.is_ok());
// Poll for up to 100 x 10ms until the callback has been observed.
for _ in 0..100 {
if counter.load(Ordering::SeqCst) > i {
break;
}
thread::sleep(Duration::from_millis(10));
}
assert_eq!(i + 1, counter.load(Ordering::SeqCst));
}
mem::drop(watch_handle);
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data")
.is_ok());
// Leave some time for a (wrongly) still-registered callback to fire.
thread::sleep(Duration::from_millis(200));
assert_eq!(10, counter.load(Ordering::SeqCst));
}
// Checks non-blocking lock acquisition:
// - two locks on different paths can be held at the same time;
// - acquiring a lock on a path that is already locked fails immediately;
// - once the lock results of the inner scope are dropped, the same path can be
//   locked again.
fn test_lock_non_blocking(directory: &mut Directory) {
{
let lock_a_res = directory.acquire_lock(&Lock {
filepath: PathBuf::from("a.lock"),
is_blocking: false,
});
assert!(lock_a_res.is_ok());
let lock_b_res = directory.acquire_lock(&Lock {
filepath: PathBuf::from("b.lock"),
is_blocking: false,
});
assert!(lock_b_res.is_ok());
let lock_a_res2 = directory.acquire_lock(&Lock {
filepath: PathBuf::from("a.lock"),
is_blocking: false,
});
assert!(lock_a_res2.is_err());
}
// All lock results of the scope above have been dropped at this point.
let lock_a_res = directory.acquire_lock(&Lock {
filepath: PathBuf::from("a.lock"),
is_blocking: false,
});
assert!(lock_a_res.is_ok());
}
/// Checks that a blocking `acquire_lock` call waits until the current holder
/// releases the lock.
///
/// The lock is first acquired on the main thread and handed over to a helper
/// thread. The helper keeps it until the main thread signals it through a channel,
/// then sleeps 10ms and releases it. This handshake makes both assertions
/// deterministic:
/// - the non-blocking attempt always happens while the helper still holds the lock;
/// - the helper's 10ms sleep always starts after the stopwatch, so the blocking
///   call cannot return in less than 10ms.
///
/// The elapsed time is compared as a whole `Duration`: `subsec_millis()` only
/// returns the fractional part of the elapsed time and would fail spuriously
/// whenever the wait crosses a whole second.
fn test_lock_blocking(directory: &mut Directory) {
    let lock_a_res = directory.acquire_lock(&Lock {
        filepath: PathBuf::from("a.lock"),
        is_blocking: true,
    });
    assert!(lock_a_res.is_ok());
    let (release_sender, release_receiver) = std::sync::mpsc::channel::<()>();
    std::thread::spawn(move || {
        //< lock_a_res is sent to the thread.
        // Hold the lock until the main thread asks for its release. An error here
        // means the sender was dropped (the main thread failed): release anyway.
        let _ = release_receiver.recv();
        std::thread::sleep(time::Duration::from_millis(10));
        // explicitly dropping lock_a_res. It would have been sufficient to just force it
        // to be part of the move, but the intent seems clearer that way.
        drop(lock_a_res);
    });
    {
        // A non-blocking call should fail, as the thread is running and holding the lock.
        let lock_a_res = directory.acquire_lock(&Lock {
            filepath: PathBuf::from("a.lock"),
            is_blocking: false,
        });
        assert!(lock_a_res.is_err());
    }
    {
        // the blocking call should wait for at least 10ms.
        let start = time::Instant::now();
        release_sender
            .send(())
            .expect("the lock holder thread terminated before releasing the lock");
        let lock_a_res = directory.acquire_lock(&Lock {
            filepath: PathBuf::from("a.lock"),
            is_blocking: true,
        });
        assert!(lock_a_res.is_ok());
        assert!(start.elapsed() >= time::Duration::from_millis(10));
    }
}

View File

@@ -0,0 +1,156 @@
use std::sync::Arc;
use std::sync::RwLock;
use std::sync::Weak;
/// Type alias for callbacks registered when watching files of a `Directory`.
///
/// A callback takes no argument and returns nothing. It must be `Sync + Send`
/// because `WatchCallbackList::broadcast` invokes callbacks from a spawned thread.
pub type WatchCallback = Box<Fn() -> () + Sync + Send>;
/// Helper struct to implement the watch method in `Directory` implementations.
///
/// It registers callbacks (See `.subscribe(...)`) and
/// calls them upon calls to `.broadcast(...)`.
#[derive(Default)]
pub struct WatchCallbackList {
// Only weak references are stored here: the strong reference to each callback
// is owned by the `WatchHandle` returned by `subscribe`.
router: RwLock<Vec<Weak<WatchCallback>>>,
}
/// Controls how long a directory should watch for a file change.
///
/// After all the clones of `WatchHandle` are dropped, the associated callback will not be
/// called when a file change is detected.
#[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
#[derive(Clone)]
pub struct WatchHandle(Arc<WatchCallback>);
impl WatchCallbackList {
    /// Subscribes a new callback and returns the handle that keeps it registered.
    ///
    /// The list itself only keeps a weak reference to the callback.
    pub fn subscribe(&self, watch_callback: WatchCallback) -> WatchHandle {
        let strong_ref = Arc::new(watch_callback);
        let weak_ref = Arc::downgrade(&strong_ref);
        self.router.write().unwrap().push(weak_ref);
        WatchHandle(strong_ref)
    }

    /// Collects strong references to the callbacks that are still alive, and
    /// removes from the list the entries whose handles have all been dropped.
    fn list_callback(&self) -> Vec<Arc<WatchCallback>> {
        let mut alive = Vec::new();
        let mut registered = self.router.write().unwrap();
        let mut position = 0;
        while position < registered.len() {
            match registered[position].upgrade() {
                Some(callback) => {
                    alive.push(callback);
                    position += 1;
                }
                None => {
                    // `swap_remove` moves the last entry into `position`:
                    // the same position has to be examined again.
                    registered.swap_remove(position);
                }
            }
        }
        alive
    }

    /// Triggers all callbacks
    ///
    /// The callbacks are called one after the other from a dedicated thread named
    /// `watch-callbacks`; this method does not wait for them to complete.
    /// A failure to spawn the thread is logged and otherwise ignored.
    pub fn broadcast(&self) {
        let callbacks = self.list_callback();
        let run_callbacks = move || {
            for callback in callbacks {
                callback();
            }
        };
        let spawn_res = std::thread::Builder::new()
            .name("watch-callbacks".to_string())
            .spawn(run_callbacks);
        if let Err(err) = spawn_res {
            error!(
                "Failed to spawn thread to call watch callbacks. Cause: {:?}",
                err
            );
        }
    }
}
// Tests for `WatchCallbackList`.
//
// `broadcast` runs the callbacks on a spawned thread, so every test sleeps for
// `WAIT_TIME` milliseconds before reading the shared counter.
#[cfg(test)]
mod tests {
use directory::WatchCallbackList;
use std::mem;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
// Time, in milliseconds, left to the callback thread before checking the counter.
const WAIT_TIME: u64 = 20;
// A single callback: not called before being subscribed, called once per
// broadcast while its handle is alive, and no longer called once the handle
// has been dropped.
#[test]
fn test_watch_event_router_simple() {
let watch_event_router = WatchCallbackList::default();
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let inc_callback = Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
});
watch_event_router.broadcast();
assert_eq!(0, counter.load(Ordering::SeqCst));
let handle_a = watch_event_router.subscribe(inc_callback);
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(0, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(1, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(4, counter.load(Ordering::SeqCst));
mem::drop(handle_a);
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(4, counter.load(Ordering::SeqCst));
}
// Two callbacks adding respectively 1 and 10 to the same counter: each
// broadcast adds 11 while both handles are alive, 10 once the first handle
// is dropped, and nothing once both are dropped.
#[test]
fn test_watch_event_router_multiple_callback_same_key() {
let watch_event_router = WatchCallbackList::default();
let counter: Arc<AtomicUsize> = Default::default();
let inc_callback = |inc: usize| {
let counter_clone = counter.clone();
Box::new(move || {
counter_clone.fetch_add(inc, Ordering::SeqCst);
})
};
let handle_a = watch_event_router.subscribe(inc_callback(1));
let handle_a2 = watch_event_router.subscribe(inc_callback(10));
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(0, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(22, counter.load(Ordering::SeqCst));
mem::drop(handle_a);
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(32, counter.load(Ordering::SeqCst));
mem::drop(handle_a2);
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(32, counter.load(Ordering::SeqCst));
}
// NOTE(review): despite its name, this test registers a single callback and
// `WatchCallbackList` has no notion of key — it checks again that broadcasts
// stop reaching a callback once its handle is dropped.
#[test]
fn test_watch_event_router_multiple_callback_different_key() {
let watch_event_router = WatchCallbackList::default();
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let inc_callback = Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
});
let handle_a = watch_event_router.subscribe(inc_callback);
assert_eq!(0, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(2, counter.load(Ordering::SeqCst));
thread::sleep(Duration::from_millis(WAIT_TIME));
mem::drop(handle_a);
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(2, counter.load(Ordering::SeqCst));
}
}

View File

@@ -1,4 +1,5 @@
use common::BitSet; use common::BitSet;
use fastfield::DeleteBitSet;
use std::borrow::Borrow; use std::borrow::Borrow;
use std::borrow::BorrowMut; use std::borrow::BorrowMut;
use std::cmp::Ordering; use std::cmp::Ordering;
@@ -95,9 +96,23 @@ pub trait DocSet {
} }
/// Returns the number documents matching. /// Returns the number documents matching.
///
/// Calling this method consumes the `DocSet`. /// Calling this method consumes the `DocSet`.
fn count(&mut self) -> u32 { fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
let mut count = 0u32;
while self.advance() {
if !delete_bitset.is_deleted(self.doc()) {
count += 1u32;
}
}
count
}
/// Returns the count of documents, deleted or not.
/// Calling this method consumes the `DocSet`.
///
/// Of course, the result is an upper bound of the result
/// given by `count()`.
fn count_including_deleted(&mut self) -> u32 {
let mut count = 0u32; let mut count = 0u32;
while self.advance() { while self.advance() {
count += 1u32; count += 1u32;
@@ -127,9 +142,14 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.size_hint() unboxed.size_hint()
} }
fn count(&mut self) -> u32 { fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut(); let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count() unboxed.count(delete_bitset)
}
fn count_including_deleted(&mut self) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count_including_deleted()
} }
fn append_to_bitset(&mut self, bitset: &mut BitSet) { fn append_to_bitset(&mut self, bitset: &mut BitSet) {

View File

@@ -2,53 +2,93 @@
use std::io; use std::io;
use directory::error::LockError;
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError}; use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use fastfield::FastFieldNotAvailableError; use fastfield::FastFieldNotAvailableError;
use indexer::LockType;
use query; use query;
use schema; use schema;
use serde_json; use serde_json;
use std::fmt;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::PoisonError; use std::sync::PoisonError;
/// Describes corrupted data, optionally tied to the file in which it was found.
pub struct DataCorruption {
    filepath: Option<PathBuf>,
    comment: String,
}

impl DataCorruption {
    /// Creates a `DataCorruption` for the file located at `filepath`.
    ///
    /// `comment` is a human readable description of the problem.
    pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
        DataCorruption {
            filepath: Some(filepath),
            comment,
        }
    }

    /// Creates a `DataCorruption` that is not associated with any specific file.
    pub fn comment_only(comment: String) -> DataCorruption {
        DataCorruption {
            filepath: None,
            comment,
        }
    }
}

impl fmt::Debug for DataCorruption {
    /// Formats the error as ``Data corruption (in file `<path>`): <comment>.``,
    /// or `Data corruption: <comment>.` when no file is attached.
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        // The separating colon is written exactly once, after the optional file
        // part, so that the message never contains a doubled ": :".
        write!(f, "Data corruption")?;
        if let Some(ref filepath) = self.filepath {
            write!(f, " (in file `{:?}`)", filepath)?;
        }
        write!(f, ": {}.", self.comment)
    }
}
/// The library's failure based error enum /// The library's failure based error enum
#[derive(Debug, Fail)] #[derive(Debug, Fail)]
pub enum TantivyError { pub enum TantivyError {
/// Path does not exist. /// Path does not exist.
#[fail(display = "path does not exist: '{:?}'", _0)] #[fail(display = "Path does not exist: '{:?}'", _0)]
PathDoesNotExist(PathBuf), PathDoesNotExist(PathBuf),
/// File already exists, this is a problem when we try to write into a new file. /// File already exists, this is a problem when we try to write into a new file.
#[fail(display = "file already exists: '{:?}'", _0)] #[fail(display = "File already exists: '{:?}'", _0)]
FileAlreadyExists(PathBuf), FileAlreadyExists(PathBuf),
/// Index already exists in this directory
#[fail(display = "Index already exists")]
IndexAlreadyExists,
/// Failed to acquire file lock /// Failed to acquire file lock
#[fail( #[fail(display = "Failed to acquire Lockfile: {:?}. {:?}", _0, _1)]
display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.", LockFailure(LockError, Option<String>),
_0
)]
LockFailure(LockType),
/// IO Error. /// IO Error.
#[fail(display = "an IO error occurred: '{}'", _0)] #[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError), IOError(#[cause] IOError),
/// The data within is corrupted. /// Data corruption.
/// #[fail(display = "{:?}", _0)]
/// For instance, it contains invalid JSON. DataCorruption(DataCorruption),
#[fail(display = "file contains corrupted data: '{:?}'", _0)]
CorruptedFile(PathBuf),
/// A thread holding the locked panicked and poisoned the lock. /// A thread holding the locked panicked and poisoned the lock.
#[fail(display = "a thread holding the locked panicked and poisoned the lock")] #[fail(display = "A thread holding the locked panicked and poisoned the lock")]
Poisoned, Poisoned,
/// Invalid argument was passed by the user. /// Invalid argument was passed by the user.
#[fail(display = "an invalid argument was passed: '{}'", _0)] #[fail(display = "An invalid argument was passed: '{}'", _0)]
InvalidArgument(String), InvalidArgument(String),
/// An Error happened in one of the thread. /// An Error happened in one of the thread.
#[fail(display = "an error occurred in a thread: '{}'", _0)] #[fail(display = "An error occurred in a thread: '{}'", _0)]
ErrorInThread(String), ErrorInThread(String),
/// An Error appeared related to the schema. /// An Error appeared related to the schema.
#[fail(display = "Schema error: '{}'", _0)] #[fail(display = "Schema error: '{}'", _0)]
SchemaError(String), SchemaError(String),
/// Tried to access a fastfield reader for a field not configured accordingly. /// Tried to access a fastfield reader for a field not configured accordingly.
#[fail(display = "fast field not available: '{:?}'", _0)] #[fail(display = "Fast field not available: '{:?}'", _0)]
FastFieldError(#[cause] FastFieldNotAvailableError), FastFieldError(#[cause] FastFieldNotAvailableError),
/// System error. (e.g.: We failed spawning a new thread)
#[fail(display = "System error.'{}'", _0)]
SystemError(String),
}
impl From<DataCorruption> for TantivyError {
fn from(data_corruption: DataCorruption) -> TantivyError {
TantivyError::DataCorruption(data_corruption)
}
} }
impl From<FastFieldNotAvailableError> for TantivyError { impl From<FastFieldNotAvailableError> for TantivyError {
@@ -57,6 +97,12 @@ impl From<FastFieldNotAvailableError> for TantivyError {
} }
} }
/// Wraps a `LockError` into `TantivyError::LockFailure`, without any additional message.
impl From<LockError> for TantivyError {
    fn from(lock_error: LockError) -> TantivyError {
        let no_extra_message: Option<String> = None;
        TantivyError::LockFailure(lock_error, no_extra_message)
    }
}
impl From<IOError> for TantivyError { impl From<IOError> for TantivyError {
fn from(io_error: IOError) -> TantivyError { fn from(io_error: IOError) -> TantivyError {
TantivyError::IOError(io_error) TantivyError::IOError(io_error)
@@ -116,6 +162,7 @@ impl From<OpenDirectoryError> for TantivyError {
OpenDirectoryError::NotADirectory(directory_path) => { OpenDirectoryError::NotADirectory(directory_path) => {
TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path)) TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
} }
OpenDirectoryError::IoError(err) => TantivyError::IOError(IOError::from(err)),
} }
} }
} }

View File

@@ -6,12 +6,12 @@ pub use self::writer::BytesFastFieldWriter;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use schema::SchemaBuilder; use schema::Schema;
use Index; use Index;
#[test] #[test]
fn test_bytes() { fn test_bytes() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_bytes_field("bytesfield"); let field = schema_builder.add_bytes_field("bytesfield");
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -22,17 +22,15 @@ mod tests {
index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9])); index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9]));
index_writer.add_document(doc!(field=>vec![0u8; 1000])); index_writer.add_document(doc!(field=>vec![0u8; 1000]));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let bytes_reader = segment_reader.fast_fields().bytes(field).unwrap();
index.load_searchers().unwrap(); assert_eq!(bytes_reader.get_bytes(0), &[0u8, 1, 2, 3]);
let searcher = index.searcher(); assert!(bytes_reader.get_bytes(1).is_empty());
let reader = searcher.segment_reader(0); assert_eq!(bytes_reader.get_bytes(2), &[255u8]);
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap(); assert_eq!(bytes_reader.get_bytes(3), &[1u8, 3, 5, 7, 9]);
assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
assert!(bytes_reader.get_val(1).is_empty());
assert_eq!(bytes_reader.get_val(2), &[255u8]);
assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
let long = vec![0u8; 1000]; let long = vec![0u8; 1000];
assert_eq!(bytes_reader.get_val(4), long.as_slice()); assert_eq!(bytes_reader.get_bytes(4), long.as_slice());
} }
} }

View File

@@ -14,6 +14,7 @@ use DocId;
/// ///
/// Reading the value for a document is done by reading the start index for it, /// Reading the value for a document is done by reading the start index for it,
/// and the start index for the next document, and keeping the bytes in between. /// and the start index for the next document, and keeping the bytes in between.
#[derive(Clone)]
pub struct BytesFastFieldReader { pub struct BytesFastFieldReader {
idx_reader: FastFieldReader<u64>, idx_reader: FastFieldReader<u64>,
values: OwningRef<ReadOnlySource, [u8]>, values: OwningRef<ReadOnlySource, [u8]>,
@@ -28,10 +29,20 @@ impl BytesFastFieldReader {
BytesFastFieldReader { idx_reader, values } BytesFastFieldReader { idx_reader, values }
} }
/// Returns the bytes associated to the given `doc` fn range(&self, doc: DocId) -> (usize, usize) {
pub fn get_val(&self, doc: DocId) -> &[u8] {
let start = self.idx_reader.get(doc) as usize; let start = self.idx_reader.get(doc) as usize;
let stop = self.idx_reader.get(doc + 1) as usize; let stop = self.idx_reader.get(doc + 1) as usize;
(start, stop)
}
/// Returns the bytes associated to the given `doc`
pub fn get_bytes(&self, doc: DocId) -> &[u8] {
let (start, stop) = self.range(doc);
&self.values[start..stop] &self.values[start..stop]
} }
/// Returns the overall number of bytes in this bytes fast field.
///
/// This is the length of the `values` buffer, which holds the bytes of all documents.
pub fn total_num_bytes(&self) -> usize {
self.values.len()
}
} }

View File

@@ -53,16 +53,18 @@ impl DeleteBitSet {
} }
} }
/// Returns whether the document has been marked as deleted. /// Returns true iff the document is still "alive". In other words, if it has not been deleted.
pub fn is_alive(&self, doc: DocId) -> bool {
// Exact negation of `is_deleted`.
!self.is_deleted(doc)
}
/// Returns true iff the document has been marked as deleted.
#[inline(always)]
pub fn is_deleted(&self, doc: DocId) -> bool { pub fn is_deleted(&self, doc: DocId) -> bool {
if self.len == 0 { let byte_offset = doc / 8u32;
false let b: u8 = (*self.data)[byte_offset as usize];
} else { let shift = (doc & 7u32) as u8;
let byte_offset = doc / 8u32; b & (1u8 << shift) != 0
let b: u8 = (*self.data)[byte_offset as usize];
let shift = (doc & 7u32) as u8;
b & (1u8 << shift) != 0
}
} }
/// Summarize total space usage of this bitset. /// Summarize total space usage of this bitset.

View File

@@ -1,5 +1,6 @@
use super::MultiValueIntFastFieldReader; use super::MultiValueIntFastFieldReader;
use schema::Facet; use schema::Facet;
use std::str;
use termdict::TermDictionary; use termdict::TermDictionary;
use termdict::TermOrdinal; use termdict::TermOrdinal;
use DocId; use DocId;
@@ -20,6 +21,7 @@ use DocId;
pub struct FacetReader { pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>, term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary, term_dict: TermDictionary,
buffer: Vec<u8>,
} }
impl FacetReader { impl FacetReader {
@@ -37,6 +39,7 @@ impl FacetReader {
FacetReader { FacetReader {
term_ords, term_ords,
term_dict, term_dict,
buffer: vec![],
} }
} }
@@ -55,11 +58,18 @@ impl FacetReader {
} }
/// Given a term ordinal returns the term associated to it. /// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) { pub fn facet_from_ord(
&mut self,
facet_ord: TermOrdinal,
output: &mut Facet,
) -> Result<(), str::Utf8Error> {
let found_term = self let found_term = self
.term_dict .term_dict
.ord_to_term(facet_ord as u64, output.inner_buffer_mut()); .ord_to_term(facet_ord as u64, &mut self.buffer);
assert!(found_term, "Term ordinal {} no found.", facet_ord); assert!(found_term, "Term ordinal {} no found.", facet_ord);
let facet_str = str::from_utf8(&self.buffer[..])?;
output.set_facet_str(facet_str);
Ok(())
} }
/// Return the list of facet ordinals associated to a document. /// Return the list of facet ordinals associated to a document.

View File

@@ -30,6 +30,7 @@ pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader; pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter}; pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
pub use self::reader::FastFieldReader; pub use self::reader::FastFieldReader;
pub use self::readers::FastFieldReaders;
pub use self::serializer::FastFieldSerializer; pub use self::serializer::FastFieldSerializer;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter}; pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use common; use common;
@@ -43,6 +44,7 @@ mod error;
mod facet_reader; mod facet_reader;
mod multivalued; mod multivalued;
mod reader; mod reader;
mod readers;
mod serializer; mod serializer;
mod writer; mod writer;
@@ -78,10 +80,6 @@ impl FastValue for u64 {
*self *self
} }
fn as_u64(&self) -> u64 {
*self
}
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> { fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
match *field_type { match *field_type {
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(), FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
@@ -89,6 +87,10 @@ impl FastValue for u64 {
_ => None, _ => None,
} }
} }
fn as_u64(&self) -> u64 {
*self
}
} }
impl FastValue for i64 { impl FastValue for i64 {
@@ -127,19 +129,19 @@ mod tests {
use common::CompositeFile; use common::CompositeFile;
use directory::{Directory, RAMDirectory, WritePtr}; use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::FastFieldReader; use fastfield::FastFieldReader;
use rand::Rng; use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::SeedableRng; use rand::SeedableRng;
use rand::XorShiftRng;
use schema::Document; use schema::Document;
use schema::Field; use schema::Field;
use schema::Schema;
use schema::FAST; use schema::FAST;
use schema::{Schema, SchemaBuilder};
use std::collections::HashMap; use std::collections::HashMap;
use std::path::Path; use std::path::Path;
lazy_static! { lazy_static! {
pub static ref SCHEMA: Schema = { pub static ref SCHEMA: Schema = {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
schema_builder.add_u64_field("field", FAST); schema_builder.add_u64_field("field", FAST);
schema_builder.build() schema_builder.build()
}; };
@@ -298,7 +300,7 @@ mod tests {
fn test_signed_intfastfield() { fn test_signed_intfastfield() {
let path = Path::new("test"); let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create(); let mut directory: RAMDirectory = RAMDirectory::create();
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let i64_field = schema_builder.add_i64_field("field", FAST); let i64_field = schema_builder.add_i64_field("field", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -342,7 +344,7 @@ mod tests {
fn test_signed_intfastfield_default_val() { fn test_signed_intfastfield_default_val() {
let path = Path::new("test"); let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create(); let mut directory: RAMDirectory = RAMDirectory::create();
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let i64_field = schema_builder.add_i64_field("field", FAST); let i64_field = schema_builder.add_i64_field("field", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
@@ -367,11 +369,10 @@ mod tests {
} }
} }
// Warning: this generates the same permutation at each call
pub fn generate_permutation() -> Vec<u64> { pub fn generate_permutation() -> Vec<u64> {
let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
let mut rng = XorShiftRng::from_seed(seed);
let mut permutation: Vec<u64> = (0u64..100_000u64).collect(); let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
rng.shuffle(&mut permutation); permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
permutation permutation
} }

View File

@@ -7,14 +7,20 @@ pub use self::writer::MultiValueIntFastFieldWriter;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
extern crate time;
use self::time::Duration;
use collector::TopDocs;
use query::QueryParser;
use schema::Cardinality; use schema::Cardinality;
use schema::Facet;
use schema::IntOptions; use schema::IntOptions;
use schema::SchemaBuilder; use schema::Schema;
use Index; use Index;
#[test] #[test]
fn test_multivalued_u64() { fn test_multivalued_u64() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field( let field = schema_builder.add_u64_field(
"multifield", "multifield",
IntOptions::default().set_fast(Cardinality::MultiValues), IntOptions::default().set_fast(Cardinality::MultiValues),
@@ -28,11 +34,10 @@ mod tests {
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64)); index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
index.load_searchers().unwrap(); let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher(); let segment_reader = searcher.segment_reader(0);
let reader = searcher.segment_reader(0);
let mut vals = Vec::new(); let mut vals = Vec::new();
let multi_value_reader = reader.multi_fast_field_reader::<u64>(field).unwrap(); let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
{ {
multi_value_reader.get_vals(2, &mut vals); multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4u64]); assert_eq!(&vals, &[4u64]);
@@ -47,9 +52,136 @@ mod tests {
} }
} }
// Indexes four documents in a schema made of a multi-valued, indexed and stored
// date field, plus a stored i64 field used to tell the documents apart:
//   time_i = 1: the first timestamp, twice
//   time_i = 0: no date at all
//   time_i = 2: the first timestamp + 1 second
//   time_i = 3: the first timestamp + 2 seconds, three times
// It then checks that a query on an exact date returns exactly one document, even
// when that date appears several times in the document.
#[test]
fn test_multivalued_date() {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field(
"multi_date_field",
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed()
.set_stored(),
);
let time_i =
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let first_time_stamp = chrono::Utc::now();
index_writer.add_document(
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
);
index_writer.add_document(doc!(time_i=>0i64));
// add one second
index_writer
.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
// add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
// `reader` is shadowed here: from now on it designates the first segment reader.
let reader = searcher.segment_reader(0);
assert_eq!(reader.num_docs(), 4);
// Query on the first timestamp: only the document with time_i = 1 is expected.
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()).to_string())
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 1);
for (_score, doc_address) in results {
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
assert_eq!(
retrieved_doc
.get_first(date_field)
.expect("cannot find value")
.date_value()
.timestamp(),
first_time_stamp.timestamp()
);
assert_eq!(
retrieved_doc
.get_first(time_i)
.expect("cannot find value")
.i64_value(),
1i64
);
}
}
// Query on the timestamp two seconds later: only the document with time_i = 3 is expected.
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()).to_string())
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 1);
for (_score, doc_address) in results {
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
assert_eq!(
retrieved_doc
.get_first(date_field)
.expect("cannot find value")
.date_value()
.timestamp(),
two_secs_ahead.timestamp()
);
assert_eq!(
retrieved_doc
.get_first(time_i)
.expect("cannot find value")
.i64_value(),
3i64
);
}
}
// TODO: support Date range queries
// {
// let parser = QueryParser::for_index(&index, vec![date_field]);
// let range_q = format!("\"{}\"..\"{}\"",
// (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
// (first_time_stamp + Duration::seconds(3)).to_rfc3339()
// );
// let query = parser.parse_query(&range_q)
// .expect("could not parse query");
// let results = searcher.search(&query, &TopDocs::with_limit(5))
// .expect("could not query index");
//
//
// assert_eq!(results.len(), 2);
// for (i, doc_pair) in results.iter().enumerate() {
// let retrieved_doc = searcher.doc(doc_pair.1).expect("cannot fetch doc");
// let offset_sec = match i {
// 0 => 1,
// 1 => 3,
// _ => panic!("should not have more than 2 docs")
// };
// let time_i_val = match i {
// 0 => 2,
// 1 => 3,
// _ => panic!("should not have more than 2 docs")
// };
// assert_eq!(retrieved_doc.get_first(date_field).expect("cannot find value").date_value().timestamp(),
// (first_time_stamp + Duration::seconds(offset_sec)).timestamp());
// assert_eq!(retrieved_doc.get_first(time_i).expect("cannot find value").i64_value(), time_i_val);
// }
// }
}
#[test] #[test]
fn test_multivalued_i64() { fn test_multivalued_i64() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_i64_field( let field = schema_builder.add_i64_field(
"multifield", "multifield",
IntOptions::default().set_fast(Cardinality::MultiValues), IntOptions::default().set_fast(Cardinality::MultiValues),
@@ -63,11 +195,10 @@ mod tests {
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64)); index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
index.load_searchers().unwrap(); let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher(); let segment_reader = searcher.segment_reader(0);
let reader = searcher.segment_reader(0);
let mut vals = Vec::new(); let mut vals = Vec::new();
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap(); let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
{ {
multi_value_reader.get_vals(2, &mut vals); multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[-4i64]); assert_eq!(&vals, &[-4i64]);
@@ -85,4 +216,17 @@ mod tests {
assert_eq!(&vals, &[-5i64, -20i64, 1i64]); assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
} }
} }
#[test]
#[ignore]
fn test_many_facets() {
    // Indexes 100_000 documents, each carrying its own distinct facet,
    // with a single indexing thread, and checks that the commit succeeds.
    let mut schema_builder = Schema::builder();
    let facet_field = schema_builder.add_facet_field("facetfield");
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    for facet_id in 0..100_000 {
        let facet_path = format!("/lang/{}", facet_id);
        index_writer.add_document(doc!(facet_field => Facet::from(facet_path.as_str())));
    }
    assert!(index_writer.commit().is_ok());
}
} }

View File

@@ -26,6 +26,13 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
} }
} }
/// Consumes this reader and returns a multi-valued reader over `u64`.
///
/// The index reader is reused as is; only the values reader is converted,
/// through `into_u64_reader`.
pub(crate) fn into_u64s_reader(self) -> MultiValueIntFastFieldReader<u64> {
    let idx_reader = self.idx_reader;
    let vals_reader = self.vals_reader.into_u64_reader();
    MultiValueIntFastFieldReader {
        idx_reader,
        vals_reader,
    }
}
/// Returns `(start, stop)`, such that the values associated /// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`. /// to the given document are `start..stop`.
fn range(&self, doc: DocId) -> (u64, u64) { fn range(&self, doc: DocId) -> (u64, u64) {
@@ -39,7 +46,18 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
let (start, stop) = self.range(doc); let (start, stop) = self.range(doc);
let len = (stop - start) as usize; let len = (stop - start) as usize;
vals.resize(len, Item::default()); vals.resize(len, Item::default());
self.vals_reader.get_range(start as u32, &mut vals[..]); self.vals_reader.get_range_u64(start, &mut vals[..]);
}
/// Returns the number of values associated with the document `doc`.
///
/// This is the length of the `start..stop` interval returned by `range`.
pub fn num_vals(&self, doc: DocId) -> usize {
    let (first, end) = self.range(doc);
    let count = end - first;
    count as usize
}
/// Returns the overall number of values in this field.
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
} }
} }
@@ -47,55 +65,44 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
mod tests { mod tests {
use core::Index; use core::Index;
use schema::{Document, Facet, SchemaBuilder}; use schema::{Facet, Schema};
#[test] #[test]
fn test_multifastfield_reader() { fn test_multifastfield_reader() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets"); let facet_field = schema_builder.add_facet_field("facets");
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index let mut index_writer = index
.writer_with_num_threads(1, 30_000_000) .writer_with_num_threads(1, 30_000_000)
.expect("Failed to create index writer."); .expect("Failed to create index writer.");
{ index_writer.add_document(doc!(
let mut doc = Document::new(); facet_field => Facet::from("/category/cat2"),
doc.add_facet(facet_field, "/category/cat2"); facet_field => Facet::from("/category/cat1"),
doc.add_facet(facet_field, "/category/cat1"); ));
index_writer.add_document(doc); index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
} index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
{
let mut doc = Document::new();
doc.add_facet(facet_field, "/category/cat2");
index_writer.add_document(doc);
}
{
let mut doc = Document::new();
doc.add_facet(facet_field, "/category/cat3");
index_writer.add_document(doc);
}
index_writer.commit().expect("Commit failed"); index_writer.commit().expect("Commit failed");
index.load_searchers().expect("Reloading searchers"); let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap(); let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
let mut facet = Facet::root(); let mut facet = Facet::root();
{ {
facet_reader.facet_from_ord(1, &mut facet); facet_reader.facet_from_ord(1, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category")); assert_eq!(facet, Facet::from("/category"));
} }
{ {
facet_reader.facet_from_ord(2, &mut facet); facet_reader.facet_from_ord(2, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category/cat1")); assert_eq!(facet, Facet::from("/category/cat1"));
} }
{ {
facet_reader.facet_from_ord(3, &mut facet); facet_reader.facet_from_ord(3, &mut facet).unwrap();
assert_eq!(format!("{}", facet), "/category/cat2"); assert_eq!(format!("{}", facet), "/category/cat2");
assert_eq!(facet, Facet::from("/category/cat2")); assert_eq!(facet, Facet::from("/category/cat2"));
} }
{ {
facet_reader.facet_from_ord(4, &mut facet); facet_reader.facet_from_ord(4, &mut facet).unwrap();
assert_eq!(facet, Facet::from("/category/cat3")); assert_eq!(facet, Facet::from("/category/cat3"));
} }

View File

@@ -32,7 +32,7 @@ use DocId;
/// term ids when the segment is getting serialized. /// term ids when the segment is getting serialized.
pub struct MultiValueIntFastFieldWriter { pub struct MultiValueIntFastFieldWriter {
field: Field, field: Field,
vals: Vec<u64>, vals: Vec<UnorderedTermId>,
doc_index: Vec<u64>, doc_index: Vec<u64>,
is_facet: bool, is_facet: bool,
} }

View File

@@ -7,7 +7,7 @@ use directory::ReadOnlySource;
use directory::{Directory, RAMDirectory, WritePtr}; use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::{FastFieldSerializer, FastFieldsWriter}; use fastfield::{FastFieldSerializer, FastFieldsWriter};
use owning_ref::OwningRef; use owning_ref::OwningRef;
use schema::SchemaBuilder; use schema::Schema;
use schema::FAST; use schema::FAST;
use std::collections::HashMap; use std::collections::HashMap;
use std::marker::PhantomData; use std::marker::PhantomData;
@@ -50,6 +50,15 @@ impl<Item: FastValue> FastFieldReader<Item> {
} }
} }
/// Consumes this reader and returns a reader yielding `u64` values.
///
/// The bit unpacker and the `u64` min/max bounds are moved over unchanged;
/// only the phantom item type differs.
pub(crate) fn into_u64_reader(self) -> FastFieldReader<u64> {
    let FastFieldReader {
        bit_unpacker,
        min_value_u64,
        max_value_u64,
        ..
    } = self;
    FastFieldReader {
        bit_unpacker,
        min_value_u64,
        max_value_u64,
        _phantom: PhantomData,
    }
}
/// Return the value associated to the given document. /// Return the value associated to the given document.
/// ///
/// This accessor should return as fast as possible. /// This accessor should return as fast as possible.
@@ -59,7 +68,29 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// May panic if `doc` is greater than the segment /// May panic if `doc` is greater than the segment
// `maxdoc`. // `maxdoc`.
pub fn get(&self, doc: DocId) -> Item { pub fn get(&self, doc: DocId) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize)) self.get_u64(u64::from(doc))
}
/// Returns the value stored at position `doc`, where the position is
/// given as a `u64` rather than a `DocId`.
///
/// The bit-packed value is added to `min_value_u64` and then converted
/// with `Item::from_u64`.
pub(crate) fn get_u64(&self, doc: u64) -> Item {
    let delta = self.bit_unpacker.get(doc);
    Item::from_u64(self.min_value_u64 + delta)
}
/// Internally, `multivalued` fast fields also use SingleValue fast fields.
/// It works as follows... A first column contains the list of start index
/// for each document, a second column contains the actual values.
///
/// The values associated to a given doc, are then
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
///
/// Which means single value fast field reader can be indexed internally with
/// something different from a `DocId`. For this use case, we want to use `u64`
/// values.
///
/// See `get_range` for an actual documentation about this method.
pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
for (i, out) in output.iter_mut().enumerate() {
*out = self.get_u64(start + (i as u64));
}
} }
/// Fills an output buffer with the fast field values /// Fills an output buffer with the fast field values
@@ -75,16 +106,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// ///
/// May panic if `start + output.len()` is greater than /// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`. /// the segment's `maxdoc`.
/// pub fn get_range(&self, start: DocId, output: &mut [Item]) {
// TODO change start to `u64`. self.get_range_u64(u64::from(start), output);
// For multifastfield, start is an index in a second fastfield, not a `DocId`
pub fn get_range(&self, start: u32, output: &mut [Item]) {
// ok: Item is either `u64` or `i64`
let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) };
self.bit_unpacker.get_range(start, output_u64);
for out in output_u64.iter_mut() {
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
}
} }
/// Returns the minimum value for this fast field. /// Returns the minimum value for this fast field.
@@ -108,7 +131,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> { impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
fn from(vals: Vec<Item>) -> FastFieldReader<Item> { fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST); let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let path = Path::new("__dummy__"); let path = Path::new("__dummy__");

191
src/fastfield/readers.rs Normal file
View File

@@ -0,0 +1,191 @@
use common::CompositeFile;
use fastfield::BytesFastFieldReader;
use fastfield::MultiValueIntFastFieldReader;
use fastfield::{FastFieldNotAvailableError, FastFieldReader};
use schema::{Cardinality, Field, FieldType, Schema};
use space_usage::PerFieldSpaceUsage;
use std::collections::HashMap;
use Result;
/// Provides access to all of the preloaded fast field readers.
///
/// Internally, `FastFieldReaders` holds fast field readers that were opened
/// ahead of time by `load_all`, and just wraps several `HashMap`s keyed
/// by `Field`.
pub struct FastFieldReaders {
// Single-valued `i64` fast field readers.
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
// Single-valued `u64` fast field readers.
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
// Multi-valued `i64` fast field readers.
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
// Multi-valued `u64` fast field readers (this is also where
// hierarchical facet fields end up).
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
// Readers for `Bytes` fields.
fast_bytes: HashMap<Field, BytesFastFieldReader>,
// Clone of the composite file the readers were opened from;
// it is what `space_usage` reports on.
fast_fields_composite: CompositeFile,
}
/// Integer type under which the values of a fast field are read.
///
/// Produced by `type_and_cardinality`; note that hierarchical facet
/// fields are reported as `U64`.
enum FastType {
// Values are read as `i64`.
I64,
// Values are read as `u64`.
U64,
}
/// Returns the integer type and the cardinality under which `field_type`
/// is stored as a fast field.
///
/// `None` is returned for `u64`/`i64` fields whose options report no fast
/// field cardinality, and for every field type other than `U64`, `I64`
/// and `HierarchicalFacet`. Hierarchical facets are always reported as
/// multi-valued `u64`.
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
    let (fast_type, cardinality_opt) = match field_type {
        FieldType::U64(options) => (FastType::U64, options.get_fastfield_cardinality()),
        FieldType::I64(options) => (FastType::I64, options.get_fastfield_cardinality()),
        FieldType::HierarchicalFacet => (FastType::U64, Some(Cardinality::MultiValues)),
        _ => return None,
    };
    cardinality_opt.map(|cardinality| (fast_type, cardinality))
}
impl FastFieldReaders {
    /// Opens a reader for every fast field declared in `schema`.
    ///
    /// Fields are visited in the order of `schema.fields()`; the position of
    /// an entry in that list is used as its `Field` id. `Bytes` fields and
    /// the field types recognized by `type_and_cardinality` get a reader;
    /// all other fields are ignored.
    ///
    /// # Errors
    ///
    /// Returns an error built from `FastFieldNotAvailableError` as soon as
    /// the data expected for one of these fields cannot be opened from
    /// `fast_fields_composite`.
    pub(crate) fn load_all(
        schema: &Schema,
        fast_fields_composite: &CompositeFile,
    ) -> Result<FastFieldReaders> {
        let mut fast_field_readers = FastFieldReaders {
            fast_field_i64: Default::default(),
            fast_field_u64: Default::default(),
            fast_field_i64s: Default::default(),
            fast_field_u64s: Default::default(),
            fast_bytes: Default::default(),
            fast_fields_composite: fast_fields_composite.clone(),
        };
        for (field_id, field_entry) in schema.fields().iter().enumerate() {
            let field = Field(field_id as u32);
            let field_type = field_entry.field_type();
            if field_type == &FieldType::Bytes {
                // Bytes fields are read from two parts of the composite file:
                // part 0 is opened as a fast field reader (the index),
                // part 1 is handed over as the raw data.
                let idx_reader = fast_fields_composite
                    .open_read_with_idx(field, 0)
                    .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
                    .map(FastFieldReader::open)?;
                let data = fast_fields_composite
                    .open_read_with_idx(field, 1)
                    .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
                fast_field_readers
                    .fast_bytes
                    .insert(field, BytesFastFieldReader::open(idx_reader, data));
            } else if let Some((fast_type, cardinality)) = type_and_cardinality(field_type) {
                match cardinality {
                    Cardinality::SingleValue => {
                        let fast_field_data = fast_fields_composite
                            .open_read(field)
                            .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
                        // `fast_field_data` is owned and consumed exactly once,
                        // so it is moved into the reader in both branches.
                        match fast_type {
                            FastType::U64 => {
                                fast_field_readers
                                    .fast_field_u64
                                    .insert(field, FastFieldReader::open(fast_field_data));
                            }
                            FastType::I64 => {
                                fast_field_readers
                                    .fast_field_i64
                                    .insert(field, FastFieldReader::open(fast_field_data));
                            }
                        }
                    }
                    Cardinality::MultiValues => {
                        // Multi-valued fields use two parts as well: part 0 is
                        // opened as the index reader, part 1 as the values reader.
                        let idx_opt = fast_fields_composite.open_read_with_idx(field, 0);
                        let data_opt = fast_fields_composite.open_read_with_idx(field, 1);
                        let (fast_field_idx, fast_field_data) = match (idx_opt, data_opt) {
                            (Some(idx), Some(data)) => (idx, data),
                            _ => {
                                return Err(From::from(FastFieldNotAvailableError::new(
                                    field_entry,
                                )));
                            }
                        };
                        let idx_reader = FastFieldReader::open(fast_field_idx);
                        match fast_type {
                            FastType::I64 => {
                                let vals_reader = FastFieldReader::open(fast_field_data);
                                let multivalued_int_fast_field =
                                    MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
                                fast_field_readers
                                    .fast_field_i64s
                                    .insert(field, multivalued_int_fast_field);
                            }
                            FastType::U64 => {
                                let vals_reader = FastFieldReader::open(fast_field_data);
                                let multivalued_int_fast_field =
                                    MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
                                fast_field_readers
                                    .fast_field_u64s
                                    .insert(field, multivalued_int_fast_field);
                            }
                        }
                    }
                }
            }
        }
        Ok(fast_field_readers)
    }

    /// Returns the space usage reported by the underlying composite file.
    pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
        self.fast_fields_composite.space_usage()
    }

    /// Returns the `u64` fast field reader associated to `field`.
    ///
    /// If `field` is not a u64 fast field, this method returns `None`.
    pub fn u64(&self, field: Field) -> Option<FastFieldReader<u64>> {
        self.fast_field_u64.get(&field).cloned()
    }

    /// If the field is a u64 fast field, returns the associated reader.
    /// If the field is an i64 fast field, returns the associated reader
    /// converted into a u64 reader: values are mapped from i64 to u64
    /// using a monotonic mapping.
    ///
    /// Returns `None` if the field is neither.
    ///
    /// This method is useful when merging segment readers.
    pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
        self.u64(field).or_else(|| {
            self.i64(field)
                .map(|i64_ff_reader| i64_ff_reader.into_u64_reader())
        })
    }

    /// Returns the `i64` fast field reader associated to `field`.
    ///
    /// If `field` is not an i64 fast field, this method returns `None`.
    pub fn i64(&self, field: Field) -> Option<FastFieldReader<i64>> {
        self.fast_field_i64.get(&field).cloned()
    }

    /// Returns the `u64s` multi-valued fast field reader associated to `field`.
    ///
    /// If `field` is not a u64 multi-valued fast field, this method returns `None`.
    pub fn u64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
        self.fast_field_u64s.get(&field).cloned()
    }

    /// If the field is a u64s fast field, returns the associated reader.
    /// If the field is an i64s fast field, returns the associated reader
    /// converted into a u64s reader: values are mapped from i64 to u64
    /// using a monotonic mapping.
    ///
    /// Returns `None` if the field is neither.
    ///
    /// This method is useful when merging segment readers.
    pub(crate) fn u64s_lenient(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
        self.u64s(field).or_else(|| {
            self.i64s(field)
                .map(|i64s_ff_reader| i64s_ff_reader.into_u64s_reader())
        })
    }

    /// Returns the `i64s` multi-valued fast field reader associated to `field`.
    ///
    /// If `field` is not an i64 multi-valued fast field, this method returns `None`.
    pub fn i64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<i64>> {
        self.fast_field_i64s.get(&field).cloned()
    }

    /// Returns the `bytes` fast field reader associated to `field`.
    ///
    /// If `field` is not a bytes fast field, returns `None`.
    pub fn bytes(&self, field: Field) -> Option<BytesFastFieldReader> {
        self.fast_bytes.get(&field).cloned()
    }
}

View File

@@ -15,7 +15,7 @@
//! precompute computationally expensive functions of the fieldnorm //! precompute computationally expensive functions of the fieldnorm
//! in a very short array. //! in a very short array.
//! //!
//! This trick is used by the [BM25 similarity](). //! This trick is used by the BM25 similarity.
mod code; mod code;
mod reader; mod reader;
mod serializer; mod serializer;

View File

@@ -1,7 +1,6 @@
use rand::thread_rng; use rand::thread_rng;
use std::collections::HashSet; use std::collections::HashSet;
use rand::distributions::Range;
use rand::Rng; use rand::Rng;
use schema::*; use schema::*;
use Index; use Index;
@@ -14,17 +13,16 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
#[test] #[test]
#[ignore] #[ignore]
#[cfg(feature = "mmap")]
fn test_indexing() { fn test_indexing() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INT_INDEXED); let id_field = schema_builder.add_u64_field("id", INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED); let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap(); let index = Index::create_from_tempdir(schema).unwrap();
let reader = index.reader().unwrap();
let universe = Range::new(0u64, 20u64);
let mut rng = thread_rng(); let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
@@ -33,13 +31,13 @@ fn test_indexing() {
let mut uncommitted_docs: HashSet<u64> = HashSet::new(); let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..200 { for _ in 0..200 {
let random_val = rng.sample(&universe); let random_val = rng.gen_range(0, 20);
if random_val == 0 { if random_val == 0 {
index_writer.commit().expect("Commit failed"); index_writer.commit().expect("Commit failed");
committed_docs.extend(&uncommitted_docs); committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear(); uncommitted_docs.clear();
index.load_searchers().unwrap(); reader.reload().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
// check that everything is correct. // check that everything is correct.
check_index_content(&searcher, &committed_docs); check_index_content(&searcher, &committed_docs);
} else { } else {

View File

@@ -2,6 +2,7 @@ use super::operation::DeleteOperation;
use std::mem; use std::mem;
use std::ops::DerefMut; use std::ops::DerefMut;
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
use Opstamp;
// The DeleteQueue is similar in conceptually to a multiple // The DeleteQueue is similar in conceptually to a multiple
// consumer single producer broadcast channel. // consumer single producer broadcast channel.
@@ -184,18 +185,15 @@ impl DeleteCursor {
/// queue are consume and the next get will return None. /// queue are consume and the next get will return None.
/// - the next get will return the first operation with an /// - the next get will return the first operation with an
/// `opstamp >= target_opstamp`. /// `opstamp >= target_opstamp`.
pub fn skip_to(&mut self, target_opstamp: u64) { pub fn skip_to(&mut self, target_opstamp: Opstamp) {
// TODO Can be optimize as we work with block. // TODO Can be optimize as we work with block.
while self.is_behind_opstamp(target_opstamp) { while self.is_behind_opstamp(target_opstamp) {
self.advance(); self.advance();
} }
} }
#[cfg_attr( #[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
feature = "cargo-clippy", fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
allow(clippy::wrong_self_convention)
)]
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
self.get() self.get()
.map(|operation| operation.opstamp < target_opstamp) .map(|operation| operation.opstamp < target_opstamp)
.unwrap_or(false) .unwrap_or(false)

View File

@@ -1,131 +0,0 @@
use directory::error::OpenWriteError;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;
use Directory;
use TantivyError;
/// The kinds of lock that can be acquired on a directory.
///
/// Each kind is backed by its own lock file.
#[derive(Debug, Clone, Copy)]
pub enum LockType {
/// Only one process should be able to write tantivy's index at a time.
/// This lock file, when present, is in charge of preventing other processes from opening an IndexWriter.
///
/// If the process is killed and this file remains, it is safe to remove it manually.
///
/// Failing to acquire this lock usually means a misuse of tantivy's API
/// (creating more than one instance of the `IndexWriter`), or a spurious
/// lock file remaining after a crash. In the latter case, removing the file after
/// checking that no process running tantivy is alive is safe.
IndexWriterLock,
/// The meta lock file is here to protect the segment files being opened by
/// `.load_searchers()` from being garbage collected.
/// It makes it possible for another process to safely consume
/// our index while it is being written. Ideally, we would have preferred `RWLock`
/// semantics here, but they are difficult to achieve on Windows.
///
/// Opening segment readers is a very fast process.
/// Right now, if the lock cannot be acquired on the first attempt, the logic
/// is very simplistic. We retry every `100ms`, up to 100 times, until we
/// effectively acquire the lock.
/// This lock should not have much contention in normal usage.
MetaLock,
}
/// Retry policy applied when a lock cannot be acquired right away.
///
/// The logic is pretty simple: the acquisition is retried up to
/// `num_retries` times, sleeping `wait_in_ms` milliseconds before each new
/// attempt. Both values depend on the type of lock.
struct RetryPolicy {
    num_retries: usize,
    wait_in_ms: u64,
}

impl RetryPolicy {
    /// Builds a policy that never retries.
    fn no_retry() -> RetryPolicy {
        RetryPolicy {
            num_retries: 0,
            wait_in_ms: 0,
        }
    }

    /// Consumes one retry, if any is left.
    ///
    /// Returns `false` immediately when no retry remains. Otherwise
    /// decrements the retry counter, blocks the current thread for
    /// `wait_in_ms` milliseconds and returns `true`.
    fn wait_and_retry(&mut self) -> bool {
        if self.num_retries == 0 {
            return false;
        }
        self.num_retries -= 1;
        thread::sleep(Duration::from_millis(self.wait_in_ms));
        true
    }
}
impl LockType {
    /// Returns the retry policy used when acquiring this type of lock.
    ///
    /// The index writer lock is never retried. The meta lock is retried
    /// up to 100 times, waiting 100ms before each new attempt.
    fn retry_policy(self) -> RetryPolicy {
        match self {
            LockType::IndexWriterLock => RetryPolicy::no_retry(),
            LockType::MetaLock => RetryPolicy {
                num_retries: 100,
                wait_in_ms: 100,
            },
        }
    }

    /// Makes a single attempt at creating the lock file in `directory`.
    ///
    /// # Errors
    ///
    /// - `TantivyError::LockFailure` if the lock file already exists.
    /// - `TantivyError::IOError` if opening the file fails with an I/O error.
    /// - Whatever error flushing the freshly created file converts into.
    fn try_acquire_lock(self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
        let path = self.filename();
        let mut write = directory.open_write(path).map_err(|e| match e {
            OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(self),
            OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
        })?;
        write.flush()?;
        Ok(DirectoryLock {
            directory: directory.box_clone(),
            path: path.to_owned(),
        })
    }

    /// Acquire a lock in the given directory.
    ///
    /// When the lock is already held, the acquisition is retried according
    /// to the retry policy of this lock type.
    ///
    /// # Errors
    ///
    /// - `TantivyError::LockFailure` once the lock is still held after all
    ///   retries have been used.
    /// - Any other error reported by the acquisition attempt, returned
    ///   right away. Only lock contention is worth retrying.
    pub fn acquire_lock(self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
        let mut box_directory = directory.box_clone();
        let mut retry_policy = self.retry_policy();
        loop {
            match self.try_acquire_lock(&mut *box_directory) {
                Ok(lock) => return Ok(lock),
                Err(TantivyError::LockFailure(lock_type)) => {
                    if !retry_policy.wait_and_retry() {
                        return Err(TantivyError::LockFailure(lock_type));
                    }
                }
                // Swallowing other errors here would make this loop spin
                // forever, without any wait, on a persistent I/O failure.
                Err(other_error) => return Err(other_error),
            }
        }
    }

    /// Returns the name of the lock file associated with this type of lock.
    fn filename(&self) -> &Path {
        match *self {
            LockType::MetaLock => Path::new(".tantivy-meta.lock"),
            LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"),
        }
    }
}
/// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](enum.LockType.html)
///
/// It is transparently associated to a lock file, that gets deleted
/// on `Drop`. The lock is released automatically on `Drop`.
pub struct DirectoryLock {
// Clone of the directory in which the lock file was created.
directory: Box<Directory>,
// Path of the lock file, as passed to the directory when it was created.
path: PathBuf,
}
impl Drop for DirectoryLock {
fn drop(&mut self) {
if let Err(e) = self.directory.delete(&*self.path) {
error!("Failed to remove the lock file. {:?}", e);
}
}
}

View File

@@ -1,5 +1,6 @@
use std::sync::Arc; use std::sync::Arc;
use DocId; use DocId;
use Opstamp;
// Doc to opstamp is used to identify which // Doc to opstamp is used to identify which
// document should be deleted. // document should be deleted.
@@ -23,7 +24,7 @@ pub enum DocToOpstampMapping {
} }
impl From<Vec<u64>> for DocToOpstampMapping { impl From<Vec<u64>> for DocToOpstampMapping {
fn from(opstamps: Vec<u64>) -> DocToOpstampMapping { fn from(opstamps: Vec<Opstamp>) -> DocToOpstampMapping {
DocToOpstampMapping::WithMap(Arc::new(opstamps)) DocToOpstampMapping::WithMap(Arc::new(opstamps))
} }
} }
@@ -35,7 +36,7 @@ impl DocToOpstampMapping {
// //
// The edge case opstamp = some doc opstamp is in practise // The edge case opstamp = some doc opstamp is in practise
// never called. // never called.
pub fn compute_doc_limit(&self, target_opstamp: u64) -> DocId { pub fn compute_doc_limit(&self, target_opstamp: Opstamp) -> DocId {
match *self { match *self {
DocToOpstampMapping::WithMap(ref doc_opstamps) => { DocToOpstampMapping::WithMap(ref doc_opstamps) => {
match doc_opstamps.binary_search(&target_opstamp) { match doc_opstamps.binary_search(&target_opstamp) {

View File

@@ -1,4 +1,4 @@
use super::operation::AddOperation; use super::operation::{AddOperation, UserOperation};
use super::segment_updater::SegmentUpdater; use super::segment_updater::SegmentUpdater;
use super::PreparedCommit; use super::PreparedCommit;
use bit_set::BitSet; use bit_set::BitSet;
@@ -8,16 +8,16 @@ use core::SegmentComponent;
use core::SegmentId; use core::SegmentId;
use core::SegmentMeta; use core::SegmentMeta;
use core::SegmentReader; use core::SegmentReader;
use crossbeam_channel as channel; use crossbeam::channel;
use directory::DirectoryLock;
use docset::DocSet; use docset::DocSet;
use error::TantivyError; use error::TantivyError;
use fastfield::write_delete_bitset; use fastfield::write_delete_bitset;
use futures::sync::oneshot::Receiver; use futures::{Canceled, Future};
use indexer::delete_queue::{DeleteCursor, DeleteQueue}; use indexer::delete_queue::{DeleteCursor, DeleteQueue};
use indexer::doc_opstamp_mapping::DocToOpstampMapping; use indexer::doc_opstamp_mapping::DocToOpstampMapping;
use indexer::operation::DeleteOperation; use indexer::operation::DeleteOperation;
use indexer::stamper::Stamper; use indexer::stamper::Stamper;
use indexer::DirectoryLock;
use indexer::MergePolicy; use indexer::MergePolicy;
use indexer::SegmentEntry; use indexer::SegmentEntry;
use indexer::SegmentWriter; use indexer::SegmentWriter;
@@ -26,9 +26,11 @@ use schema::Document;
use schema::IndexRecordOption; use schema::IndexRecordOption;
use schema::Term; use schema::Term;
use std::mem; use std::mem;
use std::mem::swap; use std::ops::Range;
use std::sync::Arc;
use std::thread; use std::thread;
use std::thread::JoinHandle; use std::thread::JoinHandle;
use Opstamp;
use Result; use Result;
// Size of the margin for the heap. A segment is closed when the remaining memory // Size of the margin for the heap. A segment is closed when the remaining memory
@@ -43,8 +45,8 @@ pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
// reaches `PIPELINE_MAX_SIZE_IN_DOCS` // reaches `PIPELINE_MAX_SIZE_IN_DOCS`
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000; const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
type DocumentSender = channel::Sender<AddOperation>; type OperationSender = channel::Sender<Vec<AddOperation>>;
type DocumentReceiver = channel::Receiver<AddOperation>; type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
/// Split the thread memory budget into /// Split the thread memory budget into
/// - the heap size /// - the heap size
@@ -52,16 +54,19 @@ type DocumentReceiver = channel::Receiver<AddOperation>;
/// ///
/// Returns (the heap size in bytes, the hash table size in number of bits) /// Returns (the heap size in bytes, the hash table size in number of bits)
fn initial_table_size(per_thread_memory_budget: usize) -> usize { fn initial_table_size(per_thread_memory_budget: usize) -> usize {
assert!(per_thread_memory_budget > 1_000);
let table_size_limit: usize = per_thread_memory_budget / 3; let table_size_limit: usize = per_thread_memory_budget / 3;
(1..) if let Some(limit) = (1..)
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit) .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.last() .last()
.unwrap_or_else(|| { {
panic!( limit.min(19) // we cap it at 2^19 = 512K.
"Per thread memory is too small: {}", } else {
per_thread_memory_budget unreachable!(
) "Per thread memory is too small: {}",
}).min(19) // we cap it at 512K per_thread_memory_budget
);
}
} }
/// `IndexWriter` is the user entry-point to add document to an index. /// `IndexWriter` is the user entry-point to add document to an index.
@@ -81,8 +86,8 @@ pub struct IndexWriter {
workers_join_handle: Vec<JoinHandle<Result<()>>>, workers_join_handle: Vec<JoinHandle<Result<()>>>,
document_receiver: DocumentReceiver, operation_receiver: OperationReceiver,
document_sender: DocumentSender, operation_sender: OperationSender,
segment_updater: SegmentUpdater, segment_updater: SegmentUpdater,
@@ -95,7 +100,7 @@ pub struct IndexWriter {
delete_queue: DeleteQueue, delete_queue: DeleteQueue,
stamper: Stamper, stamper: Stamper,
committed_opstamp: u64, committed_opstamp: Opstamp,
} }
/// Open a new index writer. Attempts to acquire a lockfile. /// Open a new index writer. Attempts to acquire a lockfile.
@@ -129,7 +134,7 @@ pub fn open_index_writer(
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX); let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
return Err(TantivyError::InvalidArgument(err_msg)); return Err(TantivyError::InvalidArgument(err_msg));
} }
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS); channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new(); let delete_queue = DeleteQueue::new();
@@ -139,7 +144,7 @@ pub fn open_index_writer(
let stamper = Stamper::new(current_opstamp); let stamper = Stamper::new(current_opstamp);
let segment_updater = let segment_updater =
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?; SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let mut index_writer = IndexWriter { let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock), _directory_lock: Some(directory_lock),
@@ -147,8 +152,8 @@ pub fn open_index_writer(
heap_size_in_bytes_per_thread, heap_size_in_bytes_per_thread,
index: index.clone(), index: index.clone(),
document_receiver, operation_receiver: document_receiver,
document_sender, operation_sender: document_sender,
segment_updater, segment_updater,
@@ -173,7 +178,7 @@ pub fn compute_deleted_bitset(
segment_reader: &SegmentReader, segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor, delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping, doc_opstamps: &DocToOpstampMapping,
target_opstamp: u64, target_opstamp: Opstamp,
) -> Result<bool> { ) -> Result<bool> {
let mut might_have_changed = false; let mut might_have_changed = false;
@@ -215,7 +220,7 @@ pub fn compute_deleted_bitset(
pub fn advance_deletes( pub fn advance_deletes(
mut segment: Segment, mut segment: Segment,
segment_entry: &mut SegmentEntry, segment_entry: &mut SegmentEntry,
target_opstamp: u64, target_opstamp: Opstamp,
) -> Result<()> { ) -> Result<()> {
{ {
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) { if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
@@ -255,7 +260,7 @@ pub fn advance_deletes(
write_delete_bitset(&delete_bitset, &mut delete_file)?; write_delete_bitset(&delete_bitset, &mut delete_file)?;
} }
} }
segment_entry.set_meta((*segment.meta()).clone()); segment_entry.set_meta(segment.meta().clone());
Ok(()) Ok(())
} }
@@ -263,7 +268,7 @@ fn index_documents(
memory_budget: usize, memory_budget: usize,
segment: &Segment, segment: &Segment,
generation: usize, generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>, document_iterator: &mut Iterator<Item = Vec<AddOperation>>,
segment_updater: &mut SegmentUpdater, segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor, mut delete_cursor: DeleteCursor,
) -> Result<bool> { ) -> Result<bool> {
@@ -271,11 +276,11 @@ fn index_documents(
let segment_id = segment.id(); let segment_id = segment.id();
let table_size = initial_table_size(memory_budget); let table_size = initial_table_size(memory_budget);
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?; let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
for doc in document_iterator { for documents in document_iterator {
segment_writer.add_document(doc, &schema)?; for doc in documents {
segment_writer.add_document(doc, &schema)?;
}
let mem_usage = segment_writer.mem_usage(); let mem_usage = segment_writer.mem_usage();
if mem_usage >= memory_budget - MARGIN_IN_BYTES { if mem_usage >= memory_budget - MARGIN_IN_BYTES {
info!( info!(
"Buffer limit reached, flushing segment with maxdoc={}.", "Buffer limit reached, flushing segment with maxdoc={}.",
@@ -295,13 +300,13 @@ fn index_documents(
// the worker thread. // the worker thread.
assert!(num_docs > 0); assert!(num_docs > 0);
let doc_opstamps: Vec<u64> = segment_writer.finalize()?; let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
let segment_meta = SegmentMeta::new(segment_id, num_docs); let segment_meta = SegmentMeta::new(segment_id, num_docs);
let last_docstamp: u64 = *(doc_opstamps.last().unwrap()); let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
let segment_entry: SegmentEntry = if delete_cursor.get().is_some() { let delete_bitset_opt = if delete_cursor.get().is_some() {
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?; let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
@@ -312,18 +317,17 @@ fn index_documents(
&doc_to_opstamps, &doc_to_opstamps,
last_docstamp, last_docstamp,
)?; )?;
SegmentEntry::new(segment_meta, delete_cursor, { if may_have_deletes {
if may_have_deletes { Some(deleted_bitset)
Some(deleted_bitset) } else {
} else { None
None }
}
})
} else { } else {
// if there are no delete operation in the queue, no need // if there are no delete operation in the queue, no need
// to even open the segment. // to even open the segment.
SegmentEntry::new(segment_meta, delete_cursor, None) None
}; };
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
Ok(segment_updater.add_segment(generation, segment_entry)) Ok(segment_updater.add_segment(generation, segment_entry))
} }
@@ -332,7 +336,7 @@ impl IndexWriter {
pub fn wait_merging_threads(mut self) -> Result<()> { pub fn wait_merging_threads(mut self) -> Result<()> {
// this will stop the indexing thread, // this will stop the indexing thread,
// dropping the last reference to the segment_updater. // dropping the last reference to the segment_updater.
drop(self.document_sender); drop(self.operation_sender);
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]); let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
for join_handle in former_workers_handles { for join_handle in former_workers_handles {
@@ -365,20 +369,23 @@ impl IndexWriter {
.add_segment(self.generation, segment_entry); .add_segment(self.generation, segment_entry);
} }
/// *Experimental & Advanced API* Creates a new segment. /// Creates a new segment.
/// and marks it as currently in write.
/// ///
/// This method is useful only for users trying to do complex /// This method is useful only for users trying to do complex
/// operations, like converting an index format to another. /// operations, like converting an index format to another.
///
/// It is safe to start writing files associated with the new `Segment`.
/// These will not be garbage collected as long as an instance of the
/// `SegmentMeta` object associated with the new `Segment` is "alive".
pub fn new_segment(&self) -> Segment { pub fn new_segment(&self) -> Segment {
self.segment_updater.new_segment() self.index.new_segment()
} }
/// Spawns a new worker thread for indexing. /// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline. /// The thread consumes documents from the pipeline.
/// ///
fn add_indexing_worker(&mut self) -> Result<()> { fn add_indexing_worker(&mut self) -> Result<()> {
let document_receiver_clone = self.document_receiver.clone(); let document_receiver_clone = self.operation_receiver.clone();
let mut segment_updater = self.segment_updater.clone(); let mut segment_updater = self.segment_updater.clone();
let generation = self.generation; let generation = self.generation;
@@ -386,13 +393,16 @@ impl IndexWriter {
let mut delete_cursor = self.delete_queue.cursor(); let mut delete_cursor = self.delete_queue.cursor();
let mem_budget = self.heap_size_in_bytes_per_thread; let mem_budget = self.heap_size_in_bytes_per_thread;
let index = self.index.clone();
let join_handle: JoinHandle<Result<()>> = thread::Builder::new() let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
.name(format!( .name(format!(
"indexing thread {} for gen {}", "thrd-tantivy-index{}-gen{}",
self.worker_id, generation self.worker_id, generation
)).spawn(move || { ))
.spawn(move || {
loop { loop {
let mut document_iterator = document_receiver_clone.clone().peekable(); let mut document_iterator =
document_receiver_clone.clone().into_iter().peekable();
// the peeking here is to avoid // the peeking here is to avoid
// creating a new segment's files // creating a new segment's files
@@ -401,15 +411,19 @@ impl IndexWriter {
// this is a valid guarantee as the // this is a valid guarantee as the
// peeked document now belongs to // peeked document now belongs to
// our local iterator. // our local iterator.
if let Some(operation) = document_iterator.peek() { if let Some(operations) = document_iterator.peek() {
delete_cursor.skip_to(operation.opstamp); if let Some(first) = operations.first() {
delete_cursor.skip_to(first.opstamp);
} else {
return Ok(());
}
} else { } else {
// No more documents. // No more documents.
// Happens when there is a commit, or if the `IndexWriter` // Happens when there is a commit, or if the `IndexWriter`
// was dropped. // was dropped.
return Ok(()); return Ok(());
} }
let segment = segment_updater.new_segment(); let segment = index.new_segment();
index_documents( index_documents(
mem_budget, mem_budget,
&segment, &segment,
@@ -426,7 +440,7 @@ impl IndexWriter {
} }
/// Accessor to the merge policy. /// Accessor to the merge policy.
pub fn get_merge_policy(&self) -> Box<MergePolicy> { pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
self.segment_updater.get_merge_policy() self.segment_updater.get_merge_policy()
} }
@@ -451,7 +465,10 @@ impl IndexWriter {
/// Merges a given list of segments /// Merges a given list of segments
/// ///
/// `segment_ids` is required to be non-empty. /// `segment_ids` is required to be non-empty.
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> { pub fn merge(
&mut self,
segment_ids: &[SegmentId],
) -> Result<impl Future<Item = SegmentMeta, Error = Canceled>> {
self.segment_updater.start_merge(segment_ids) self.segment_updater.start_merge(segment_ids)
} }
@@ -463,14 +480,11 @@ impl IndexWriter {
/// when no documents are remaining. /// when no documents are remaining.
/// ///
/// Returns the former segment_ready channel. /// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) -> DocumentReceiver { fn recreate_document_channel(&mut self) -> OperationReceiver {
let (mut document_sender, mut document_receiver): ( let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
DocumentSender, channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
DocumentReceiver, mem::replace(&mut self.operation_sender, document_sender);
) = channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS); mem::replace(&mut self.operation_receiver, document_receiver)
swap(&mut self.document_sender, &mut document_sender);
swap(&mut self.document_receiver, &mut document_receiver);
document_receiver
} }
/// Rollback to the last commit /// Rollback to the last commit
@@ -481,14 +495,14 @@ impl IndexWriter {
/// state as it was after the last commit. /// state as it was after the last commit.
/// ///
/// The opstamp at the last commit is returned. /// The opstamp at the last commit is returned.
pub fn rollback(&mut self) -> Result<()> { pub fn rollback(&mut self) -> Result<Opstamp> {
info!("Rolling back to opstamp {}", self.committed_opstamp); info!("Rolling back to opstamp {}", self.committed_opstamp);
// marks the segment updater as killed. From now on, all // marks the segment updater as killed. From now on, all
// segment updates will be ignored. // segment updates will be ignored.
self.segment_updater.kill(); self.segment_updater.kill();
let document_receiver = self.document_receiver.clone(); let document_receiver = self.operation_receiver.clone();
// take the directory lock to create a new index_writer. // take the directory lock to create a new index_writer.
let directory_lock = self let directory_lock = self
@@ -516,7 +530,7 @@ impl IndexWriter {
// was dropped with the index_writer. // was dropped with the index_writer.
for _ in document_receiver.clone() {} for _ in document_receiver.clone() {}
Ok(()) Ok(self.committed_opstamp)
} }
/// Prepares a commit. /// Prepares a commit.
@@ -554,20 +568,15 @@ impl IndexWriter {
info!("Preparing commit"); info!("Preparing commit");
// this will drop the current document channel // this will drop the current document channel
// and recreate a new one channels. // and recreate a new one.
self.recreate_document_channel(); self.recreate_document_channel();
let mut former_workers_join_handle = Vec::new(); let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
for worker_handle in former_workers_join_handle { for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle let indexing_worker_result = worker_handle
.join() .join()
.map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?; .map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
indexing_worker_result?; indexing_worker_result?;
// add a new worker for the next generation. // add a new worker for the next generation.
self.add_indexing_worker()?; self.add_indexing_worker()?;
@@ -593,7 +602,7 @@ impl IndexWriter {
/// Commit returns the `opstamp` of the last document /// Commit returns the `opstamp` of the last document
/// that made it in the commit. /// that made it in the commit.
/// ///
pub fn commit(&mut self) -> Result<u64> { pub fn commit(&mut self) -> Result<Opstamp> {
self.prepare_commit()?.commit() self.prepare_commit()?.commit()
} }
@@ -609,7 +618,7 @@ impl IndexWriter {
/// ///
/// Like adds, the deletion itself will be visible /// Like adds, the deletion itself will be visible
/// only after calling `commit()`. /// only after calling `commit()`.
pub fn delete_term(&mut self, term: Term) -> u64 { pub fn delete_term(&self, term: Term) -> Opstamp {
let opstamp = self.stamper.stamp(); let opstamp = self.stamper.stamp();
let delete_operation = DeleteOperation { opstamp, term }; let delete_operation = DeleteOperation { opstamp, term };
self.delete_queue.push(delete_operation); self.delete_queue.push(delete_operation);
@@ -623,7 +632,7 @@ impl IndexWriter {
/// ///
/// This is also the opstamp of the commit that is currently /// This is also the opstamp of the commit that is currently
/// available for searchers. /// available for searchers.
pub fn commit_opstamp(&self) -> u64 { pub fn commit_opstamp(&self) -> Opstamp {
self.committed_opstamp self.committed_opstamp
} }
@@ -637,45 +646,191 @@ impl IndexWriter {
/// ///
/// Currently it represents the number of documents that /// Currently it represents the number of documents that
/// have been added since the creation of the index. /// have been added since the creation of the index.
pub fn add_document(&mut self, document: Document) -> u64 { pub fn add_document(&self, document: Document) -> Opstamp {
let opstamp = self.stamper.stamp(); let opstamp = self.stamper.stamp();
let add_operation = AddOperation { opstamp, document }; let add_operation = AddOperation { opstamp, document };
self.document_sender.send(add_operation); let send_result = self.operation_sender.send(vec![add_operation]);
if let Err(e) = send_result {
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
}
opstamp opstamp
} }
/// Reserves `count + 1` opstamps from the stamper and splits off the
/// final one.
///
/// Returns `(last_opstamp, stamps)`: `stamps` is the range holding one
/// opstamp per operation (`count` of them), and `last_opstamp`, the
/// highest stamp of the reserved range, is the one given to the batch
/// itself.
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
    let reserved = self.stamper.stamps(count + 1u64);
    // `end` is exclusive, so the last reserved stamp is `end - 1`.
    let last_opstamp = reserved.end - 1;
    (last_opstamp, reserved.start..last_opstamp)
}
/// Runs a group of document operations ensuring that the operations are
/// assigned contiguous u64 opstamps and that add operations of the same
/// group are flushed into the same segment.
///
/// If the indexing pipeline is full, this call may block.
///
/// Each operation of the given `user_operations` will receive an in-order,
/// contiguous u64 opstamp. The entire batch itself is also given an
/// opstamp that is 1 greater than the last given operation. This
/// `batch_opstamp` is the return value of `run`. An empty group of
/// `user_operations`, an empty `Vec<UserOperation>`, still receives
/// a valid opstamp even though no changes were _actually_ made to the index.
///
/// Like adds and deletes (see `IndexWriter.add_document` and
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
/// visible to readers only after calling `commit()`.
///
/// # Panics
///
/// Panics if the add operations cannot be sent to the indexing channel.
pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
    let count = user_operations.len() as u64;
    if count == 0 {
        return self.stamper.stamp();
    }
    let (batch_opstamp, stamps) = self.get_batch_opstamps(count);

    // Deletes go straight to the delete queue; adds are collected so that
    // the whole group reaches the indexing channel as a single batch.
    let mut adds: Vec<AddOperation> = Vec::new();
    for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
        match user_op {
            UserOperation::Delete(term) => {
                let delete_operation = DeleteOperation { opstamp, term };
                self.delete_queue.push(delete_operation);
            }
            UserOperation::Add(document) => {
                let add_operation = AddOperation { opstamp, document };
                adds.push(add_operation);
            }
        }
    }

    // A group holding only deletes produces no add operation. Sending an
    // empty batch must be avoided: an indexing worker that peeks a batch
    // without a first element returns and stops consuming the channel.
    if adds.is_empty() {
        return batch_opstamp;
    }

    if let Err(e) = self.operation_sender.send(adds) {
        panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
    }

    batch_opstamp
}
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::super::operation::UserOperation;
use super::initial_table_size; use super::initial_table_size;
use collector::TopDocs;
use directory::error::LockError;
use error::*; use error::*;
use indexer::NoMergePolicy; use indexer::NoMergePolicy;
use schema::{self, Document}; use query::TermQuery;
use schema::{self, IndexRecordOption};
use Index; use Index;
use ReloadPolicy;
use Term; use Term;
#[test] #[test]
fn test_lockfile_stops_duplicates() { fn test_operations_group() {
let schema_builder = schema::SchemaBuilder::default(); // an operations group with 2 items should cause 3 opstamps 0, 1, and 2.
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer(40_000_000).unwrap(); let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
match index.writer(40_000_000) { let operations = vec![
Err(TantivyError::LockFailure(_)) => {} UserOperation::Add(doc!(text_field=>"a")),
_ => panic!("Expected FileAlreadyExists error"), UserOperation::Add(doc!(text_field=>"b")),
];
let batch_opstamp1 = index_writer.run(operations);
assert_eq!(batch_opstamp1, 2u64);
}
#[test]
fn test_ordered_batched_operations() {
    // A single batch carrying, in this order:
    //   1. a delete of the term "a"
    //   2. an add of a document containing "a"
    //   3. an add of a document containing "b"
    //   4. a delete of the term "b"
    // Once committed, exactly one document matches "a" and none matches "b".
    let mut builder = schema::Schema::builder();
    let text_field = builder.add_text_field("text", schema::TEXT);
    let index = Index::create_in_ram(builder.build());
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::Manual)
        .try_into()
        .unwrap();
    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();

    index_writer.run(vec![
        UserOperation::Delete(Term::from_field_text(text_field, "a")),
        UserOperation::Add(doc!(text_field=>"a")),
        UserOperation::Add(doc!(text_field=>"b")),
        UserOperation::Delete(Term::from_field_text(text_field, "b")),
    ]);
    index_writer.commit().expect("failed to commit");

    // The reader uses a manual reload policy, so it is reloaded explicitly
    // before searching.
    reader.reload().expect("failed to load searchers");

    let query_for_a = TermQuery::new(
        Term::from_field_text(text_field, "a"),
        IndexRecordOption::Basic,
    );
    let query_for_b = TermQuery::new(
        Term::from_field_text(text_field, "b"),
        IndexRecordOption::Basic,
    );

    let searcher = reader.searcher();
    let hits_for_a = searcher
        .search(&query_for_a, &TopDocs::with_limit(1))
        .expect("search for a failed");
    let hits_for_b = searcher
        .search(&query_for_b, &TopDocs::with_limit(1))
        .expect("search for b failed");

    assert_eq!(hits_for_a.len(), 1);
    assert_eq!(hits_for_b.len(), 0);
}
#[test]
fn test_empty_operations_group() {
    let index = Index::create_in_ram(schema::Schema::builder().build());
    let index_writer = index.writer(3_000_000).unwrap();

    // Each empty group is still handed its own opstamp: 0 for the first
    // call, then 1 for the second.
    for expected_opstamp in 0u64..2u64 {
        let batch_opstamp = index_writer.run(Vec::new());
        assert_eq!(batch_opstamp, expected_opstamp);
    }
}
#[test]
fn test_lockfile_stops_duplicates() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer(3_000_000).unwrap();
match index.writer(3_000_000) {
Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
_ => panic!("Expected a `LockFailure` error"),
} }
} }
#[test] #[test]
fn test_lockfile_already_exists_error_msg() { fn test_lockfile_already_exists_error_msg() {
let schema_builder = schema::SchemaBuilder::default(); let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
match index.writer_with_num_threads(1, 3_000_000) { match index.writer_with_num_threads(1, 3_000_000) {
Err(err) => { Err(err) => {
let err_msg = err.to_string(); let err_msg = err.to_string();
assert!(err_msg.contains("Lockfile")); assert!(err_msg.contains("already an `IndexWriter`"));
assert!(err_msg.contains("Possible causes:"))
} }
_ => panic!("Expected LockfileAlreadyExists error"), _ => panic!("Expected LockfileAlreadyExists error"),
} }
@@ -683,9 +838,9 @@ mod tests {
#[test] #[test]
fn test_set_merge_policy() { fn test_set_merge_policy() {
let schema_builder = schema::SchemaBuilder::default(); let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(40_000_000).unwrap(); let index_writer = index.writer(3_000_000).unwrap();
assert_eq!( assert_eq!(
format!("{:?}", index_writer.get_merge_policy()), format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \ "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
@@ -701,24 +856,28 @@ mod tests {
#[test] #[test]
fn test_lockfile_released_on_drop() { fn test_lockfile_released_on_drop() {
let schema_builder = schema::SchemaBuilder::default(); let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
let _index_writer = index.writer(40_000_000).unwrap(); let _index_writer = index.writer(3_000_000).unwrap();
// the lock should be released when the // the lock should be released when the
// index_writer leaves the scope. // index_writer leaves the scope.
} }
let _index_writer_two = index.writer(40_000_000).unwrap(); let _index_writer_two = index.writer(3_000_000).unwrap();
} }
#[test] #[test]
fn test_commit_and_rollback() { fn test_commit_and_rollback() {
let mut schema_builder = schema::SchemaBuilder::default(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let num_docs_containing = |s: &str| { let num_docs_containing = |s: &str| {
let searcher = index.searcher(); let searcher = reader.searcher();
let term = Term::from_field_text(text_field, s); let term = Term::from_field_text(text_field, s);
searcher.doc_freq(&term) searcher.doc_freq(&term)
}; };
@@ -728,54 +887,54 @@ mod tests {
let mut index_writer = index.writer(3_000_000).unwrap(); let mut index_writer = index.writer(3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a")); index_writer.add_document(doc!(text_field=>"a"));
index_writer.rollback().unwrap(); index_writer.rollback().unwrap();
assert_eq!(index_writer.commit_opstamp(), 0u64); assert_eq!(index_writer.commit_opstamp(), 0u64);
assert_eq!(num_docs_containing("a"), 0); assert_eq!(num_docs_containing("a"), 0);
{ {
index_writer.add_document(doc!(text_field=>"b")); index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"c")); index_writer.add_document(doc!(text_field=>"c"));
} }
assert_eq!(index_writer.commit().unwrap(), 2u64); assert!(index_writer.commit().is_ok());
index.load_searchers().unwrap(); reader.reload().unwrap();
assert_eq!(num_docs_containing("a"), 0); assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1); assert_eq!(num_docs_containing("b"), 1);
assert_eq!(num_docs_containing("c"), 1); assert_eq!(num_docs_containing("c"), 1);
} }
index.load_searchers().unwrap(); reader.reload().unwrap();
index.searcher(); reader.searcher();
} }
#[test] #[test]
fn test_with_merges() { fn test_with_merges() {
let mut schema_builder = schema::SchemaBuilder::default(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let num_docs_containing = |s: &str| { let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s); let term_a = Term::from_field_text(text_field, s);
searcher.doc_freq(&term_a) reader.searcher().doc_freq(&term_a)
}; };
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer(12_000_000).unwrap(); let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs // create 8 segments with 100 tiny docs
for _doc in 0..100 { for _doc in 0..100 {
let mut doc = Document::default(); index_writer.add_document(doc!(text_field=>"a"));
doc.add_text(text_field, "a");
index_writer.add_document(doc);
} }
index_writer.commit().expect("commit failed"); index_writer.commit().expect("commit failed");
for _doc in 0..100 { for _doc in 0..100 {
let mut doc = Document::default(); index_writer.add_document(doc!(text_field=>"a"));
doc.add_text(text_field, "a");
index_writer.add_document(doc);
} }
// this should create 8 segments and trigger a merge. // this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed"); index_writer.commit().expect("commit failed");
index_writer index_writer
.wait_merging_threads() .wait_merging_threads()
.expect("waiting merging thread failed"); .expect("waiting merging thread failed");
index.load_searchers().unwrap();
reader.reload().unwrap();
assert_eq!(num_docs_containing("a"), 200); assert_eq!(num_docs_containing("a"), 200);
assert!(index.searchable_segments().unwrap().len() < 8); assert!(index.searchable_segments().unwrap().len() < 8);
@@ -784,7 +943,7 @@ mod tests {
#[test] #[test]
fn test_prepare_with_commit_message() { fn test_prepare_with_commit_message() {
let mut schema_builder = schema::SchemaBuilder::default(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
@@ -798,7 +957,6 @@ mod tests {
{ {
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed"); let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit"); prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.commit().expect("commit failed"); prepared_commit.commit().expect("commit failed");
} }
{ {
@@ -818,7 +976,7 @@ mod tests {
#[test] #[test]
fn test_prepare_but_rollback() { fn test_prepare_but_rollback() {
let mut schema_builder = schema::SchemaBuilder::default(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
@@ -832,7 +990,6 @@ mod tests {
{ {
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed"); let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit"); prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.abort().expect("commit failed"); prepared_commit.abort().expect("commit failed");
} }
{ {
@@ -844,11 +1001,15 @@ mod tests {
} }
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
index.load_searchers().unwrap();
let num_docs_containing = |s: &str| { let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s); let term_a = Term::from_field_text(text_field, s);
searcher.doc_freq(&term_a) index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap()
.searcher()
.doc_freq(&term_a)
}; };
assert_eq!(num_docs_containing("a"), 0); assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 100); assert_eq!(num_docs_containing("b"), 100);
@@ -856,9 +1017,9 @@ mod tests {
#[test] #[test]
fn test_hashmap_size() { fn test_hashmap_size() {
assert_eq!(initial_table_size(100_000), 12); assert_eq!(initial_table_size(100_000), 11);
assert_eq!(initial_table_size(1_000_000), 15); assert_eq!(initial_table_size(1_000_000), 14);
assert_eq!(initial_table_size(10_000_000), 18); assert_eq!(initial_table_size(10_000_000), 17);
assert_eq!(initial_table_size(1_000_000_000), 19); assert_eq!(initial_table_size(1_000_000_000), 19);
} }
@@ -866,7 +1027,7 @@ mod tests {
#[test] #[test]
fn test_write_commit_fails() { fn test_write_commit_fails() {
use fail; use fail;
let mut schema_builder = schema::SchemaBuilder::default(); let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT); let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
@@ -880,11 +1041,9 @@ mod tests {
index_writer.add_document(doc!(text_field => "b")); index_writer.add_document(doc!(text_field => "b"));
} }
assert!(index_writer.commit().is_err()); assert!(index_writer.commit().is_err());
index.load_searchers().unwrap();
let num_docs_containing = |s: &str| { let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s); let term_a = Term::from_field_text(text_field, s);
searcher.doc_freq(&term_a) index.reader().unwrap().searcher().doc_freq(&term_a)
}; };
assert_eq!(num_docs_containing("a"), 100); assert_eq!(num_docs_containing("a"), 100);
assert_eq!(num_docs_containing("b"), 0); assert_eq!(num_docs_containing("b"), 0);

View File

@@ -52,7 +52,7 @@ impl MergePolicy for LogMergePolicy {
let mut size_sorted_tuples = segments let mut size_sorted_tuples = segments
.iter() .iter()
.map(|x| x.num_docs()) .map(SegmentMeta::num_docs)
.enumerate() .enumerate()
.collect::<Vec<(usize, u32)>>(); .collect::<Vec<(usize, u32)>>();

View File

@@ -0,0 +1,65 @@
use census::{Inventory, TrackedObject};
use std::collections::HashSet;
use Opstamp;
use SegmentId;
/// Registry of the merge operations that are being tracked.
///
/// This is a thin wrapper around a `census::Inventory` of
/// `InnerMergeOperation`. Entries are registered by `MergeOperation::new`.
#[derive(Default)]
pub struct MergeOperationInventory(Inventory<InnerMergeOperation>);
impl MergeOperationInventory {
    /// Returns the ids of every segment referenced by a merge operation
    /// listed in the inventory.
    pub fn segment_in_merge(&self) -> HashSet<SegmentId> {
        let mut ids_in_merge = HashSet::default();
        for tracked_op in self.0.list() {
            ids_in_merge.extend(tracked_op.segment_ids.iter().cloned());
        }
        ids_in_merge
    }
}
/// A `MergeOperation` has two roles.
///
/// The first role is to carry all of the information required to describe
/// a merge:
/// - `target_opstamp` is the opstamp up to which we want to consume the
/// delete queue and reflect their deletes.
/// - `segment_ids` is the list of segments to be merged.
///
/// The second role is to keep track of the fact that these
/// segments are in merge, and to avoid starting a merge operation that
/// may conflict with this one.
///
/// This works by tracking merge operations. When computing
/// merge candidates, we simply list the tracked merge operations and remove
/// their segments from the possible merge candidates.
pub struct MergeOperation {
inner: TrackedObject<InnerMergeOperation>,
}
/// Data describing a merge; this is the value stored in the
/// `Inventory` wrapped by `MergeOperationInventory`.
struct InnerMergeOperation {
// Exposed through `MergeOperation::target_opstamp`.
target_opstamp: Opstamp,
// Ids of the segments taking part in the merge; exposed through
// `MergeOperation::segment_ids` and read by
// `MergeOperationInventory::segment_in_merge`.
segment_ids: Vec<SegmentId>,
}
impl MergeOperation {
    /// Creates a merge operation over `segment_ids` with the given
    /// `target_opstamp`, and registers it in `inventory`.
    ///
    /// The returned value holds the tracked handle handed back by the
    /// inventory.
    pub fn new(
        inventory: &MergeOperationInventory,
        target_opstamp: Opstamp,
        segment_ids: Vec<SegmentId>,
    ) -> MergeOperation {
        let inner = inventory.0.track(InnerMergeOperation {
            target_opstamp,
            segment_ids,
        });
        MergeOperation { inner }
    }

    /// Returns the target opstamp this operation was created with.
    pub fn target_opstamp(&self) -> Opstamp {
        self.inner.target_opstamp
    }

    /// Returns the ids of the segments this operation was created with.
    pub fn segment_ids(&self) -> &[SegmentId] {
        self.inner.segment_ids.as_slice()
    }
}

View File

@@ -11,7 +11,7 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
/// ///
/// Every time a the list of segments changes, the segment updater /// Every time a the list of segments changes, the segment updater
/// asks the merge policy if some segments should be merged. /// asks the merge policy if some segments should be merged.
pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug { pub trait MergePolicy: marker::Send + marker::Sync + Debug {
/// Given the list of segment metas, returns the list of merge candidates. /// Given the list of segment metas, returns the list of merge candidates.
/// ///
/// This call happens on the segment updater thread, and will block /// This call happens on the segment updater thread, and will block
@@ -19,21 +19,6 @@ pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>; fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
} }
/// MergePolicyClone
pub trait MergePolicyClone {
/// Returns a boxed clone of the MergePolicy.
fn box_clone(&self) -> Box<MergePolicy>;
}
impl<T> MergePolicyClone for T
where
T: 'static + MergePolicy + Clone,
{
fn box_clone(&self) -> Box<MergePolicy> {
Box::new(self.clone())
}
}
/// Never merge segments. /// Never merge segments.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct NoMergePolicy; pub struct NoMergePolicy;

View File

@@ -1,7 +1,9 @@
use common::MAX_DOC_LIMIT;
use core::Segment; use core::Segment;
use core::SegmentReader; use core::SegmentReader;
use core::SerializableSegment; use core::SerializableSegment;
use docset::DocSet; use docset::DocSet;
use fastfield::BytesFastFieldReader;
use fastfield::DeleteBitSet; use fastfield::DeleteBitSet;
use fastfield::FastFieldReader; use fastfield::FastFieldReader;
use fastfield::FastFieldSerializer; use fastfield::FastFieldSerializer;
@@ -23,6 +25,7 @@ use termdict::TermMerger;
use termdict::TermOrdinal; use termdict::TermOrdinal;
use DocId; use DocId;
use Result; use Result;
use TantivyError;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 { fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
let mut total_tokens = 0u64; let mut total_tokens = 0u64;
@@ -40,13 +43,15 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens(); total_tokens += reader.inverted_index(field).total_num_tokens();
} }
} }
total_tokens + count total_tokens
.iter() + count
.cloned() .iter()
.enumerate() .cloned()
.map(|(fieldnorm_ord, count)| { .enumerate()
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8)) .map(|(fieldnorm_ord, count)| {
}).sum::<u64>() count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
})
.sum::<u64>()
} }
pub struct IndexMerger { pub struct IndexMerger {
@@ -68,7 +73,7 @@ fn compute_min_max_val(
// some deleted documents, // some deleted documents,
// we need to recompute the max / min // we need to recompute the max / min
(0..max_doc) (0..max_doc)
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id)) .filter(|doc_id| delete_bitset.is_alive(*doc_id))
.map(|doc_id| u64_reader.get(doc_id)) .map(|doc_id| u64_reader.get(doc_id))
.minmax() .minmax()
.into_option() .into_option()
@@ -148,6 +153,14 @@ impl IndexMerger {
readers.push(reader); readers.push(reader);
} }
} }
if max_doc >= MAX_DOC_LIMIT {
let err_msg = format!(
"The segment resulting from this merge would have {} docs,\
which exceeds the limit {}.",
max_doc, MAX_DOC_LIMIT
);
return Err(TantivyError::InvalidArgument(err_msg));
}
Ok(IndexMerger { Ok(IndexMerger {
schema, schema,
readers, readers,
@@ -192,17 +205,17 @@ impl IndexMerger {
fast_field_serializer, fast_field_serializer,
)?; )?;
} }
FieldType::U64(ref options) | FieldType::I64(ref options) => { FieldType::U64(ref options)
match options.get_fastfield_cardinality() { | FieldType::I64(ref options)
Some(Cardinality::SingleValue) => { | FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
self.write_single_fast_field(field, fast_field_serializer)?; Some(Cardinality::SingleValue) => {
} self.write_single_fast_field(field, fast_field_serializer)?;
Some(Cardinality::MultiValues) => {
self.write_multi_fast_field(field, fast_field_serializer)?;
}
None => {}
} }
} Some(Cardinality::MultiValues) => {
self.write_multi_fast_field(field, fast_field_serializer)?;
}
None => {}
},
FieldType::Str(_) => { FieldType::Str(_) => {
// We don't handle str fast field for the moment // We don't handle str fast field for the moment
// They can be implemented using what is done // They can be implemented using what is done
@@ -227,7 +240,10 @@ impl IndexMerger {
let mut max_value = u64::min_value(); let mut max_value = u64::min_value();
for reader in &self.readers { for reader in &self.readers {
let u64_reader: FastFieldReader<u64> = reader.fast_field_reader(field)?; let u64_reader: FastFieldReader<u64> = reader
.fast_fields()
.u64_lenient(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
if let Some((seg_min_val, seg_max_val)) = if let Some((seg_min_val, seg_max_val)) =
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset()) compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
{ {
@@ -270,24 +286,28 @@ impl IndexMerger {
fast_field_serializer: &mut FastFieldSerializer, fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> { ) -> Result<()> {
let mut total_num_vals = 0u64; let mut total_num_vals = 0u64;
let mut u64s_readers: Vec<MultiValueIntFastFieldReader<u64>> = Vec::new();
// In the first pass, we compute the total number of vals. // In the first pass, we compute the total number of vals.
// //
// This is required by the bitpacker, as it needs to know // This is required by the bitpacker, as it needs to know
// what should be the bit length use for bitpacking. // what should be the bit length use for bitpacking.
for reader in &self.readers { for reader in &self.readers {
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?; let u64s_reader = reader.fast_fields()
.u64s_lenient(field)
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
if let Some(delete_bitset) = reader.delete_bitset() { if let Some(delete_bitset) = reader.delete_bitset() {
for doc in 0u32..reader.max_doc() { for doc in 0u32..reader.max_doc() {
if !delete_bitset.is_deleted(doc) { if delete_bitset.is_alive(doc) {
let start = idx_reader.get(doc); let num_vals = u64s_reader.num_vals(doc) as u64;
let end = idx_reader.get(doc + 1); total_num_vals += num_vals;
total_num_vals += end - start;
} }
} }
} else { } else {
total_num_vals += idx_reader.max_value(); total_num_vals += u64s_reader.total_num_vals();
} }
u64s_readers.push(u64s_reader);
} }
// We can now create our `idx` serializer, and in a second pass, // We can now create our `idx` serializer, and in a second pass,
@@ -295,13 +315,10 @@ impl IndexMerger {
let mut serialize_idx = let mut serialize_idx =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?; fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
let mut idx = 0; let mut idx = 0;
for reader in &self.readers { for (segment_reader, u64s_reader) in self.readers.iter().zip(&u64s_readers) {
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?; for doc in segment_reader.doc_ids_alive() {
for doc in reader.doc_ids_alive() {
serialize_idx.add_val(idx)?; serialize_idx.add_val(idx)?;
let start = idx_reader.get(doc); idx += u64s_reader.num_vals(doc) as u64;
let end = idx_reader.get(doc + 1);
idx += end - start;
} }
} }
serialize_idx.add_val(idx)?; serialize_idx.add_val(idx)?;
@@ -332,8 +349,10 @@ impl IndexMerger {
for (segment_ord, segment_reader) in self.readers.iter().enumerate() { for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
let term_ordinal_mapping: &[TermOrdinal] = let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(segment_ord); term_ordinal_mappings.get_segment(segment_ord);
let ff_reader: MultiValueIntFastFieldReader<u64> = let ff_reader: MultiValueIntFastFieldReader<u64> = segment_reader
segment_reader.multi_fast_field_reader(field)?; .fast_fields()
.u64s(field)
.expect("Could not find multivalued u64 fast value reader.");
// TODO optimize if no deletes // TODO optimize if no deletes
for doc in segment_reader.doc_ids_alive() { for doc in segment_reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals); ff_reader.get_vals(doc, &mut vals);
@@ -365,6 +384,8 @@ impl IndexMerger {
let mut vals = Vec::with_capacity(100); let mut vals = Vec::with_capacity(100);
let mut ff_readers = Vec::new();
// Our values are bitpacked and we need to know what should be // Our values are bitpacked and we need to know what should be
// our bitwidth and our minimum value before serializing any values. // our bitwidth and our minimum value before serializing any values.
// //
@@ -373,7 +394,10 @@ impl IndexMerger {
// maximum value and initialize our Serializer. // maximum value and initialize our Serializer.
for reader in &self.readers { for reader in &self.readers {
let ff_reader: MultiValueIntFastFieldReader<u64> = let ff_reader: MultiValueIntFastFieldReader<u64> =
reader.multi_fast_field_reader(field)?; reader.fast_fields().u64s_lenient(field).expect(
"Failed to find multivalued fast field reader. This is a bug in \
tantivy. Please report.",
);
for doc in reader.doc_ids_alive() { for doc in reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals); ff_reader.get_vals(doc, &mut vals);
for &val in &vals { for &val in &vals {
@@ -381,6 +405,7 @@ impl IndexMerger {
max_value = cmp::max(val, max_value); max_value = cmp::max(val, max_value);
} }
} }
ff_readers.push(ff_reader);
// TODO optimize when no deletes // TODO optimize when no deletes
} }
@@ -393,9 +418,7 @@ impl IndexMerger {
{ {
let mut serialize_vals = fast_field_serializer let mut serialize_vals = fast_field_serializer
.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?; .new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
for reader in &self.readers { for (reader, ff_reader) in self.readers.iter().zip(ff_readers) {
let ff_reader: MultiValueIntFastFieldReader<u64> =
reader.multi_fast_field_reader(field)?;
// TODO optimize if no deletes // TODO optimize if no deletes
for doc in reader.doc_ids_alive() { for doc in reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals); ff_reader.get_vals(doc, &mut vals);
@@ -414,19 +437,53 @@ impl IndexMerger {
field: Field, field: Field,
fast_field_serializer: &mut FastFieldSerializer, fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> { ) -> Result<()> {
self.write_fast_field_idx(field, fast_field_serializer)?; let mut total_num_vals = 0u64;
let mut bytes_readers: Vec<BytesFastFieldReader> = Vec::new();
for reader in &self.readers {
let bytes_reader = reader.fast_fields().bytes(field).expect(
"Failed to find bytes fast field reader. This is a bug in tantivy, please report.",
);
if let Some(delete_bitset) = reader.delete_bitset() {
for doc in 0u32..reader.max_doc() {
if delete_bitset.is_alive(doc) {
let num_vals = bytes_reader.get_bytes(doc).len() as u64;
total_num_vals += num_vals;
}
}
} else {
total_num_vals += bytes_reader.total_num_bytes() as u64;
}
bytes_readers.push(bytes_reader);
}
{
// We can now create our `idx` serializer, and in a second pass,
// can effectively push the different indexes.
let mut serialize_idx =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
let mut idx = 0;
for (segment_reader, bytes_reader) in self.readers.iter().zip(&bytes_readers) {
for doc in segment_reader.doc_ids_alive() {
serialize_idx.add_val(idx)?;
idx += bytes_reader.get_bytes(doc).len() as u64;
}
}
serialize_idx.add_val(idx)?;
serialize_idx.close_field()?;
}
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1)?; let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1)?;
for reader in &self.readers { for segment_reader in &self.readers {
let bytes_reader = reader.bytes_fast_field_reader(field)?; let bytes_reader = segment_reader.fast_fields().bytes(field)
.expect("Failed to find bytes field in fast field reader. This is a bug in tantivy. Please report.");
// TODO: optimize if no deletes // TODO: optimize if no deletes
for doc in reader.doc_ids_alive() { for doc in segment_reader.doc_ids_alive() {
let val = bytes_reader.get_val(doc); let val = bytes_reader.get_bytes(doc);
serialize_vals.write_all(val)?; serialize_vals.write_all(val)?;
} }
} }
serialize_vals.flush()?; serialize_vals.flush()?;
Ok(()) Ok(())
} }
@@ -523,7 +580,8 @@ impl IndexMerger {
} }
} }
None None
}).collect(); })
.collect();
// At this point, `segment_postings` contains the posting list // At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term. // of all of the segments containing the given term.
@@ -614,7 +672,7 @@ impl IndexMerger {
store_writer.store(&doc)?; store_writer.store(&doc)?;
} }
} else { } else {
store_writer.stack(store_reader)?; store_writer.stack(&store_reader)?;
} }
} }
Ok(()) Ok(())
@@ -635,10 +693,9 @@ impl SerializableSegment for IndexMerger {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use collector::chain;
use collector::tests::TestCollector; use collector::tests::TestCollector;
use collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector}; use collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
use collector::FacetCollector; use collector::{Count, FacetCollector};
use core::Index; use core::Index;
use futures::Future; use futures::Future;
use query::AllQuery; use query::AllQuery;
@@ -647,10 +704,12 @@ mod tests {
use schema; use schema;
use schema::Cardinality; use schema::Cardinality;
use schema::Document; use schema::Document;
use schema::Facet;
use schema::IndexRecordOption; use schema::IndexRecordOption;
use schema::IntOptions; use schema::IntOptions;
use schema::Term; use schema::Term;
use schema::TextFieldIndexing; use schema::TextFieldIndexing;
use schema::INDEXED;
use std::io::Cursor; use std::io::Cursor;
use DocAddress; use DocAddress;
use IndexWriter; use IndexWriter;
@@ -658,19 +717,22 @@ mod tests {
#[test] #[test]
fn test_index_merger_no_deletes() { fn test_index_merger_no_deletes() {
let mut schema_builder = schema::SchemaBuilder::default(); let mut schema_builder = schema::Schema::builder();
let text_fieldtype = schema::TextOptions::default() let text_fieldtype = schema::TextOptions::default()
.set_indexing_options( .set_indexing_options(
TextFieldIndexing::default() TextFieldIndexing::default()
.set_tokenizer("default") .set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs), .set_index_option(IndexRecordOption::WithFreqs),
).set_stored(); )
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype); let text_field = schema_builder.add_text_field("text", text_fieldtype);
let date_field = schema_builder.add_date_field("date", INDEXED);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue); let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype); let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes"); let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
let curr_time = chrono::Utc::now();
let add_score_bytes = |doc: &mut Document, score: u32| { let add_score_bytes = |doc: &mut Document, score: u32| {
let mut bytes = Vec::new(); let mut bytes = Vec::new();
bytes bytes
@@ -687,6 +749,7 @@ mod tests {
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_text(text_field, "af b"); doc.add_text(text_field, "af b");
doc.add_u64(score_field, 3); doc.add_u64(score_field, 3);
doc.add_date(date_field, &curr_time);
add_score_bytes(&mut doc, 3); add_score_bytes(&mut doc, 3);
index_writer.add_document(doc); index_writer.add_document(doc);
} }
@@ -712,6 +775,7 @@ mod tests {
{ {
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_text(text_field, "af b"); doc.add_text(text_field, "af b");
doc.add_date(date_field, &curr_time);
doc.add_u64(score_field, 11); doc.add_u64(score_field, 11);
add_score_bytes(&mut doc, 11); add_score_bytes(&mut doc, 11);
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -739,30 +803,39 @@ mod tests {
index_writer.wait_merging_threads().unwrap(); index_writer.wait_merging_threads().unwrap();
} }
{ {
index.load_searchers().unwrap(); reader.reload().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let get_doc_ids = |terms: Vec<Term>| { let get_doc_ids = |terms: Vec<Term>| {
let mut collector = TestCollector::default();
let query = BooleanQuery::new_multiterms_query(terms); let query = BooleanQuery::new_multiterms_query(terms);
assert!(searcher.search(&query, &mut collector).is_ok()); let top_docs = searcher.search(&query, &TestCollector).unwrap();
collector.docs() top_docs.docs().to_vec()
}; };
{ {
assert_eq!( assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]), get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2, 4] vec![DocAddress(0, 1), DocAddress(0, 2), DocAddress(0, 4)]
); );
assert_eq!( assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]), get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0, 3] vec![DocAddress(0, 0), DocAddress(0, 3)]
); );
assert_eq!( assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "g")]), get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
vec![4] vec![DocAddress(0, 4)]
); );
assert_eq!( assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]), get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2, 3, 4] vec![
DocAddress(0, 0),
DocAddress(0, 1),
DocAddress(0, 2),
DocAddress(0, 3),
DocAddress(0, 4)
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)]),
vec![DocAddress(0, 0), DocAddress(0, 3)]
); );
} }
{ {
@@ -788,17 +861,18 @@ mod tests {
{ {
let get_fast_vals = |terms: Vec<Term>| { let get_fast_vals = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms); let query = BooleanQuery::new_multiterms_query(terms);
let mut collector = FastFieldTestCollector::for_field(score_field); searcher
assert!(searcher.search(&query, &mut collector).is_ok()); .search(&query, &FastFieldTestCollector::for_field(score_field))
collector.vals() .unwrap()
}; };
let get_fast_vals_bytes = |terms: Vec<Term>| { let get_fast_vals_bytes = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms); let query = BooleanQuery::new_multiterms_query(terms);
let mut collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
searcher searcher
.search(&query, &mut collector) .search(
.expect("failed to search"); &query,
collector.vals() &BytesFastFieldTestCollector::for_field(bytes_score_field),
)
.expect("failed to search")
}; };
assert_eq!( assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]), get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
@@ -814,34 +888,27 @@ mod tests {
#[test] #[test]
fn test_index_merger_with_deletes() { fn test_index_merger_with_deletes() {
let mut schema_builder = schema::SchemaBuilder::default(); let mut schema_builder = schema::Schema::builder();
let text_fieldtype = schema::TextOptions::default() let text_fieldtype = schema::TextOptions::default()
.set_indexing_options( .set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
).set_stored(); )
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype); let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue); let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype); let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes"); let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let reader = index.reader().unwrap();
let search_term = |searcher: &Searcher, term: Term| { let search_term = |searcher: &Searcher, term: Term| {
let mut collector = FastFieldTestCollector::for_field(score_field); let collector = FastFieldTestCollector::for_field(score_field);
let mut bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field); let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
let term_query = TermQuery::new(term, IndexRecordOption::Basic); let term_query = TermQuery::new(term, IndexRecordOption::Basic);
let (scores, bytes) = searcher
{ .search(&term_query, &(collector, bytes_collector))
let mut combined_collector = .unwrap();
chain().push(&mut collector).push(&mut bytes_collector); let mut score_bytes = Cursor::new(bytes);
searcher
.search(&term_query, &mut combined_collector)
.unwrap();
}
let scores = collector.vals();
let mut score_bytes = Cursor::new(bytes_collector.vals());
for &score in &scores { for &score in &scores {
assert_eq!(score as u32, score_bytes.read_u32::<BigEndian>().unwrap()); assert_eq!(score as u32, score_bytes.read_u32::<BigEndian>().unwrap());
} }
@@ -854,24 +921,24 @@ mod tests {
{ {
// a first commit // a first commit
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "a b d", text_field => "a b d",
score_field => 1u64, score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1], bytes_score_field => vec![0u8, 0, 0, 1],
)); ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "b c", text_field => "b c",
score_field => 2u64, score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2], bytes_score_field => vec![0u8, 0, 0, 2],
)); ));
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "c d", text_field => "c d",
score_field => 3u64, score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3], bytes_score_field => vec![0u8, 0, 0, 3],
)); ));
index_writer.commit().expect("committed"); index_writer.commit().expect("committed");
index.load_searchers().unwrap(); reader.reload().unwrap();
let ref searcher = *index.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3); assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
@@ -895,37 +962,37 @@ mod tests {
{ {
// a second commit // a second commit
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "a d e", text_field => "a d e",
score_field => 4_000u64, score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4], bytes_score_field => vec![0u8, 0, 0, 4],
)); ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "e f", text_field => "e f",
score_field => 5_000u64, score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5], bytes_score_field => vec![0u8, 0, 0, 5],
)); ));
index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f")); index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "f g", text_field => "f g",
score_field => 6_000u64, score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112], bytes_score_field => vec![0u8, 0, 23, 112],
)); ));
index_writer.add_document(doc!( index_writer.add_document(doc!(
text_field => "g h", text_field => "g h",
score_field => 7_000u64, score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88], bytes_score_field => vec![0u8, 0, 27, 88],
)); ));
index_writer.commit().expect("committed"); index_writer.commit().expect("committed");
index.load_searchers().unwrap(); reader.reload().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 2); assert_eq!(searcher.segment_readers().len(), 2);
assert_eq!(searcher.num_docs(), 3); assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 1); assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3); assert_eq!(searcher.segment_readers()[0].max_doc(), 4);
assert_eq!(searcher.segment_readers()[1].num_docs(), 2); assert_eq!(searcher.segment_readers()[1].num_docs(), 1);
assert_eq!(searcher.segment_readers()[1].max_doc(), 4); assert_eq!(searcher.segment_readers()[1].max_doc(), 3);
assert_eq!( assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")), search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec empty_vec
@@ -957,17 +1024,19 @@ mod tests {
let score_field_reader = searcher let score_field_reader = searcher
.segment_reader(0) .segment_reader(0)
.fast_field_reader::<u64>(score_field) .fast_fields()
.unwrap(); .u64(score_field)
assert_eq!(score_field_reader.min_value(), 1);
assert_eq!(score_field_reader.max_value(), 3);
let score_field_reader = searcher
.segment_reader(1)
.fast_field_reader::<u64>(score_field)
.unwrap(); .unwrap();
assert_eq!(score_field_reader.min_value(), 4000); assert_eq!(score_field_reader.min_value(), 4000);
assert_eq!(score_field_reader.max_value(), 7000); assert_eq!(score_field_reader.max_value(), 7000);
let score_field_reader = searcher
.segment_reader(1)
.fast_fields()
.u64(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 1);
assert_eq!(score_field_reader.max_value(), 3);
} }
{ {
// merging the segments // merging the segments
@@ -979,8 +1048,8 @@ mod tests {
.expect("Failed to initiate merge") .expect("Failed to initiate merge")
.wait() .wait()
.expect("Merging failed"); .expect("Merging failed");
index.load_searchers().unwrap(); reader.reload().unwrap();
let ref searcher = *index.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3); assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3); assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
@@ -1015,7 +1084,8 @@ mod tests {
); );
let score_field_reader = searcher let score_field_reader = searcher
.segment_reader(0) .segment_reader(0)
.fast_field_reader::<u64>(score_field) .fast_fields()
.u64(score_field)
.unwrap(); .unwrap();
assert_eq!(score_field_reader.min_value(), 3); assert_eq!(score_field_reader.min_value(), 3);
assert_eq!(score_field_reader.max_value(), 7000); assert_eq!(score_field_reader.max_value(), 7000);
@@ -1025,8 +1095,8 @@ mod tests {
index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); reader.reload().unwrap();
let ref searcher = *index.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1061,7 +1131,8 @@ mod tests {
); );
let score_field_reader = searcher let score_field_reader = searcher
.segment_reader(0) .segment_reader(0)
.fast_field_reader::<u64>(score_field) .fast_fields()
.u64(score_field)
.unwrap(); .unwrap();
assert_eq!(score_field_reader.min_value(), 3); assert_eq!(score_field_reader.min_value(), 3);
assert_eq!(score_field_reader.max_value(), 7000); assert_eq!(score_field_reader.max_value(), 7000);
@@ -1076,9 +1147,9 @@ mod tests {
.expect("Failed to initiate merge") .expect("Failed to initiate merge")
.wait() .wait()
.expect("Merging failed"); .expect("Merging failed");
index.load_searchers().unwrap(); reader.reload().unwrap();
let ref searcher = *index.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2); assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2); assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1113,7 +1184,8 @@ mod tests {
); );
let score_field_reader = searcher let score_field_reader = searcher
.segment_reader(0) .segment_reader(0)
.fast_field_reader::<u64>(score_field) .fast_fields()
.u64(score_field)
.unwrap(); .unwrap();
assert_eq!(score_field_reader.min_value(), 6000); assert_eq!(score_field_reader.min_value(), 6000);
assert_eq!(score_field_reader.max_value(), 7000); assert_eq!(score_field_reader.max_value(), 7000);
@@ -1122,30 +1194,27 @@ mod tests {
{ {
// Test removing all docs // Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g")); index_writer.delete_term(Term::from_field_text(text_field, "g"));
index_writer.commit().unwrap();
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
index_writer reader.reload().unwrap();
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1); assert!(segment_ids.is_empty());
assert!(searcher.segment_readers().is_empty());
assert_eq!(searcher.num_docs(), 0); assert_eq!(searcher.num_docs(), 0);
} }
} }
#[test] #[test]
fn test_merge_facets() { fn test_merge_facets() {
let mut schema_builder = schema::SchemaBuilder::default(); let mut schema_builder = schema::Schema::builder();
let facet_field = schema_builder.add_facet_field("facet"); let facet_field = schema_builder.add_facet_field("facet");
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
use schema::Facet; let reader = index.reader().unwrap();
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| { let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default(); let mut doc = Document::default();
for facet in doc_facets { for facet in doc_facets {
@@ -1172,20 +1241,16 @@ mod tests {
index_doc(&mut index_writer, &["/top/e", "/top/f"]); index_doc(&mut index_writer, &["/top/e", "/top/f"]);
index_writer.commit().expect("committed"); index_writer.commit().expect("committed");
} }
index.load_searchers().unwrap();
reader.reload().unwrap();
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| { let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
let searcher = index.searcher(); let searcher = reader.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field); let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top")); facet_collector.add_facet(Facet::from("/top"));
use collector::{CountCollector, MultiCollector}; let (count, facet_counts) = searcher
let mut count_collector = CountCollector::default(); .search(&AllQuery, &(Count, facet_collector))
{ .unwrap();
let mut multi_collectors = assert_eq!(count, expected_num_docs);
MultiCollector::from(vec![&mut count_collector, &mut facet_collector]);
searcher.search(&AllQuery, &mut multi_collectors).unwrap();
}
assert_eq!(count_collector.count(), expected_num_docs);
let facet_counts = facet_collector.harvest();
let facets: Vec<(String, u64)> = facet_counts let facets: Vec<(String, u64)> = facet_counts
.get("/top") .get("/top")
.map(|(facet, count)| (facet.to_string(), count)) .map(|(facet, count)| (facet.to_string(), count))
@@ -1209,21 +1274,19 @@ mod tests {
("/top/f", 1), ("/top/f", 1),
], ],
); );
// Merging the segments // Merging the segments
{ {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer index_writer
.merge(&segment_ids) .merge(&segment_ids)
.expect("Failed to initiate merge") .expect("Failed to initiate merge")
.wait() .wait()
.expect("Merging failed"); .expect("Merging failed");
index_writer.wait_merging_threads().unwrap(); index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
index.load_searchers().unwrap();
test_searcher( test_searcher(
11, 11,
&[ &[
@@ -1239,12 +1302,12 @@ mod tests {
// Deleting one term // Deleting one term
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]); let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet); let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term); index_writer.delete_term(facet_term);
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); reader.reload().unwrap();
test_searcher( test_searcher(
9, 9,
&[ &[
@@ -1259,17 +1322,45 @@ mod tests {
} }
} }
/// Regression test: merging segments while a delete is still uncommitted
/// must not make that delete visible to readers.
#[test]
fn test_bug_merge() {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// Index the same value twice, with one commit after each document.
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 2);
// Queue a delete for the term carried by both documents, without committing it.
index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
// Merge every searchable segment and block until the merge has finished.
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
reader.reload().unwrap();
// Commit has not been called since the delete was queued, so both
// documents should still be visible.
assert_eq!(reader.searcher().num_docs(), 2);
}
#[test] #[test]
fn test_merge_multivalued_int_fields_all_deleted() { fn test_merge_multivalued_int_fields_all_deleted() {
let mut schema_builder = schema::SchemaBuilder::default(); let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default() let int_options = IntOptions::default()
.set_fast(Cardinality::MultiValues) .set_fast(Cardinality::MultiValues)
.set_indexed(); .set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options); let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_u64(int_field, 1); doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone()); index_writer.add_document(doc.clone());
@@ -1277,32 +1368,34 @@ mod tests {
index_writer.add_document(doc); index_writer.add_document(doc);
index_writer.commit().expect("commit failed"); index_writer.commit().expect("commit failed");
index_writer.delete_term(Term::from_field_u64(int_field, 1)); index_writer.delete_term(Term::from_field_u64(int_field, 1));
index_writer.commit().expect("commit failed");
}
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 0);
// Merging the segments
{
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer index_writer
.merge(&segment_ids) .merge(&segment_ids)
.expect("Failed to initiate merge") .expect("Failed to initiate merge")
.wait() .wait()
.expect("Merging failed"); .expect("Merging failed");
// assert delete has not been committed
reader.reload().expect("failed to load searcher 1");
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 2);
index_writer.commit().unwrap();
index_writer.wait_merging_threads().unwrap(); index_writer.wait_merging_threads().unwrap();
} }
index.load_searchers().unwrap();
let searcher = index.searcher(); reader.reload().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 0); assert_eq!(searcher.num_docs(), 0);
} }
#[test] #[test]
fn test_merge_multivalued_int_fields() { fn test_merge_multivalued_int_fields_simple() {
let mut schema_builder = schema::SchemaBuilder::default(); let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default() let int_options = IntOptions::default()
.set_fast(Cardinality::MultiValues) .set_fast(Cardinality::MultiValues)
.set_indexed(); .set_indexed();
@@ -1310,7 +1403,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| { let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default(); let mut doc = Document::default();
for &val in int_vals { for &val in int_vals {
@@ -1318,7 +1411,6 @@ mod tests {
} }
index_writer.add_document(doc); index_writer.add_document(doc);
}; };
index_doc(&mut index_writer, &[1, 2]); index_doc(&mut index_writer, &[1, 2]);
index_doc(&mut index_writer, &[1, 2, 3]); index_doc(&mut index_writer, &[1, 2, 3]);
index_doc(&mut index_writer, &[4, 5]); index_doc(&mut index_writer, &[4, 5]);
@@ -1327,24 +1419,19 @@ mod tests {
index_doc(&mut index_writer, &[3]); index_doc(&mut index_writer, &[3]);
index_doc(&mut index_writer, &[17]); index_doc(&mut index_writer, &[17]);
index_writer.commit().expect("committed"); index_writer.commit().expect("committed");
index_doc(&mut index_writer, &[20]); index_doc(&mut index_writer, &[20]);
index_writer.commit().expect("committed"); index_writer.commit().expect("committed");
index_doc(&mut index_writer, &[28, 27]); index_doc(&mut index_writer, &[28, 27]);
index_doc(&mut index_writer, &[1_000]); index_doc(&mut index_writer, &[1_000]);
index_writer.commit().expect("committed"); index_writer.commit().expect("committed");
} }
index.load_searchers().unwrap(); let reader = index.reader().unwrap();
let searcher = reader.searcher();
let searcher = index.searcher();
let mut vals: Vec<u64> = Vec::new(); let mut vals: Vec<u64> = Vec::new();
{ {
let segment = searcher.segment_reader(0u32); let segment = searcher.segment_reader(0u32);
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap(); let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
ff_reader.get_vals(0, &mut vals); ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[1, 2]); assert_eq!(&vals, &[1, 2]);
@@ -1368,16 +1455,18 @@ mod tests {
assert_eq!(&vals, &[17]); assert_eq!(&vals, &[17]);
} }
{ println!(
let segment = searcher.segment_reader(1u32); "{:?}",
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap(); searcher
ff_reader.get_vals(0, &mut vals); .segment_readers()
assert_eq!(&vals, &[20]); .iter()
} .map(|reader| reader.max_doc())
.collect::<Vec<_>>()
);
{ {
let segment = searcher.segment_reader(2u32); let segment = searcher.segment_reader(1u32);
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap(); let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
ff_reader.get_vals(0, &mut vals); ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[28, 27]); assert_eq!(&vals, &[28, 27]);
@@ -1385,26 +1474,42 @@ mod tests {
assert_eq!(&vals, &[1_000]); assert_eq!(&vals, &[1_000]);
} }
{
let segment = searcher.segment_reader(2u32);
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[20]);
}
// Merging the segments // Merging the segments
{ {
let segment_ids = index let segment_ids = index
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer index_writer
.merge(&segment_ids) .merge(&segment_ids)
.expect("Failed to initiate merge") .expect("Failed to initiate merge")
.wait() .wait()
.expect("Merging failed"); .expect("Merging failed");
index_writer.wait_merging_threads().unwrap(); index_writer
.wait_merging_threads()
.expect("Wait for merging threads");
} }
reader.reload().expect("Load searcher");
index.load_searchers().unwrap();
{ {
let searcher = index.searcher(); let searcher = reader.searcher();
println!(
"{:?}",
searcher
.segment_readers()
.iter()
.map(|reader| reader.max_doc())
.collect::<Vec<_>>()
);
let segment = searcher.segment_reader(0u32); let segment = searcher.segment_reader(0u32);
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap(); let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
ff_reader.get_vals(0, &mut vals); ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[1, 2]); assert_eq!(&vals, &[1, 2]);
@@ -1428,13 +1533,13 @@ mod tests {
assert_eq!(&vals, &[17]); assert_eq!(&vals, &[17]);
ff_reader.get_vals(7, &mut vals); ff_reader.get_vals(7, &mut vals);
assert_eq!(&vals, &[20]);
ff_reader.get_vals(8, &mut vals);
assert_eq!(&vals, &[28, 27]); assert_eq!(&vals, &[28, 27]);
ff_reader.get_vals(9, &mut vals); ff_reader.get_vals(8, &mut vals);
assert_eq!(&vals, &[1_000]); assert_eq!(&vals, &[1_000]);
ff_reader.get_vals(9, &mut vals);
assert_eq!(&vals, &[20]);
} }
} }
} }

View File

@@ -1,8 +1,9 @@
pub mod delete_queue; pub mod delete_queue;
mod directory_lock;
mod doc_opstamp_mapping; mod doc_opstamp_mapping;
pub mod index_writer; pub mod index_writer;
mod log_merge_policy; mod log_merge_policy;
mod merge_operation;
pub mod merge_policy; pub mod merge_policy;
pub mod merger; pub mod merger;
pub mod operation; pub mod operation;
@@ -15,14 +16,12 @@ pub mod segment_updater;
mod segment_writer; mod segment_writer;
mod stamper; mod stamper;
pub(crate) use self::directory_lock::DirectoryLock;
pub use self::directory_lock::LockType;
pub use self::index_writer::IndexWriter; pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy; pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::{MergeOperation, MergeOperationInventory};
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::prepared_commit::PreparedCommit; pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::{SegmentEntry, SegmentState}; pub use self::segment_entry::SegmentEntry;
pub use self::segment_manager::SegmentManager; pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer; pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_writer::SegmentWriter; pub use self::segment_writer::SegmentWriter;

View File

@@ -1,16 +1,24 @@
use schema::Document; use schema::Document;
use schema::Term; use schema::Term;
use Opstamp;
/// Timestamped Delete operation. /// Timestamped Delete operation.
#[derive(Clone, Eq, PartialEq, Debug)] #[derive(Clone, Eq, PartialEq, Debug)]
pub struct DeleteOperation { pub struct DeleteOperation {
pub opstamp: u64, pub opstamp: Opstamp,
pub term: Term, pub term: Term,
} }
/// Timestamped Add operation. /// Timestamped Add operation.
#[derive(Eq, PartialEq, Debug)] #[derive(Eq, PartialEq, Debug)]
pub struct AddOperation { pub struct AddOperation {
pub opstamp: u64, pub opstamp: Opstamp,
pub document: Document, pub document: Document,
} }
/// UserOperation is an enum type that encapsulates other operation types.
///
/// Unlike `AddOperation` and `DeleteOperation`, the variants carry no opstamp.
#[derive(Eq, PartialEq, Debug)]
pub enum UserOperation {
/// An add operation, carrying the `Document` to add.
Add(Document),
/// A delete operation, carrying the `Term` that identifies what to delete.
Delete(Term),
}

View File

@@ -1,15 +1,16 @@
use super::IndexWriter; use super::IndexWriter;
use Opstamp;
use Result; use Result;
/// A prepared commit /// A prepared commit
pub struct PreparedCommit<'a> { pub struct PreparedCommit<'a> {
index_writer: &'a mut IndexWriter, index_writer: &'a mut IndexWriter,
payload: Option<String>, payload: Option<String>,
opstamp: u64, opstamp: Opstamp,
} }
impl<'a> PreparedCommit<'a> { impl<'a> PreparedCommit<'a> {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit { pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit {
PreparedCommit { PreparedCommit {
index_writer, index_writer,
payload: None, payload: None,
@@ -17,7 +18,7 @@ impl<'a> PreparedCommit<'a> {
} }
} }
pub fn opstamp(&self) -> u64 { pub fn opstamp(&self) -> Opstamp {
self.opstamp self.opstamp
} }
@@ -25,11 +26,11 @@ impl<'a> PreparedCommit<'a> {
self.payload = Some(payload.to_string()) self.payload = Some(payload.to_string())
} }
pub fn abort(self) -> Result<()> { pub fn abort(self) -> Result<Opstamp> {
self.index_writer.rollback() self.index_writer.rollback()
} }
pub fn commit(self) -> Result<u64> { pub fn commit(self) -> Result<Opstamp> {
info!("committing {}", self.opstamp); info!("committing {}", self.opstamp);
self.index_writer self.index_writer
.segment_updater() .segment_updater()

View File

@@ -4,21 +4,6 @@ use core::SegmentMeta;
use indexer::delete_queue::DeleteCursor; use indexer::delete_queue::DeleteCursor;
use std::fmt; use std::fmt;
/// Merge status of a segment.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum SegmentState {
    /// The segment is not taking part in a merge.
    Ready,
    /// The segment is currently part of a merge.
    InMerge,
}

impl SegmentState {
    /// Returns the one-letter code of this state:
    /// `'R'` for `Ready` and `'M'` for `InMerge`.
    pub fn letter_code(self) -> char {
        use self::SegmentState::{InMerge, Ready};
        match self {
            Ready => 'R',
            InMerge => 'M',
        }
    }
}
/// A segment entry describes the state of /// A segment entry describes the state of
/// a given segment, at a given instant. /// a given segment, at a given instant.
/// ///
@@ -35,7 +20,6 @@ impl SegmentState {
#[derive(Clone)] #[derive(Clone)]
pub struct SegmentEntry { pub struct SegmentEntry {
meta: SegmentMeta, meta: SegmentMeta,
state: SegmentState,
delete_bitset: Option<BitSet>, delete_bitset: Option<BitSet>,
delete_cursor: DeleteCursor, delete_cursor: DeleteCursor,
} }
@@ -49,7 +33,6 @@ impl SegmentEntry {
) -> SegmentEntry { ) -> SegmentEntry {
SegmentEntry { SegmentEntry {
meta: segment_meta, meta: segment_meta,
state: SegmentState::Ready,
delete_bitset, delete_bitset,
delete_cursor, delete_cursor,
} }
@@ -72,14 +55,6 @@ impl SegmentEntry {
&mut self.delete_cursor &mut self.delete_cursor
} }
/// Returns the `SegmentState` of this entry.
///
/// The state describes whether the segment is available for
/// a merge or not.
pub fn state(&self) -> SegmentState {
self.state
}
/// Returns the segment id. /// Returns the segment id.
pub fn segment_id(&self) -> SegmentId { pub fn segment_id(&self) -> SegmentId {
self.meta.id() self.meta.id()
@@ -89,33 +64,10 @@ impl SegmentEntry {
pub fn meta(&self) -> &SegmentMeta { pub fn meta(&self) -> &SegmentMeta {
&self.meta &self.meta
} }
/// Marks the `SegmentEntry` as being in a merge.
///
/// Only segments that are not already
/// in a merge are eligible for a future merge.
pub fn start_merge(&mut self) {
self.state = SegmentState::InMerge;
}
/// Cancels a merge.
///
/// If a merge fails, it is important to switch
/// the segment back to an idle state, so that it
/// may be eligible for future merges.
pub fn cancel_merge(&mut self) {
self.state = SegmentState::Ready;
}
/// Returns true iff the segment should
/// be considered for a merge, i.e. iff its
/// state is `SegmentState::Ready`.
pub fn is_ready(&self) -> bool {
self.state == SegmentState::Ready
}
} }
impl fmt::Debug for SegmentEntry { impl fmt::Debug for SegmentEntry {
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
write!(formatter, "SegmentEntry({:?}, {:?})", self.meta, self.state) write!(formatter, "SegmentEntry({:?})", self.meta)
} }
} }

View File

@@ -16,7 +16,6 @@ use Result as TantivyResult;
struct SegmentRegisters { struct SegmentRegisters {
uncommitted: SegmentRegister, uncommitted: SegmentRegister,
committed: SegmentRegister, committed: SegmentRegister,
writing: HashSet<SegmentId>,
} }
/// The segment manager stores the list of segments /// The segment manager stores the list of segments
@@ -41,12 +40,17 @@ impl Debug for SegmentManager {
} }
pub fn get_mergeable_segments( pub fn get_mergeable_segments(
in_merge_segment_ids: &HashSet<SegmentId>,
segment_manager: &SegmentManager, segment_manager: &SegmentManager,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) { ) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read(); let registers_lock = segment_manager.read();
( (
registers_lock.committed.get_mergeable_segments(), registers_lock
registers_lock.uncommitted.get_mergeable_segments(), .committed
.get_mergeable_segments(in_merge_segment_ids),
registers_lock
.uncommitted
.get_mergeable_segments(in_merge_segment_ids),
) )
} }
@@ -59,7 +63,6 @@ impl SegmentManager {
registers: RwLock::new(SegmentRegisters { registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(), uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas, delete_cursor), committed: SegmentRegister::new(segment_metas, delete_cursor),
writing: HashSet::new(),
}), }),
} }
} }
@@ -72,12 +75,6 @@ impl SegmentManager {
segment_entries segment_entries
} }
/// Counts every segment tracked by the `SegmentManager`,
/// committed and uncommitted alike.
pub fn num_segments(&self) -> usize {
    let registers = self.read();
    let committed_count = registers.committed.len();
    let uncommitted_count = registers.uncommitted.len();
    committed_count + uncommitted_count
}
/// List the files that are useful to the index. /// List the files that are useful to the index.
/// ///
/// This does not include lock files, or files that are obsolete /// This does not include lock files, or files that are obsolete
@@ -106,6 +103,21 @@ impl SegmentManager {
.expect("Failed to acquire write lock on SegmentManager.") .expect("Failed to acquire write lock on SegmentManager.")
} }
/// Drops every committed segment whose metadata reports zero documents.
fn remove_empty_segments(&self) {
    let mut registers_lock = self.write();
    // Take a snapshot of the committed entries first, so that the register
    // itself can be mutated while the snapshot is being walked.
    let committed_entries = registers_lock.committed.segment_entries();
    for segment_entry in committed_entries.iter() {
        if segment_entry.meta().num_docs() == 0 {
            registers_lock
                .committed
                .remove_segment(&segment_entry.segment_id());
        }
    }
}
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) { pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
let mut registers_lock = self.write(); let mut registers_lock = self.write();
registers_lock.committed.clear(); registers_lock.committed.clear();
@@ -121,25 +133,22 @@ impl SegmentManager {
/// the `segment_ids` are not either all committed or all /// the `segment_ids` are not either all committed or all
/// uncommitted. /// uncommitted.
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> { pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
let mut registers_lock = self.write(); let registers_lock = self.read();
let mut segment_entries = vec![]; let mut segment_entries = vec![];
if registers_lock.uncommitted.contains_all(segment_ids) { if registers_lock.uncommitted.contains_all(segment_ids) {
for segment_id in segment_ids { for segment_id in segment_ids {
let segment_entry = registers_lock.uncommitted let segment_entry = registers_lock.uncommitted
.start_merge(segment_id) .get(segment_id)
.expect("Segment id not found {}. Should never happen because of the contains all if-block."); .expect("Segment id not found {}. Should never happen because of the contains all if-block.");
segment_entries.push(segment_entry); segment_entries.push(segment_entry);
} }
} else if registers_lock.committed.contains_all(segment_ids) { } else if registers_lock.committed.contains_all(segment_ids) {
for segment_id in segment_ids { for segment_id in segment_ids {
let segment_entry = registers_lock.committed let segment_entry = registers_lock.committed
.start_merge(segment_id) .get(segment_id)
.expect("Segment id not found {}. Should never happen because of the contains all if-block."); .expect("Segment id not found {}. Should never happen because of the contains all if-block.");
segment_entries.push(segment_entry); segment_entries.push(segment_entry);
} }
for segment_id in segment_ids {
registers_lock.committed.start_merge(segment_id);
}
} else { } else {
let error_msg = "Merge operation sent for segments that are not \ let error_msg = "Merge operation sent for segments that are not \
all uncommited or commited." all uncommited or commited."
@@ -149,50 +158,8 @@ impl SegmentManager {
Ok(segment_entries) Ok(segment_entries)
} }
/// Cancels a merge.
///
/// Every segment listed in `before_merge_segment_ids` has its merge
/// cancelled in the register that holds them, and `after_merge_segment_id`
/// is removed from the `writing` set.
///
/// The source segments must all live in the same register (uncommitted or
/// committed). If neither register contains all of them, a warning is
/// logged and the method returns without changing anything.
pub fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId,
) {
let mut registers_lock = self.write();
// we mark all segments as ready for merge.
{
let target_segment_register: &mut SegmentRegister;
// Pick the register holding all of the source segments;
// the uncommitted register is checked first.
target_segment_register = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.committed
} else {
// Early exit: the `writing` set below is left untouched as well.
warn!("couldn't find segment in SegmentManager");
return;
}
};
for segment_id in before_merge_segment_ids {
target_segment_register.cancel_merge(segment_id);
}
}
// ... and we make sure the target segment entry
// can be garbage collected.
registers_lock.writing.remove(&after_merge_segment_id);
}
/// Records `segment_id` in the `writing` set.
///
/// NOTE(review): presumably this protects a segment that is still being
/// written from garbage collection; confirm against the code that
/// reads the `writing` set.
pub fn write_segment(&self, segment_id: SegmentId) {
let mut registers_lock = self.write();
registers_lock.writing.insert(segment_id);
}
pub fn add_segment(&self, segment_entry: SegmentEntry) { pub fn add_segment(&self, segment_entry: SegmentEntry) {
let mut registers_lock = self.write(); let mut registers_lock = self.write();
registers_lock.writing.remove(&segment_entry.segment_id());
registers_lock.uncommitted.add_segment_entry(segment_entry); registers_lock.uncommitted.add_segment_entry(segment_entry);
} }
@@ -202,10 +169,6 @@ impl SegmentManager {
after_merge_segment_entry: SegmentEntry, after_merge_segment_entry: SegmentEntry,
) { ) {
let mut registers_lock = self.write(); let mut registers_lock = self.write();
registers_lock
.writing
.remove(&after_merge_segment_entry.segment_id());
let target_register: &mut SegmentRegister = { let target_register: &mut SegmentRegister = {
if registers_lock if registers_lock
.uncommitted .uncommitted
@@ -229,6 +192,7 @@ impl SegmentManager {
} }
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> { pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
self.remove_empty_segments();
let registers_lock = self.read(); let registers_lock = self.read();
registers_lock.committed.segment_metas() registers_lock.committed.segment_metas()
} }

View File

@@ -3,6 +3,7 @@ use core::SegmentMeta;
use indexer::delete_queue::DeleteCursor; use indexer::delete_queue::DeleteCursor;
use indexer::segment_entry::SegmentEntry; use indexer::segment_entry::SegmentEntry;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::{self, Debug, Formatter}; use std::fmt::{self, Debug, Formatter};
/// The segment register keeps track /// The segment register keeps track
@@ -21,8 +22,8 @@ pub struct SegmentRegister {
impl Debug for SegmentRegister { impl Debug for SegmentRegister {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "SegmentRegister(")?; write!(f, "SegmentRegister(")?;
for (k, v) in &self.segment_states { for k in self.segment_states.keys() {
write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code())?; write!(f, "{}, ", k.short_uuid_string())?;
} }
write!(f, ")")?; write!(f, ")")?;
Ok(()) Ok(())
@@ -34,14 +35,13 @@ impl SegmentRegister {
self.segment_states.clear(); self.segment_states.clear();
} }
pub fn len(&self) -> usize { pub fn get_mergeable_segments(
self.segment_states.len() &self,
} in_merge_segment_ids: &HashSet<SegmentId>,
) -> Vec<SegmentMeta> {
pub fn get_mergeable_segments(&self) -> Vec<SegmentMeta> {
self.segment_states self.segment_states
.values() .values()
.filter(|segment_entry| segment_entry.is_ready()) .filter(|segment_entry| !in_merge_segment_ids.contains(&segment_entry.segment_id()))
.map(|segment_entry| segment_entry.meta().clone()) .map(|segment_entry| segment_entry.meta().clone())
.collect() .collect()
} }
@@ -56,11 +56,11 @@ impl SegmentRegister {
.values() .values()
.map(|segment_entry| segment_entry.meta().clone()) .map(|segment_entry| segment_entry.meta().clone())
.collect(); .collect();
segment_ids.sort_by_key(|meta| meta.id()); segment_ids.sort_by_key(SegmentMeta::id);
segment_ids segment_ids
} }
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool { pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {
segment_ids segment_ids
.iter() .iter()
.all(|segment_id| self.segment_states.contains_key(segment_id)) .all(|segment_id| self.segment_states.contains_key(segment_id))
@@ -75,20 +75,8 @@ impl SegmentRegister {
self.segment_states.remove(segment_id); self.segment_states.remove(segment_id);
} }
pub fn cancel_merge(&mut self, segment_id: &SegmentId) { pub fn get(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
self.segment_states self.segment_states.get(segment_id).cloned()
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.cancel_merge();
}
pub fn start_merge(&mut self, segment_id: &SegmentId) -> Option<SegmentEntry> {
if let Some(segment_entry) = self.segment_states.get_mut(segment_id) {
segment_entry.start_merge();
Some(segment_entry.clone())
} else {
None
}
} }
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister { pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
@@ -100,11 +88,6 @@ impl SegmentRegister {
} }
SegmentRegister { segment_states } SegmentRegister { segment_states }
} }
/// Test-only accessor: returns a clone of the entry registered under
/// `segment_id`, or `None` if the register does not hold that segment.
#[cfg(test)]
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
self.segment_states.get(segment_id).cloned()
}
} }
#[cfg(test)] #[cfg(test)]
@@ -113,7 +96,6 @@ mod tests {
use core::SegmentId; use core::SegmentId;
use core::SegmentMeta; use core::SegmentMeta;
use indexer::delete_queue::*; use indexer::delete_queue::*;
use indexer::SegmentState;
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> { fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
segment_register segment_register
@@ -137,42 +119,12 @@ mod tests {
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None); let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry); segment_register.add_segment_entry(segment_entry);
} }
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready
);
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]); assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
{ {
let segment_meta = SegmentMeta::new(segment_id_b, 0u32); let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None); let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry); segment_register.add_segment_entry(segment_entry);
} }
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready
);
segment_register.start_merge(&segment_id_a);
segment_register.start_merge(&segment_id_b);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge
);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge
);
segment_register.remove_segment(&segment_id_a); segment_register.remove_segment(&segment_id_a);
segment_register.remove_segment(&segment_id_b); segment_register.remove_segment(&segment_id_b);
{ {

View File

@@ -16,9 +16,10 @@ use futures_cpupool::CpuFuture;
use futures_cpupool::CpuPool; use futures_cpupool::CpuPool;
use indexer::delete_queue::DeleteCursor; use indexer::delete_queue::DeleteCursor;
use indexer::index_writer::advance_deletes; use indexer::index_writer::advance_deletes;
use indexer::merge_operation::MergeOperationInventory;
use indexer::merger::IndexMerger; use indexer::merger::IndexMerger;
use indexer::stamper::Stamper; use indexer::stamper::Stamper;
use indexer::MergeCandidate; use indexer::MergeOperation;
use indexer::SegmentEntry; use indexer::SegmentEntry;
use indexer::SegmentSerializer; use indexer::SegmentSerializer;
use indexer::{DefaultMergePolicy, MergePolicy}; use indexer::{DefaultMergePolicy, MergePolicy};
@@ -26,6 +27,7 @@ use schema::Schema;
use serde_json; use serde_json;
use std::borrow::BorrowMut; use std::borrow::BorrowMut;
use std::collections::HashMap; use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Write; use std::io::Write;
use std::mem; use std::mem;
use std::ops::DerefMut; use std::ops::DerefMut;
@@ -34,6 +36,7 @@ use std::sync::Arc;
use std::sync::RwLock; use std::sync::RwLock;
use std::thread; use std::thread;
use std::thread::JoinHandle; use std::thread::JoinHandle;
use Opstamp;
use Result; use Result;
/// Save the index meta file. /// Save the index meta file.
@@ -45,33 +48,30 @@ use Result;
/// and flushed. /// and flushed.
/// ///
/// This method is not part of tantivy's public API /// This method is not part of tantivy's public API
pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> { pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
save_metas(vec![], schema, opstamp, None, directory) save_metas(
&IndexMeta {
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)
} }
/// Save the index meta file. /// Save the index meta file.
/// This operation is atomic: /// This operation is atomic:
/// Either /// Either
// - it fails, in which case an error is returned, /// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched, /// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written /// - it succeeds, and `meta.json` is written
/// and flushed. /// and flushed.
/// ///
/// This method is not part of tantivy's public API /// This method is not part of tantivy's public API
pub fn save_metas( fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
segment_metas: Vec<SegmentMeta>, let mut buffer = serde_json::to_vec_pretty(metas)?;
schema: Schema, // Just adding a new line at the end of the buffer.
opstamp: u64,
payload: Option<String>,
directory: &mut Directory,
) -> Result<()> {
let metas = IndexMeta {
segments: segment_metas,
schema,
opstamp,
payload,
};
let mut buffer = serde_json::to_vec_pretty(&metas)?;
writeln!(&mut buffer)?; writeln!(&mut buffer)?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?; directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas)); debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
@@ -83,16 +83,21 @@ pub fn save_metas(
// //
// All this processing happens on a single thread // All this processing happens on a single thread
// consuming a common queue. // consuming a common queue.
//
// We voluntarily pass a merge_operation ref to guarantee that
// the merge_operation is alive during the process
#[derive(Clone)] #[derive(Clone)]
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>); pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
fn perform_merge( fn perform_merge(
merge_operation: &MergeOperation,
index: &Index, index: &Index,
mut segment_entries: Vec<SegmentEntry>, mut segment_entries: Vec<SegmentEntry>,
mut merged_segment: Segment,
target_opstamp: u64,
) -> Result<SegmentEntry> { ) -> Result<SegmentEntry> {
let target_opstamp = merge_operation.target_opstamp();
// first we need to apply deletes to our segment. // first we need to apply deletes to our segment.
let mut merged_segment = index.new_segment();
// TODO add logging // TODO add logging
let schema = index.schema(); let schema = index.schema();
@@ -126,19 +131,27 @@ fn perform_merge(
} }
struct InnerSegmentUpdater { struct InnerSegmentUpdater {
// we keep a copy of the current active IndexMeta to
// avoid loading the file every time we need it in the
// `SegmentUpdater`.
//
// This should be up to date as all updates happen through
// the unique active `SegmentUpdater`.
active_metas: RwLock<Arc<IndexMeta>>,
pool: CpuPool, pool: CpuPool,
index: Index, index: Index,
segment_manager: SegmentManager, segment_manager: SegmentManager,
merge_policy: RwLock<Box<MergePolicy>>, merge_policy: RwLock<Arc<Box<MergePolicy>>>,
merging_thread_id: AtomicUsize, merging_thread_id: AtomicUsize,
merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>, merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
generation: AtomicUsize, generation: AtomicUsize,
killed: AtomicBool, killed: AtomicBool,
stamper: Stamper, stamper: Stamper,
merge_operations: MergeOperationInventory,
} }
impl SegmentUpdater { impl SegmentUpdater {
pub fn new( pub fn create(
index: Index, index: Index,
stamper: Stamper, stamper: Stamper,
delete_cursor: &DeleteCursor, delete_cursor: &DeleteCursor,
@@ -149,32 +162,29 @@ impl SegmentUpdater {
.name_prefix("segment_updater") .name_prefix("segment_updater")
.pool_size(1) .pool_size(1)
.create(); .create();
let index_meta = index.load_metas()?;
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater { Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
active_metas: RwLock::new(Arc::new(index_meta)),
pool, pool,
index, index,
segment_manager, segment_manager,
merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())), merge_policy: RwLock::new(Arc::new(Box::new(DefaultMergePolicy::default()))),
merging_thread_id: AtomicUsize::default(), merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()), merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(), generation: AtomicUsize::default(),
killed: AtomicBool::new(false), killed: AtomicBool::new(false),
stamper, stamper,
merge_operations: Default::default(),
}))) })))
} }
pub fn new_segment(&self) -> Segment { pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
let new_segment = self.0.index.new_segment(); self.0.merge_policy.read().unwrap().clone()
let segment_id = new_segment.id();
self.0.segment_manager.write_segment(segment_id);
new_segment
}
pub fn get_merge_policy(&self) -> Box<MergePolicy> {
self.0.merge_policy.read().unwrap().box_clone()
} }
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) { pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
*self.0.merge_policy.write().unwrap() = merge_policy; let arc_merge_policy = Arc::new(merge_policy);
*self.0.merge_policy.write().unwrap() = arc_merge_policy;
} }
fn get_merging_thread_id(&self) -> usize { fn get_merging_thread_id(&self) -> usize {
@@ -195,7 +205,8 @@ impl SegmentUpdater {
segment_updater.0.segment_manager.add_segment(segment_entry); segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options(); segment_updater.consider_merge_options();
true true
}).forget(); })
.forget();
true true
} else { } else {
false false
@@ -214,7 +225,7 @@ impl SegmentUpdater {
/// ///
/// The method returns copies of the segment entries, /// The method returns copies of the segment entries,
/// updated with the delete information. /// updated with the delete information.
fn purge_deletes(&self, target_opstamp: u64) -> Result<Vec<SegmentEntry>> { fn purge_deletes(&self, target_opstamp: Opstamp) -> Result<Vec<SegmentEntry>> {
let mut segment_entries = self.0.segment_manager.segment_entries(); let mut segment_entries = self.0.segment_manager.segment_entries();
for segment_entry in &mut segment_entries { for segment_entry in &mut segment_entries {
let segment = self.0.index.segment(segment_entry.meta().clone()); let segment = self.0.index.segment(segment_entry.meta().clone());
@@ -223,24 +234,43 @@ impl SegmentUpdater {
Ok(segment_entries) Ok(segment_entries)
} }
pub fn save_metas(&self, opstamp: u64, commit_message: Option<String>) { pub fn save_metas(&self, opstamp: Opstamp, commit_message: Option<String>) {
if self.is_alive() { if self.is_alive() {
let index = &self.0.index; let index = &self.0.index;
let directory = index.directory(); let directory = index.directory();
save_metas( let mut commited_segment_metas = self.0.segment_manager.committed_segment_metas();
self.0.segment_manager.committed_segment_metas(),
index.schema(), // We sort segment_readers by number of documents.
// This is a heuristic to make multithreading more efficient.
//
// This is not done at the searcher level because I had a strange
// use case in which I was dealing with a large static index,
// dispatched over 5 SSD drives.
//
// A `UnionDirectory` makes it possible to read from these
// 5 different drives and creates a meta.json on the fly.
// In order to optimize the throughput, it creates a lasagna of segments
// from the different drives.
//
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
let index_meta = IndexMeta {
segments: commited_segment_metas,
schema: index.schema(),
opstamp, opstamp,
commit_message, payload: commit_message,
directory.box_clone().borrow_mut(), };
).expect("Could not save metas."); save_metas(&index_meta, directory.box_clone().borrow_mut())
.expect("Could not save metas.");
self.store_meta(&index_meta);
} }
} }
pub fn garbage_collect_files(&self) -> Result<()> { pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| { self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec(); segment_updater.garbage_collect_files_exec();
}).wait() })
.wait()
} }
fn garbage_collect_files_exec(&self) { fn garbage_collect_files_exec(&self) {
@@ -251,7 +281,7 @@ impl SegmentUpdater {
.garbage_collect(|| self.0.segment_manager.list_files()); .garbage_collect(|| self.0.segment_manager.list_files());
} }
pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> { pub fn commit(&self, opstamp: Opstamp, payload: Option<String>) -> Result<()> {
self.run_async(move |segment_updater| { self.run_async(move |segment_updater| {
if segment_updater.is_alive() { if segment_updater.is_alive() {
let segment_entries = segment_updater let segment_entries = segment_updater
@@ -262,54 +292,67 @@ impl SegmentUpdater {
segment_updater.garbage_collect_files_exec(); segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options(); segment_updater.consider_merge_options();
} }
}).wait() })
.wait()
} }
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> { pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
//let future_merged_segment = */ let commit_opstamp = self.load_metas().opstamp;
let segment_ids_vec = segment_ids.to_vec(); let merge_operation = MergeOperation::new(
self.run_async(move |segment_updater| { &self.0.merge_operations,
segment_updater.start_merge_impl(&segment_ids_vec[..]) commit_opstamp,
}).wait()? segment_ids.to_vec(),
);
self.run_async(move |segment_updater| segment_updater.start_merge_impl(merge_operation))
.wait()?
}
fn store_meta(&self, index_meta: &IndexMeta) {
*self.0.active_metas.write().unwrap() = Arc::new(index_meta.clone());
}
fn load_metas(&self) -> Arc<IndexMeta> {
self.0.active_metas.read().unwrap().clone()
} }
// `segment_ids` is required to be non-empty. // `segment_ids` is required to be non-empty.
fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> { fn start_merge_impl(&self, merge_operation: MergeOperation) -> Result<Receiver<SegmentMeta>> {
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty."); assert!(
!merge_operation.segment_ids().is_empty(),
"Segment_ids cannot be empty."
);
let segment_updater_clone = self.clone(); let segment_updater_clone = self.clone();
let segment_entries: Vec<SegmentEntry> = self.0.segment_manager.start_merge(segment_ids)?; let segment_entries: Vec<SegmentEntry> = self
.0
.segment_manager
.start_merge(merge_operation.segment_ids())?;
let segment_ids_vec = segment_ids.to_vec(); // let segment_ids_vec = merge_operation.segment_ids.to_vec();
let merging_thread_id = self.get_merging_thread_id(); let merging_thread_id = self.get_merging_thread_id();
info!( info!(
"Starting merge thread #{} - {:?}", "Starting merge thread #{} - {:?}",
merging_thread_id, segment_ids merging_thread_id,
merge_operation.segment_ids()
); );
let (merging_future_send, merging_future_recv) = oneshot(); let (merging_future_send, merging_future_recv) = oneshot();
let target_opstamp = self.0.stamper.stamp();
// first we need to apply deletes to our segment. // first we need to apply deletes to our segment.
let merging_join_handle = thread::Builder::new() let merging_join_handle = thread::Builder::new()
.name(format!("mergingthread-{}", merging_thread_id)) .name(format!("mergingthread-{}", merging_thread_id))
.spawn(move || { .spawn(move || {
// first we need to apply deletes to our segment. // first we need to apply deletes to our segment.
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge( let merge_result = perform_merge(
&merge_operation,
&segment_updater_clone.0.index, &segment_updater_clone.0.index,
segment_entries, segment_entries,
merged_segment,
target_opstamp,
); );
match merge_result { match merge_result {
Ok(after_merge_segment_entry) => { Ok(after_merge_segment_entry) => {
let merged_segment_meta = after_merge_segment_entry.meta().clone(); let merged_segment_meta = after_merge_segment_entry.meta().clone();
segment_updater_clone segment_updater_clone
.end_merge(segment_ids_vec, after_merge_segment_entry) .end_merge(merge_operation, after_merge_segment_entry)
.expect("Segment updater thread is corrupted."); .expect("Segment updater thread is corrupted.");
// the future may fail if the listener of the oneshot future // the future may fail if the listener of the oneshot future
@@ -320,13 +363,18 @@ impl SegmentUpdater {
let _merging_future_res = merging_future_send.send(merged_segment_meta); let _merging_future_res = merging_future_send.send(merged_segment_meta);
} }
Err(e) => { Err(e) => {
warn!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e); warn!(
"Merge of {:?} was cancelled: {:?}",
merge_operation.segment_ids(),
e
);
// ... cancel merge // ... cancel merge
if cfg!(test) { if cfg!(test) {
panic!("Merge failed."); panic!("Merge failed.");
} }
segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id); // As `merge_operation` will be dropped, the segment in merge state will
// merging_future_send will be dropped, sending an error to the future. // be available for merge again.
// `merging_future_send` will be dropped, sending an error to the future.
} }
} }
segment_updater_clone segment_updater_clone
@@ -336,7 +384,8 @@ impl SegmentUpdater {
.unwrap() .unwrap()
.remove(&merging_thread_id); .remove(&merging_thread_id);
Ok(()) Ok(())
}).expect("Failed to spawn a thread."); })
.expect("Failed to spawn a thread.");
self.0 self.0
.merging_threads .merging_threads
.write() .write()
@@ -346,16 +395,35 @@ impl SegmentUpdater {
} }
fn consider_merge_options(&self) { fn consider_merge_options(&self) {
let merge_segment_ids: HashSet<SegmentId> = self.0.merge_operations.segment_in_merge();
let (committed_segments, uncommitted_segments) = let (committed_segments, uncommitted_segments) =
get_mergeable_segments(&self.0.segment_manager); get_mergeable_segments(&merge_segment_ids, &self.0.segment_manager);
// Committed segments cannot be merged with uncommitted_segments. // Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently. // We therefore consider merges using these two sets of segments independently.
let merge_policy = self.get_merge_policy(); let merge_policy = self.get_merge_policy();
let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments);
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments); let current_opstamp = self.0.stamper.stamp();
merge_candidates.extend_from_slice(&committed_merge_candidates[..]); let mut merge_candidates: Vec<MergeOperation> = merge_policy
for MergeCandidate(segment_metas) in merge_candidates { .compute_merge_candidates(&uncommitted_segments)
match self.start_merge_impl(&segment_metas) { .into_iter()
.map(|merge_candidate| {
MergeOperation::new(&self.0.merge_operations, current_opstamp, merge_candidate.0)
})
.collect();
let commit_opstamp = self.load_metas().opstamp;
let committed_merge_candidates = merge_policy
.compute_merge_candidates(&committed_segments)
.into_iter()
.map(|merge_candidate| {
MergeOperation::new(&self.0.merge_operations, commit_opstamp, merge_candidate.0)
})
.collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter());
for merge_operation in merge_candidates {
match self.start_merge_impl(merge_operation) {
Ok(merge_future) => { Ok(merge_future) => {
if let Err(e) = merge_future.fuse().poll() { if let Err(e) = merge_future.fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e); error!("The merge task failed quickly after starting: {:?}", e);
@@ -371,31 +439,16 @@ impl SegmentUpdater {
} }
} }
fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentId,
) {
self.0
.segment_manager
.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
}
fn end_merge( fn end_merge(
&self, &self,
before_merge_segment_ids: Vec<SegmentId>, merge_operation: MergeOperation,
mut after_merge_segment_entry: SegmentEntry, mut after_merge_segment_entry: SegmentEntry,
) -> Result<()> { ) -> Result<()> {
self.run_async(move |segment_updater| { self.run_async(move |segment_updater| {
info!("End merge {:?}", after_merge_segment_entry.meta()); info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone(); let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() { if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater let committed_opstamp = segment_updater.load_metas().opstamp;
.0
.index
.load_metas()
.expect("Failed to read opstamp")
.opstamp;
if delete_operation.opstamp < committed_opstamp { if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index; let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone()); let segment = index.segment(after_merge_segment_entry.meta().clone());
@@ -404,16 +457,15 @@ impl SegmentUpdater {
{ {
error!( error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}", "Merge of {:?} was cancelled (advancing deletes failed): {:?}",
before_merge_segment_ids, e merge_operation.segment_ids(),
e
); );
// ... cancel merge
if cfg!(test) { if cfg!(test) {
panic!("Merge failed."); panic!("Merge failed.");
} }
segment_updater.cancel_merge( // ... cancel merge
&before_merge_segment_ids, // `merge_operations` are tracked. As it is dropped, the
after_merge_segment_entry.segment_id(), // segment_ids will be available again for merge.
);
return; return;
} }
} }
@@ -421,13 +473,14 @@ impl SegmentUpdater {
segment_updater segment_updater
.0 .0
.segment_manager .segment_manager
.end_merge(&before_merge_segment_ids, after_merge_segment_entry); .end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
segment_updater.consider_merge_options(); segment_updater.consider_merge_options();
info!("save metas"); info!("save metas");
let previous_metas = segment_updater.0.index.load_metas().unwrap(); let previous_metas = segment_updater.load_metas();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload); segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
segment_updater.garbage_collect_files_exec(); segment_updater.garbage_collect_files_exec();
}).wait() })
.wait()
} }
/// Wait for current merging threads. /// Wait for current merging threads.
@@ -446,32 +499,25 @@ impl SegmentUpdater {
/// Obsolete files will eventually be cleaned up /// Obsolete files will eventually be cleaned up
/// by the directory garbage collector. /// by the directory garbage collector.
pub fn wait_merging_thread(&self) -> Result<()> { pub fn wait_merging_thread(&self) -> Result<()> {
let mut num_segments: usize;
loop { loop {
num_segments = self.0.segment_manager.num_segments(); let merging_threads: HashMap<usize, JoinHandle<Result<()>>> = {
let mut new_merging_threads = HashMap::new();
{
let mut merging_threads = self.0.merging_threads.write().unwrap(); let mut merging_threads = self.0.merging_threads.write().unwrap();
mem::swap(&mut new_merging_threads, merging_threads.deref_mut()); mem::replace(merging_threads.deref_mut(), HashMap::new())
};
if merging_threads.is_empty() {
return Ok(());
} }
debug!("wait merging thread {}", new_merging_threads.len()); debug!("wait merging thread {}", merging_threads.len());
for (_, merging_thread_handle) in new_merging_threads { for (_, merging_thread_handle) in merging_threads {
merging_thread_handle merging_thread_handle
.join() .join()
.map(|_| ()) .map(|_| ())
.map_err(|_| TantivyError::ErrorInThread("Merging thread failed.".into()))?; .map_err(|_| TantivyError::ErrorInThread("Merging thread failed.".into()))?;
} }
// Our merging thread may have queued their completed // Our merging thread may have queued their completed merged segment.
// Let's wait for that too.
self.run_async(move |_| {}).wait()?; self.run_async(move |_| {}).wait()?;
let new_num_segments = self.0.segment_manager.num_segments();
if new_num_segments >= num_segments {
break;
}
} }
Ok(())
} }
} }
@@ -484,14 +530,14 @@ mod tests {
#[test] #[test]
fn test_delete_during_merge() { fn test_delete_during_merge() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.set_merge_policy(Box::new(MergeWheneverPossible)); index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
{ {
@@ -521,9 +567,8 @@ mod tests {
index_writer.delete_term(term); index_writer.delete_term(term);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
let reader = index.reader().unwrap();
index.load_searchers().unwrap(); assert_eq!(reader.searcher().num_docs(), 302);
assert_eq!(index.searcher().num_docs(), 302);
{ {
index_writer index_writer
@@ -531,8 +576,79 @@ mod tests {
.expect("waiting for merging threads"); .expect("waiting for merging threads");
} }
index.load_searchers().unwrap(); reader.reload().unwrap();
assert_eq!(index.searcher().segment_readers().len(), 1); assert_eq!(reader.searcher().segment_readers().len(), 1);
assert_eq!(index.searcher().num_docs(), 302); assert_eq!(reader.searcher().num_docs(), 302);
}
#[test]
fn delete_all_docs() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
assert!(index_writer.commit().is_ok());
}
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"));
}
assert!(index_writer.commit().is_ok());
}
{
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
assert!(index_writer.commit().is_ok());
}
{
let seg_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
// docs exist, should have at least 1 segment
assert!(seg_ids.len() > 0);
}
{
let term_vals = vec!["a", "b", "c", "d", "e", "f"];
for term_val in term_vals {
let term = Term::from_field_text(text_field, term_val);
index_writer.delete_term(term);
assert!(index_writer.commit().is_ok());
}
}
{
index_writer
.wait_merging_threads()
.expect("waiting for merging threads");
}
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let seg_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
assert!(seg_ids.is_empty());
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
// empty segments should be erased
assert!(index.searchable_segment_metas().unwrap().is_empty());
assert!(reader.searcher().segment_readers().is_empty());
} }
} }

View File

@@ -5,6 +5,7 @@ use fastfield::FastFieldsWriter;
use fieldnorm::FieldNormsWriter; use fieldnorm::FieldNormsWriter;
use indexer::segment_serializer::SegmentSerializer; use indexer::segment_serializer::SegmentSerializer;
use postings::MultiFieldPostingsWriter; use postings::MultiFieldPostingsWriter;
use schema::FieldEntry;
use schema::FieldType; use schema::FieldType;
use schema::Schema; use schema::Schema;
use schema::Term; use schema::Term;
@@ -15,6 +16,7 @@ use tokenizer::BoxedTokenizer;
use tokenizer::FacetTokenizer; use tokenizer::FacetTokenizer;
use tokenizer::{TokenStream, Tokenizer}; use tokenizer::{TokenStream, Tokenizer};
use DocId; use DocId;
use Opstamp;
use Result; use Result;
/// A `SegmentWriter` is in charge of creating segment index from a /// A `SegmentWriter` is in charge of creating segment index from a
@@ -28,7 +30,7 @@ pub struct SegmentWriter {
segment_serializer: SegmentSerializer, segment_serializer: SegmentSerializer,
fast_field_writers: FastFieldsWriter, fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FieldNormsWriter, fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<u64>, doc_opstamps: Vec<Opstamp>,
tokenizers: Vec<Option<Box<BoxedTokenizer>>>, tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
} }
@@ -53,7 +55,7 @@ impl SegmentWriter {
schema schema
.fields() .fields()
.iter() .iter()
.map(|field_entry| field_entry.field_type()) .map(FieldEntry::field_type)
.map(|field_type| match *field_type { .map(|field_type| match *field_type {
FieldType::Str(ref text_options) => text_options FieldType::Str(ref text_options) => text_options
.get_indexing_options() .get_indexing_options()
@@ -62,7 +64,8 @@ impl SegmentWriter {
segment.index().tokenizers().get(tokenizer_name) segment.index().tokenizers().get(tokenizer_name)
}), }),
_ => None, _ => None,
}).collect(); })
.collect();
Ok(SegmentWriter { Ok(SegmentWriter {
max_doc: 0, max_doc: 0,
multifield_postings, multifield_postings,
@@ -110,18 +113,18 @@ impl SegmentWriter {
} }
match *field_options.field_type() { match *field_options.field_type() {
FieldType::HierarchicalFacet => { FieldType::HierarchicalFacet => {
let facets: Vec<&[u8]> = field_values let facets: Vec<&str> = field_values
.iter() .iter()
.flat_map(|field_value| match *field_value.value() { .flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_bytes()), Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => { _ => {
panic!("Expected hierarchical facet"); panic!("Expected hierarchical facet");
} }
}).collect(); })
.collect();
let mut term = Term::for_field(field); // we set the Term let mut term = Term::for_field(field); // we set the Term
for facet_bytes in facets { for fake_str in facets {
let mut unordered_term_id_opt = None; let mut unordered_term_id_opt = None;
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
FacetTokenizer.token_stream(fake_str).process(&mut |token| { FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text); term.set_text(&token.text);
let unordered_term_id = let unordered_term_id =
@@ -145,7 +148,8 @@ impl SegmentWriter {
.flat_map(|field_value| match *field_value.value() { .flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()), Value::Str(ref text) => Some(text.as_str()),
_ => None, _ => None,
}).collect(); })
.collect();
if texts.is_empty() { if texts.is_empty() {
0 0
} else { } else {
@@ -169,6 +173,17 @@ impl SegmentWriter {
} }
} }
} }
FieldType::Date(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(
field_value.field(),
field_value.value().date_value().timestamp(),
);
self.multifield_postings.subscribe(doc_id, &term);
}
}
}
FieldType::I64(ref int_option) => { FieldType::I64(ref int_option) => {
if int_option.is_indexed() { if int_option.is_indexed() {
for field_value in field_values { for field_value in field_values {

View File

@@ -1,51 +1,35 @@
// AtomicU64 have not landed in stable. use std::ops::Range;
// For the moment let's just use AtomicUsize on use std::sync::atomic::{AtomicU64, Ordering};
// x86/64 bit platform, and a mutex on other platform. use std::sync::Arc;
use Opstamp;
#[cfg(target = "x86_64")] /// Stamper provides Opstamps, which is just an auto-increment id to label
mod archicture_impl { /// an operation.
///
/// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64>);
use std::sync::atomic::{AtomicUsize, Ordering}; impl Stamper {
use std::sync::Arc; pub fn new(first_opstamp: Opstamp) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
}
#[derive(Clone, Default)] pub fn stamp(&self) -> Opstamp {
pub struct Stamper(Arc<AtomicU64>); self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
impl Stamper { /// Given a desired count `n`, `stamps` returns an iterator that
pub fn new(first_opstamp: u64) -> Stamper { /// will supply `n` number of u64 stamps.
Stamper(Arc::new(AtomicU64::new(first_opstamp))) pub fn stamps(&self, n: u64) -> Range<Opstamp> {
} let start = self.0.fetch_add(n, Ordering::SeqCst);
Range {
pub fn stamp(&self) -> u64 { start,
self.0.fetch_add(1u64, Ordering::SeqCst) as u64 end: start + n,
} }
} }
} }
#[cfg(not(target = "x86_64"))]
mod archicture_impl {
use std::sync::{Arc, Mutex};
#[derive(Clone, Default)]
pub struct Stamper(Arc<Mutex<u64>>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(Mutex::new(first_opstamp)))
}
pub fn stamp(&self) -> u64 {
let mut guard = self.0.lock().expect("Failed to lock the stamper");
let previous_val = *guard;
*guard = previous_val + 1;
previous_val
}
}
}
pub use self::archicture_impl::Stamper;
#[cfg(test)] #[cfg(test)]
mod test { mod test {
@@ -62,5 +46,8 @@ mod test {
assert_eq!(stamper.stamp(), 10u64); assert_eq!(stamper.stamp(), 10u64);
assert_eq!(stamper_clone.stamp(), 11u64); assert_eq!(stamper_clone.stamp(), 11u64);
assert_eq!(stamper.stamps(3u64), (12..15));
assert_eq!(stamper.stamp(), 15u64);
} }
} }

View File

@@ -1,6 +1,5 @@
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")] #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![cfg_attr(all(feature = "unstable", test), feature(test))] #![cfg_attr(all(feature = "unstable", test), feature(test))]
#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))]
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))] #![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))] #![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)] #![warn(missing_docs)]
@@ -24,7 +23,8 @@
//! # use tempdir::TempDir; //! # use tempdir::TempDir;
//! # use tantivy::Index; //! # use tantivy::Index;
//! # use tantivy::schema::*; //! # use tantivy::schema::*;
//! # use tantivy::collector::TopCollector; //! # use tantivy::{Score, DocAddress};
//! # use tantivy::collector::TopDocs;
//! # use tantivy::query::QueryParser; //! # use tantivy::query::QueryParser;
//! # //! #
//! # fn main() { //! # fn main() {
@@ -46,7 +46,7 @@
//! // in a compressed, row-oriented key-value store. //! // in a compressed, row-oriented key-value store.
//! // This store is useful to reconstruct the //! // This store is useful to reconstruct the
//! // documents that were selected during the search phase. //! // documents that were selected during the search phase.
//! let mut schema_builder = SchemaBuilder::default(); //! let mut schema_builder = Schema::builder();
//! let title = schema_builder.add_text_field("title", TEXT | STORED); //! let title = schema_builder.add_text_field("title", TEXT | STORED);
//! let body = schema_builder.add_text_field("body", TEXT); //! let body = schema_builder.add_text_field("body", TEXT);
//! let schema = schema_builder.build(); //! let schema = schema_builder.build();
@@ -75,9 +75,9 @@
//! //!
//! // # Searching //! // # Searching
//! //!
//! index.load_searchers()?; //! let reader = index.reader()?;
//! //!
//! let searcher = index.searcher(); //! let searcher = reader.searcher();
//! //!
//! let query_parser = QueryParser::for_index(&index, vec![title, body]); //! let query_parser = QueryParser::for_index(&index, vec![title, body]);
//! //!
@@ -86,13 +86,13 @@
//! // A ticket has been opened regarding this problem. //! // A ticket has been opened regarding this problem.
//! let query = query_parser.parse_query("sea whale")?; //! let query = query_parser.parse_query("sea whale")?;
//! //!
//! let mut top_collector = TopCollector::with_limit(10); //! // Perform search.
//! searcher.search(&*query, &mut top_collector)?; //! // `top_docs` contains the 10 most relevant doc ids, sorted by decreasing scores...
//! let top_docs: Vec<(Score, DocAddress)> =
//! searcher.search(&query, &TopDocs::with_limit(10))?;
//! //!
//! // Our top collector now contains the 10 //! for (_score, doc_address) in top_docs {
//! // most relevant doc ids... //! // Retrieve the actual content of each document given its `doc_address`.
//! let doc_addresses = top_collector.docs();
//! for doc_address in doc_addresses {
//! let retrieved_doc = searcher.doc(doc_address)?; //! let retrieved_doc = searcher.doc(doc_address)?;
//! println!("{}", schema.to_json(&retrieved_doc)); //! println!("{}", schema.to_json(&retrieved_doc));
//! } //! }
@@ -129,31 +129,28 @@ extern crate base64;
extern crate bit_set; extern crate bit_set;
extern crate bitpacking; extern crate bitpacking;
extern crate byteorder; extern crate byteorder;
extern crate combine; extern crate combine;
extern crate crossbeam; extern crate crossbeam;
extern crate crossbeam_channel;
extern crate fnv; extern crate fnv;
extern crate fst;
extern crate fst_regex;
extern crate futures; extern crate futures;
extern crate futures_cpupool; extern crate futures_cpupool;
extern crate htmlescape; extern crate htmlescape;
extern crate itertools; extern crate itertools;
extern crate levenshtein_automata; extern crate levenshtein_automata;
#[cfg(feature = "mmap")]
extern crate memmap;
extern crate num_cpus; extern crate num_cpus;
extern crate owning_ref; extern crate owning_ref;
extern crate regex; extern crate regex;
extern crate rust_stemmers; extern crate rust_stemmers;
extern crate scoped_pool;
extern crate serde; extern crate serde;
extern crate stable_deref_trait; extern crate stable_deref_trait;
extern crate tantivy_fst;
extern crate tempdir; extern crate tempdir;
extern crate tempfile; extern crate tempfile;
extern crate uuid; extern crate uuid;
#[cfg(test)] #[cfg(test)]
#[macro_use] #[macro_use]
extern crate matches; extern crate matches;
@@ -172,11 +169,12 @@ extern crate maplit;
extern crate test; extern crate test;
#[macro_use] #[macro_use]
extern crate downcast; extern crate downcast_rs;
#[macro_use] #[macro_use]
extern crate fail; extern crate fail;
#[cfg(feature = "mmap")]
#[cfg(test)] #[cfg(test)]
mod functional_test; mod functional_test;
@@ -185,18 +183,19 @@ mod macros;
pub use error::TantivyError; pub use error::TantivyError;
#[deprecated( #[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
since = "0.7.0",
note = "please use `tantivy::TantivyError` instead"
)]
pub use error::TantivyError as Error; pub use error::TantivyError as Error;
extern crate census; extern crate census;
pub extern crate chrono;
extern crate owned_read; extern crate owned_read;
/// Tantivy result. /// Tantivy result.
pub type Result<T> = std::result::Result<T, error::TantivyError>; pub type Result<T> = std::result::Result<T, error::TantivyError>;
/// Tantivy DateTime
pub type DateTime = chrono::DateTime<chrono::Utc>;
mod common; mod common;
mod core; mod core;
mod indexer; mod indexer;
@@ -217,14 +216,17 @@ pub mod space_usage;
pub mod store; pub mod store;
pub mod termdict; pub mod termdict;
mod reader;
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy};
mod snippet; mod snippet;
pub use self::snippet::SnippetGenerator; pub use self::snippet::{Snippet, SnippetGenerator};
mod docset; mod docset;
pub use self::docset::{DocSet, SkipResult}; pub use self::docset::{DocSet, SkipResult};
pub use core::SegmentComponent; pub use core::SegmentComponent;
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta}; pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta, IndexMeta};
pub use core::{InvertedIndexReader, SegmentReader}; pub use core::{InvertedIndexReader, SegmentReader};
pub use directory::Directory; pub use directory::Directory;
pub use indexer::IndexWriter; pub use indexer::IndexWriter;
@@ -236,11 +238,7 @@ pub use common::{i64_to_u64, u64_to_i64};
/// Expose the current version of tantivy, as well /// Expose the current version of tantivy, as well
/// whether it was compiled with the simd compression. /// whether it was compiled with the simd compression.
pub fn version() -> &'static str { pub fn version() -> &'static str {
if cfg!(feature = "simdcompression") { env!("CARGO_PKG_VERSION")
concat!(env!("CARGO_PKG_VERSION"), "-simd")
} else {
concat!(env!("CARGO_PKG_VERSION"), "-nosimd")
}
} }
/// Defines tantivy's merging strategy /// Defines tantivy's merging strategy
@@ -256,6 +254,16 @@ pub mod merge_policy {
/// as they are added in the segment. /// as they are added in the segment.
pub type DocId = u32; pub type DocId = u32;
/// A u64 assigned to every operation incrementally
///
/// All operations modifying the index receive a monotonic Opstamp.
/// The resulting state of the index is consistent with the opstamp ordering.
///
/// For instance, a commit with opstamp `32_423` will reflect all Add and Delete operations
/// with an opstamp `<= 32_423`. A delete operation with opstamp n will not affect a document added
/// with opstamp `n+1`.
pub type Opstamp = u64;
/// A f32 that represents the relevance of the document to the query /// A f32 that represents the relevance of the document to the query
/// ///
/// This is modelled internally as a `f32`. The /// This is modelled internally as a `f32`. The
@@ -301,12 +309,15 @@ mod tests {
use docset::DocSet; use docset::DocSet;
use query::BooleanQuery; use query::BooleanQuery;
use rand::distributions::Bernoulli; use rand::distributions::Bernoulli;
use rand::distributions::Range; use rand::distributions::Uniform;
use rand::{Rng, SeedableRng, XorShiftRng}; use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use schema::*; use schema::*;
use DocAddress;
use Index; use Index;
use IndexWriter; use IndexWriter;
use Postings; use Postings;
use ReloadPolicy;
pub fn assert_nearly_equals(expected: f32, val: f32) { pub fn assert_nearly_equals(expected: f32, val: f32) {
assert!( assert!(
@@ -322,16 +333,15 @@ mod tests {
} }
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> { pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
let seed: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; let seed: [u8; 32] = [1; 32];
XorShiftRng::from_seed(seed) StdRng::from_seed(seed)
.sample_iter(&Range::new(0u32, max_value)) .sample_iter(&Uniform::new(0u32, max_value))
.take(n_elems) .take(n_elems)
.collect::<Vec<u32>>() .collect::<Vec<u32>>()
} }
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> { pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
let seed: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, seed_val]; StdRng::from_seed([seed_val; 32])
XorShiftRng::from_seed(seed)
.sample_iter(&Bernoulli::new(ratio)) .sample_iter(&Bernoulli::new(ratio))
.take(n as usize) .take(n as usize)
.enumerate() .enumerate()
@@ -346,13 +356,13 @@ mod tests {
#[test] #[test]
#[cfg(feature = "mmap")] #[cfg(feature = "mmap")]
fn test_indexing() { fn test_indexing() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap(); let index = Index::create_from_tempdir(schema).unwrap();
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
let doc = doc!(text_field=>"af b"); let doc = doc!(text_field=>"af b");
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -371,10 +381,10 @@ mod tests {
#[test] #[test]
fn test_docfreq1() { fn test_docfreq1() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
index_writer.add_document(doc!(text_field=>"a b c")); index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -396,8 +406,8 @@ mod tests {
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
{ {
index.load_searchers().unwrap(); let reader = index.reader().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a), 3); assert_eq!(searcher.doc_freq(&term_a), 3);
let term_b = Term::from_field_text(text_field, "b"); let term_b = Term::from_field_text(text_field, "b");
@@ -411,12 +421,12 @@ mod tests {
#[test] #[test]
fn test_fieldnorm_no_docs_with_field() { fn test_fieldnorm_no_docs_with_field() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let title_field = schema_builder.add_text_field("title", TEXT); let title_field = schema_builder.add_text_field("title", TEXT);
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
let doc = doc!(text_field=>"a b c"); let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -424,8 +434,8 @@ mod tests {
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
{ {
index.load_searchers().unwrap(); let index_reader = index.reader().unwrap();
let searcher = index.searcher(); let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
{ {
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field); let fieldnorm_reader = reader.get_fieldnorms_reader(text_field);
@@ -440,11 +450,11 @@ mod tests {
#[test] #[test]
fn test_fieldnorm() { fn test_fieldnorm() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
let doc = doc!(text_field=>"a b c"); let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -460,8 +470,8 @@ mod tests {
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
{ {
index.load_searchers().unwrap(); let reader = index.reader().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0); let segment_reader: &SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field); let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
assert_eq!(fieldnorms_reader.fieldnorm(0), 3); assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
@@ -481,7 +491,7 @@ mod tests {
#[test] #[test]
fn test_delete_postings1() { fn test_delete_postings1() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let term_abcd = Term::from_field_text(text_field, "abcd"); let term_abcd = Term::from_field_text(text_field, "abcd");
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
@@ -489,183 +499,151 @@ mod tests {
let term_c = Term::from_field_text(text_field, "c"); let term_c = Term::from_field_text(text_field, "c");
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ // 0
// 0 index_writer.add_document(doc!(text_field=>"a b"));
let doc = doc!(text_field=>"a b"); // 1
index_writer.add_document(doc); index_writer.add_document(doc!(text_field=>" a c"));
} // 2
{ index_writer.add_document(doc!(text_field=>" b c"));
// 1 // 3
let doc = doc!(text_field=>" a c"); index_writer.add_document(doc!(text_field=>" b d"));
index_writer.add_document(doc);
} index_writer.delete_term(Term::from_field_text(text_field, "c"));
{ index_writer.delete_term(Term::from_field_text(text_field, "a"));
// 2 // 4
let doc = doc!(text_field=>" b c"); index_writer.add_document(doc!(text_field=>" b c"));
index_writer.add_document(doc); // 5
} index_writer.add_document(doc!(text_field=>" a"));
{
// 3
let doc = doc!(text_field=>" b d");
index_writer.add_document(doc);
}
{
index_writer.delete_term(Term::from_field_text(text_field, "c"));
}
{
index_writer.delete_term(Term::from_field_text(text_field, "a"));
}
{
// 4
let doc = doc!(text_field=>" b c");
index_writer.add_document(doc);
}
{
// 5
let doc = doc!(text_field=>" a");
index_writer.add_document(doc);
}
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
{ {
index.load_searchers().unwrap(); reader.reload().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field); let inverted_index = segment_reader.inverted_index(text_field);
assert!( assert!(inverted_index
inverted_index .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions) .is_none());
.is_none()
);
{ {
let mut postings = inverted_index let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap(); .unwrap();
assert!(advance_undeleted(&mut postings, reader)); assert!(advance_undeleted(&mut postings, segment_reader));
assert_eq!(postings.doc(), 5); assert_eq!(postings.doc(), 5);
assert!(!advance_undeleted(&mut postings, reader)); assert!(!advance_undeleted(&mut postings, segment_reader));
} }
{ {
let mut postings = inverted_index let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap(); .unwrap();
assert!(advance_undeleted(&mut postings, reader)); assert!(advance_undeleted(&mut postings, segment_reader));
assert_eq!(postings.doc(), 3); assert_eq!(postings.doc(), 3);
assert!(advance_undeleted(&mut postings, reader)); assert!(advance_undeleted(&mut postings, segment_reader));
assert_eq!(postings.doc(), 4); assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, reader)); assert!(!advance_undeleted(&mut postings, segment_reader));
} }
} }
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ // 0
// 0 index_writer.add_document(doc!(text_field=>"a b"));
let doc = doc!(text_field=>"a b"); // 1
index_writer.add_document(doc); index_writer.delete_term(Term::from_field_text(text_field, "c"));
}
{
// 1
index_writer.delete_term(Term::from_field_text(text_field, "c"));
}
index_writer.rollback().unwrap(); index_writer.rollback().unwrap();
} }
{ {
index.load_searchers().unwrap(); reader.reload().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let reader = searcher.segment_reader(0); let seg_reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field()); let inverted_index = seg_reader.inverted_index(term_abcd.field());
assert!( assert!(inverted_index
inverted_index .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions) .is_none());
.is_none()
);
{ {
let mut postings = inverted_index let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap(); .unwrap();
assert!(advance_undeleted(&mut postings, reader)); assert!(advance_undeleted(&mut postings, seg_reader));
assert_eq!(postings.doc(), 5); assert_eq!(postings.doc(), 5);
assert!(!advance_undeleted(&mut postings, reader)); assert!(!advance_undeleted(&mut postings, seg_reader));
} }
{ {
let mut postings = inverted_index let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap(); .unwrap();
assert!(advance_undeleted(&mut postings, reader)); assert!(advance_undeleted(&mut postings, seg_reader));
assert_eq!(postings.doc(), 3); assert_eq!(postings.doc(), 3);
assert!(advance_undeleted(&mut postings, reader)); assert!(advance_undeleted(&mut postings, seg_reader));
assert_eq!(postings.doc(), 4); assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, reader)); assert!(!advance_undeleted(&mut postings, seg_reader));
} }
} }
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ index_writer.add_document(doc!(text_field=>"a b"));
let doc = doc!(text_field=>"a b"); index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(doc);
}
{
index_writer.delete_term(Term::from_field_text(text_field, "c"));
}
index_writer.rollback().unwrap(); index_writer.rollback().unwrap();
index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
{ {
index.load_searchers().unwrap(); reader.reload().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field()); let inverted_index = segment_reader.inverted_index(term_abcd.field());
assert!( assert!(inverted_index
inverted_index .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions) .is_none());
.is_none()
);
{ {
let mut postings = inverted_index let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap(); .unwrap();
assert!(!advance_undeleted(&mut postings, reader)); assert!(!advance_undeleted(&mut postings, segment_reader));
} }
{ {
let mut postings = inverted_index let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap(); .unwrap();
assert!(advance_undeleted(&mut postings, reader)); assert!(advance_undeleted(&mut postings, segment_reader));
assert_eq!(postings.doc(), 3); assert_eq!(postings.doc(), 3);
assert!(advance_undeleted(&mut postings, reader)); assert!(advance_undeleted(&mut postings, segment_reader));
assert_eq!(postings.doc(), 4); assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, reader)); assert!(!advance_undeleted(&mut postings, segment_reader));
} }
{ {
let mut postings = inverted_index let mut postings = inverted_index
.read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)
.unwrap(); .unwrap();
assert!(advance_undeleted(&mut postings, reader)); assert!(advance_undeleted(&mut postings, segment_reader));
assert_eq!(postings.doc(), 4); assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, reader)); assert!(!advance_undeleted(&mut postings, segment_reader));
} }
} }
} }
#[test] #[test]
fn test_indexed_u64() { fn test_indexed_u64() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("value", INT_INDEXED); let field = schema_builder.add_u64_field("value", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(field=>1u64)); index_writer.add_document(doc!(field=>1u64));
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); let reader = index.reader().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let term = Term::from_field_u64(field, 1u64); let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher let mut postings = searcher
.segment_reader(0) .segment_reader(0)
@@ -679,17 +657,17 @@ mod tests {
#[test] #[test]
fn test_indexed_i64() { fn test_indexed_i64() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_i64_field("value", INT_INDEXED); let value_field = schema_builder.add_i64_field("value", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let negative_val = -1i64; let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val)); index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); let reader = index.reader().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let term = Term::from_field_i64(value_field, negative_val); let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher let mut postings = searcher
.segment_reader(0) .segment_reader(0)
@@ -703,29 +681,34 @@ mod tests {
#[test] #[test]
fn test_indexedfield_not_in_documents() { fn test_indexedfield_not_in_documents() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let absent_field = schema_builder.add_text_field("text", TEXT); let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a")); index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
assert!(index.load_searchers().is_ok()); let reader = index.reader().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
segment_reader.inverted_index(absent_field); //< should not panic segment_reader.inverted_index(absent_field); //< should not panic
} }
#[test] #[test]
fn test_delete_postings2() { fn test_delete_postings2() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
let add_document = |index_writer: &mut IndexWriter, val: &'static str| { let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
let doc = doc!(text_field=>val); let doc = doc!(text_field=>val);
@@ -748,20 +731,20 @@ mod tests {
remove_document(&mut index_writer, "38"); remove_document(&mut index_writer, "38");
remove_document(&mut index_writer, "34"); remove_document(&mut index_writer, "34");
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); reader.reload().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 6); assert_eq!(searcher.num_docs(), 6);
} }
#[test] #[test]
fn test_termfreq() { fn test_termfreq() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
let doc = doc!(text_field=>"af af af bc bc"); let doc = doc!(text_field=>"af af af bc bc");
index_writer.add_document(doc); index_writer.add_document(doc);
@@ -769,16 +752,14 @@ mod tests {
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
{ {
index.load_searchers().unwrap(); let index_reader = index.reader().unwrap();
let searcher = index.searcher(); let searcher = index_reader.searcher();
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field); let inverted_index = reader.inverted_index(text_field);
let term_abcd = Term::from_field_text(text_field, "abcd"); let term_abcd = Term::from_field_text(text_field, "abcd");
assert!( assert!(inverted_index
inverted_index .read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions) .is_none());
.is_none()
);
let term_af = Term::from_field_text(text_field, "af"); let term_af = Term::from_field_text(text_field, "af");
let mut postings = inverted_index let mut postings = inverted_index
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
@@ -792,109 +773,84 @@ mod tests {
#[test] #[test]
fn test_searcher_1() { fn test_searcher_1() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let reader = index.reader().unwrap();
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ index_writer.add_document(doc!(text_field=>"af af af b"));
let doc = doc!(text_field=>"af af af b"); index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc); index_writer.add_document(doc!(text_field=>"a b c d"));
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc);
}
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
{ {
index.load_searchers().unwrap(); reader.reload().unwrap();
let searcher = index.searcher(); let searcher = reader.searcher();
let get_doc_ids = |terms: Vec<Term>| { let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms); let query = BooleanQuery::new_multiterms_query(terms);
let mut collector = TestCollector::default(); let topdocs = searcher.search(&query, &TestCollector).unwrap();
assert!(searcher.search(&query, &mut collector).is_ok()); topdocs.docs().to_vec()
collector.docs()
}; };
{ assert_eq!(
assert_eq!( get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
get_doc_ids(vec![Term::from_field_text(text_field, "a")]), vec![DocAddress(0, 1), DocAddress(0, 2)]
vec![1, 2] );
); assert_eq!(
} get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
{ vec![DocAddress(0, 0)]
assert_eq!( );
get_doc_ids(vec![Term::from_field_text(text_field, "af")]), assert_eq!(
vec![0] get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
); vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
} );
{ assert_eq!(
assert_eq!( get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
get_doc_ids(vec![Term::from_field_text(text_field, "b")]), vec![DocAddress(0, 1), DocAddress(0, 2)]
vec![0, 1, 2] );
); assert_eq!(
} get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
{ vec![DocAddress(0, 2)]
assert_eq!( );
get_doc_ids(vec![Term::from_field_text(text_field, "c")]), assert_eq!(
vec![1, 2] get_doc_ids(vec![
); Term::from_field_text(text_field, "b"),
} Term::from_field_text(text_field, "a"),
{ ]),
assert_eq!( vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
get_doc_ids(vec![Term::from_field_text(text_field, "d")]), );
vec![2]
);
}
{
assert_eq!(
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
]),
vec![0, 1, 2]
);
}
} }
} }
#[test] #[test]
fn test_searcher_2() { fn test_searcher_2() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0u64);
{ {
// writing the segment // writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ index_writer.add_document(doc!(text_field=>"af b"));
let doc = doc!(text_field=>"af b"); index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc); index_writer.add_document(doc!(text_field=>"a b c d"));
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc);
}
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
index.searcher(); reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 3u64);
} }
#[test] #[test]
fn test_doc_macro() { fn test_doc_macro() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let other_text_field = schema_builder.add_text_field("text2", TEXT); let other_text_field = schema_builder.add_text_field("text2", TEXT);
let document = doc!(text_field => "tantivy", let document = doc!(text_field => "tantivy",
@@ -912,11 +868,11 @@ mod tests {
#[test] #[test]
fn test_wrong_fast_field_type() { fn test_wrong_fast_field_type() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST); let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
let fast_field_signed = schema_builder.add_i64_field("signed", FAST); let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let stored_int_field = schema_builder.add_u64_field("text", INT_STORED); let stored_int_field = schema_builder.add_u64_field("text", STORED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -926,33 +882,32 @@ mod tests {
index_writer.add_document(document); index_writer.add_document(document);
index_writer.commit().unwrap(); index_writer.commit().unwrap();
} }
let reader = index.reader().unwrap();
index.load_searchers().unwrap(); let searcher = reader.searcher();
let searcher = index.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0); let segment_reader: &SegmentReader = searcher.segment_reader(0);
{ {
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field); let fast_field_reader_opt = segment_reader.fast_fields().u64(text_field);
assert!(fast_field_reader_res.is_err()); assert!(fast_field_reader_opt.is_none());
} }
{ {
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(stored_int_field); let fast_field_reader_opt = segment_reader.fast_fields().u64(stored_int_field);
assert!(fast_field_reader_res.is_err()); assert!(fast_field_reader_opt.is_none());
} }
{ {
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(fast_field_signed); let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_signed);
assert!(fast_field_reader_res.is_err()); assert!(fast_field_reader_opt.is_none());
} }
{ {
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed); let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
assert!(fast_field_reader_res.is_ok()); assert!(fast_field_reader_opt.is_some());
let fast_field_reader = fast_field_reader_res.unwrap(); let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64) assert_eq!(fast_field_reader.get(0), 4i64)
} }
{ {
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed); let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
assert!(fast_field_reader_res.is_ok()); assert!(fast_field_reader_opt.is_some());
let fast_field_reader = fast_field_reader_res.unwrap(); let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64) assert_eq!(fast_field_reader.get(0), 4i64)
} }
} }

View File

@@ -26,12 +26,12 @@
/// #[macro_use] /// #[macro_use]
/// extern crate tantivy; /// extern crate tantivy;
/// ///
/// use tantivy::schema::{SchemaBuilder, TEXT, FAST}; /// use tantivy::schema::{Schema, TEXT, FAST};
/// ///
/// //... /// //...
/// ///
/// # fn main() { /// # fn main() {
/// let mut schema_builder = SchemaBuilder::new(); /// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT); /// let title = schema_builder.add_text_field("title", TEXT);
/// let author = schema_builder.add_text_field("text", TEXT); /// let author = schema_builder.add_text_field("text", TEXT);
/// let likes = schema_builder.add_u64_field("num_u64", FAST); /// let likes = schema_builder.add_u64_field("num_u64", FAST);
@@ -61,39 +61,39 @@ macro_rules! doc(
}; };
// if there is a trailing comma retry with the trailing comma stripped. // if there is a trailing comma retry with the trailing comma stripped.
($($field:expr => $value:expr),+ ,) => { ($($field:expr => $value:expr),+ ,) => {
doc!( $( $field => $value ), *); doc!( $( $field => $value ), *)
}; };
); );
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use schema::{SchemaBuilder, FAST, TEXT}; use schema::{Schema, FAST, TEXT};
#[test] #[test]
fn test_doc_basic() { fn test_doc_basic() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT); let title = schema_builder.add_text_field("title", TEXT);
let author = schema_builder.add_text_field("text", TEXT); let author = schema_builder.add_text_field("text", TEXT);
let likes = schema_builder.add_u64_field("num_u64", FAST); let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build(); let _schema = schema_builder.build();
let _doc = doc!( let _doc = doc!(
title => "Life Aquatic", title => "Life Aquatic",
author => "Wes Anderson", author => "Wes Anderson",
likes => 4u64 likes => 4u64
); );
} }
#[test] #[test]
fn test_doc_trailing_comma() { fn test_doc_trailing_comma() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT); let title = schema_builder.add_text_field("title", TEXT);
let author = schema_builder.add_text_field("text", TEXT); let author = schema_builder.add_text_field("text", TEXT);
let likes = schema_builder.add_u64_field("num_u64", FAST); let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build(); let _schema = schema_builder.build();
let _doc = doc!( let _doc = doc!(
title => "Life Aquatic", title => "Life Aquatic",
author => "Wes Anderson", author => "Wes Anderson",
likes => 4u64, likes => 4u64,
); );
} }
} }

View File

@@ -34,10 +34,6 @@ const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
const LONG_SKIP_IN_BLOCKS: usize = 1_024; const LONG_SKIP_IN_BLOCKS: usize = 1_024;
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64; const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;
lazy_static! {
static ref BIT_PACKER: BitPacker4x = BitPacker4x::new();
}
#[cfg(test)] #[cfg(test)]
pub mod tests { pub mod tests {

View File

@@ -1,4 +1,23 @@
use super::BIT_PACKER; /// Positions works as a long sequence of compressed block.
/// All terms are chained one after the other.
///
/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
/// This means we need to skip to the `nth` positions efficiently.
///
/// This is done thanks to two levels of skipping that we refer to in the code
/// as `long_skip` and `short_skip`.
///
/// The `long_skip` makes it possible to skip every 1_024 compression blocks (= 131_072 positions).
/// Skip offsets are simply stored one after the other, each one encoded over 8 bytes.
///
/// We find the number of long skips, as `n / long_skip`.
///
/// Blocks are compressed using bitpacking, so `skip_read` contains the bit width
/// (values can go from 0 bits to 32 bits) required to decompress each block.
///
/// A given block obviously takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)`,
/// so skipping a block without decompressing it is just a matter of advancing that many
/// bytes.
use bitpacking::{BitPacker, BitPacker4x}; use bitpacking::{BitPacker, BitPacker4x};
use common::{BinarySerializable, FixedSize}; use common::{BinarySerializable, FixedSize};
use directory::ReadOnlySource; use directory::ReadOnlySource;
@@ -8,9 +27,65 @@ use positions::LONG_SKIP_INTERVAL;
use positions::LONG_SKIP_IN_BLOCKS; use positions::LONG_SKIP_IN_BLOCKS;
use postings::compression::compressed_block_size; use postings::compression::compressed_block_size;
/// Holds the position data split into its three parts, together with the
/// bitpacker used to decode it. `PositionReader`s are created from it
/// through `Positions::reader`.
struct Positions {
// Bitpacker copied into every `PositionReader` created by `reader`.
bit_packer: BitPacker4x,
// Short skip data: the part of the skip source located before the long skip table.
skip_source: ReadOnlySource,
// Bitpacked position blocks.
position_source: ReadOnlySource,
// Long skip table: consecutive 8-byte offsets into `position_source`.
long_skip_source: ReadOnlySource,
}
impl Positions {
    /// Splits the raw skip source into its short skip and long skip parts.
    ///
    /// The last `u32::SIZE_IN_BYTES` bytes of `skip_source` are a footer holding
    /// the number of long skip entries. The long skip table
    /// (`u64::SIZE_IN_BYTES` bytes per entry) sits right before that footer,
    /// and everything before the table is the short skip data.
    ///
    /// # Panics
    ///
    /// Panics with `"Index corrupted"` if the footer cannot be deserialized.
    pub fn new(position_source: ReadOnlySource, skip_source: ReadOnlySource) -> Positions {
        let footer_start = skip_source.len() - u32::SIZE_IN_BYTES;
        let (body, footer) = skip_source.split(footer_start);
        let num_long_skips =
            u32::deserialize(&mut footer.as_slice()).expect("Index corrupted") as usize;
        let long_skip_start = body.len() - u64::SIZE_IN_BYTES * num_long_skips;
        let (skip_source, long_skip_source) = body.split(long_skip_start);
        Positions {
            bit_packer: BitPacker4x::new(),
            skip_source,
            position_source,
            long_skip_source,
        }
    }

    /// Returns the offset of the block associated to the given `long_skip_id`.
    ///
    /// One `long_skip_id` means `LONG_SKIP_IN_BLOCKS` blocks.
    /// The id `0` is not stored in the table and always maps to offset `0`;
    /// the id `n > 0` is read from the `(n - 1)`-th 8-byte entry.
    fn long_skip(&self, long_skip_id: usize) -> u64 {
        match long_skip_id.checked_sub(1) {
            None => 0,
            Some(entry_idx) => {
                let table = self.long_skip_source.as_slice();
                let mut entry: &[u8] = &table[entry_idx * 8..][..8];
                u64::deserialize(&mut entry).expect("Index corrupted")
            }
        }
    }

    /// Creates a `PositionReader` positioned on the `offset`-th position.
    ///
    /// The long skip table is used to jump over whole groups of
    /// `LONG_SKIP_INTERVAL` positions; the remainder is then skipped
    /// through `PositionReader::skip`.
    fn reader(&self, offset: u64) -> PositionReader {
        let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
        let remaining_positions = (offset % LONG_SKIP_INTERVAL) as usize;
        let position_start = self.long_skip(long_skip_id) as usize;

        let mut position_read = OwnedRead::new(self.position_source.clone());
        position_read.advance(position_start);

        // The short skip data is advanced by one entry per block jumped over.
        let mut skip_read = OwnedRead::new(self.skip_source.clone());
        skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);

        let mut reader = PositionReader {
            bit_packer: self.bit_packer,
            skip_read,
            position_read,
            inner_offset: 0,
            buffer: Box::new([0u32; 128]),
            ahead: None,
        };
        reader.skip(remaining_positions);
        reader
    }
}
pub struct PositionReader { pub struct PositionReader {
skip_read: OwnedRead, skip_read: OwnedRead,
position_read: OwnedRead, position_read: OwnedRead,
bit_packer: BitPacker4x,
inner_offset: usize, inner_offset: usize,
buffer: Box<[u32; 128]>, buffer: Box<[u32; 128]>,
ahead: Option<usize>, // if None, no block is loaded. ahead: Option<usize>, // if None, no block is loaded.
@@ -27,6 +102,7 @@ pub struct PositionReader {
// If the requested number of els ends exactly at a given block, the next // If the requested number of els ends exactly at a given block, the next
// block is not decompressed. // block is not decompressed.
fn read_impl( fn read_impl(
bit_packer: BitPacker4x,
mut position: &[u8], mut position: &[u8],
buffer: &mut [u32; 128], buffer: &mut [u32; 128],
mut inner_offset: usize, mut inner_offset: usize,
@@ -37,21 +113,23 @@ fn read_impl(
let mut output_len = output.len(); let mut output_len = output.len();
let mut ahead = 0; let mut ahead = 0;
loop { loop {
let available_len = 128 - inner_offset; let available_len = COMPRESSION_BLOCK_SIZE - inner_offset;
// We have enough elements in the current block.
// Let's copy the requested elements in the output buffer,
// and return.
if output_len <= available_len { if output_len <= available_len {
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]); output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
return ahead; return ahead;
} else {
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
output_len -= available_len;
output_start += available_len;
inner_offset = 0;
let num_bits = num_bits[ahead];
BitPacker4x::new().decompress(position, &mut buffer[..], num_bits);
let block_len = compressed_block_size(num_bits);
position = &position[block_len..];
ahead += 1;
} }
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
output_len -= available_len;
output_start += available_len;
inner_offset = 0;
let num_bits = num_bits[ahead];
bit_packer.decompress(position, &mut buffer[..], num_bits);
let block_len = compressed_block_size(num_bits);
position = &position[block_len..];
ahead += 1;
} }
} }
@@ -61,35 +139,7 @@ impl PositionReader {
skip_source: ReadOnlySource, skip_source: ReadOnlySource,
offset: u64, offset: u64,
) -> PositionReader { ) -> PositionReader {
let skip_len = skip_source.len(); Positions::new(position_source, skip_source).reader(offset)
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
let (skip_body, long_skips) = body.split(body_split);
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize;
let offset_num_bytes: u64 = {
if long_skip_id > 0 {
let mut long_skip_blocks: &[u8] =
&long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16
} else {
0
}
};
let mut position_read = OwnedRead::new(position_source);
position_read.advance(offset_num_bytes as usize);
let mut skip_read = OwnedRead::new(skip_body);
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
let mut position_reader = PositionReader {
skip_read,
position_read,
inner_offset: 0,
buffer: Box::new([0u32; 128]),
ahead: None,
};
position_reader.skip(small_skip);
position_reader
} }
/// Fills a buffer with the next `output.len()` integers. /// Fills a buffer with the next `output.len()` integers.
@@ -101,10 +151,13 @@ impl PositionReader {
if self.ahead != Some(0) { if self.ahead != Some(0) {
// the block currently available is not the block // the block currently available is not the block
// for the current position // for the current position
BIT_PACKER.decompress(position_data, self.buffer.as_mut(), num_bits); self.bit_packer
.decompress(position_data, self.buffer.as_mut(), num_bits);
self.ahead = Some(0);
} }
let block_len = compressed_block_size(num_bits); let block_len = compressed_block_size(num_bits);
self.ahead = Some(read_impl( self.ahead = Some(read_impl(
self.bit_packer,
&position_data[block_len..], &position_data[block_len..],
self.buffer.as_mut(), self.buffer.as_mut(),
self.inner_offset, self.inner_offset,
@@ -133,14 +186,13 @@ impl PositionReader {
} }
}); });
let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance] let skip_len_in_bits = self.skip_read.as_ref()[..num_blocks_to_advance]
.iter() .iter()
.cloned() .map(|num_bits| *num_bits as usize)
.map(|num_bit| num_bit as usize)
.sum::<usize>() .sum::<usize>()
* (COMPRESSION_BLOCK_SIZE / 8); * COMPRESSION_BLOCK_SIZE;
let skip_len_in_bytes = skip_len_in_bits / 8;
self.skip_read.advance(num_blocks_to_advance); self.skip_read.advance(num_blocks_to_advance);
self.position_read.advance(skip_len); self.position_read.advance(skip_len_in_bytes);
} }
} }

View File

@@ -1,29 +1,30 @@
use super::BIT_PACKER;
use bitpacking::BitPacker; use bitpacking::BitPacker;
use bitpacking::BitPacker4x;
use common::BinarySerializable; use common::BinarySerializable;
use common::CountingWriter;
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL}; use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
use std::io; use std::io::{self, Write};
pub struct PositionSerializer<W: io::Write> { pub struct PositionSerializer<W: io::Write> {
write_stream: W, bit_packer: BitPacker4x,
write_stream: CountingWriter<W>,
write_skiplist: W, write_skiplist: W,
block: Vec<u32>, block: Vec<u32>,
buffer: Vec<u8>, buffer: Vec<u8>,
num_ints: u64, num_ints: u64,
long_skips: Vec<u64>, long_skips: Vec<u64>,
cumulated_num_bits: u64,
} }
impl<W: io::Write> PositionSerializer<W> { impl<W: io::Write> PositionSerializer<W> {
pub fn new(write_stream: W, write_skiplist: W) -> PositionSerializer<W> { pub fn new(write_stream: W, write_skiplist: W) -> PositionSerializer<W> {
PositionSerializer { PositionSerializer {
write_stream, bit_packer: BitPacker4x::new(),
write_stream: CountingWriter::wrap(write_stream),
write_skiplist, write_skiplist,
block: Vec::with_capacity(128), block: Vec::with_capacity(128),
buffer: vec![0u8; 128 * 4], buffer: vec![0u8; 128 * 4],
num_ints: 0u64, num_ints: 0u64,
long_skips: Vec::new(), long_skips: Vec::new(),
cumulated_num_bits: 0u64,
} }
} }
@@ -50,14 +51,15 @@ impl<W: io::Write> PositionSerializer<W> {
} }
fn flush_block(&mut self) -> io::Result<()> { fn flush_block(&mut self) -> io::Result<()> {
let num_bits = BIT_PACKER.num_bits(&self.block[..]); let num_bits = self.bit_packer.num_bits(&self.block[..]);
self.cumulated_num_bits += u64::from(num_bits);
self.write_skiplist.write_all(&[num_bits])?; self.write_skiplist.write_all(&[num_bits])?;
let written_len = BIT_PACKER.compress(&self.block[..], &mut self.buffer, num_bits); let written_len = self
.bit_packer
.compress(&self.block[..], &mut self.buffer, num_bits);
self.write_stream.write_all(&self.buffer[..written_len])?; self.write_stream.write_all(&self.buffer[..written_len])?;
self.block.clear(); self.block.clear();
if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 { if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 {
self.long_skips.push(self.cumulated_num_bits); self.long_skips.push(self.write_stream.written_bytes());
} }
Ok(()) Ok(())
} }

View File

@@ -0,0 +1,249 @@
use postings::compression::AlignedBuffer;
/// This module defines the logic used to search for a doc in a given
/// block (at most 128 docs).
///
/// Searching within a block is a hotspot when running intersections,
/// so it was worth defining it in its own module.
// SSE2 implementation of the in-block search. Only compiled on x86_64.
#[cfg(target_arch = "x86_64")]
mod sse2 {
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
use std::arch::x86_64::__m128i as DataType;
use std::arch::x86_64::_mm_add_epi32 as op_add;
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
use std::arch::x86_64::_mm_set1_epi32 as set1;
use std::arch::x86_64::_mm_setzero_si128 as set0;
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
// Shuffle control 0b01_00_11_10: swaps the two 64-bit halves of the vector.
const MASK1: i32 = 78;
// Shuffle control 0b10_11_00_01: swaps the two 32-bit lanes within each 64-bit half.
const MASK2: i32 = 177;
/// Performs an exhaustive linear search over the full block.
///
/// There is no early exit here. We simply count the
/// number of elements that are `< target`.
///
/// NOTE(review): `_mm_cmplt_epi32` compares lanes as signed 32-bit integers and
/// `target` is cast to `i32`, so the count is only meaningful while the block
/// values and `target` fit in an `i32` — confirm against callers.
pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
// SAFETY: the aligned loads below read `COMPRESSION_BLOCK_SIZE / 4` consecutive
// `__m128i` starting at `arr`, i.e. exactly the `COMPRESSION_BLOCK_SIZE` `u32`
// of the buffer. They rely on `AlignedBuffer` being at least 16-byte aligned.
// NOTE(review): SSE2 availability is presumably guaranteed by the caller — confirm.
unsafe {
let ptr = arr as *const AlignedBuffer as *const DataType;
let vkey = set1(target as i32);
let mut cnt = set0();
// We work over 4 `__m128i` at a time.
// A single `__m128i` actually contains 4 `u32`.
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
// A lane is all ones (-1) when the comparison holds, so `sum` holds
// the negated per-lane match count; subtracting it adds to the counters.
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
cnt = op_sub(cnt, sum);
}
// Horizontal sum of the four lane counters: after these two steps
// every lane holds the total.
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
// Extracts the lowest lane.
_mm_cvtsi128_si32(cnt) as usize
}
}
#[cfg(test)]
mod test {
use super::linear_search_sse2_128;
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test]
fn test_linear_search_sse2_128_u32() {
let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
for el in 0u32..128u32 {
// `+` binds tighter than `<<`: this is `(el * 2 + 1) << 18`.
block[el as usize] = el * 2 + 1 << 18;
}
// Elements 0..=64 are strictly smaller than `block[64] + 1`.
let target = block[64] + 1;
assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
}
}
}
/// Counts the elements of `arr` that are strictly smaller than `target`.
///
/// The slice is always scanned exhaustively: there is no early exit,
/// as such an exit would be very difficult for the CPU to predict.
/// On a sorted slice, the result is the index of the first element
/// greater or equal to `target` (or `arr.len()` if there is none).
///
/// Coupled with `exponential search` this function is likely
/// to be called with the same `len`.
fn linear_search(arr: &[u32], target: u32) -> usize {
    arr.iter().filter(|&&el| el < target).count()
}
/// Narrows down the range of `arr` in which `target` has to be searched.
///
/// The indices `1, 3, 7, 15, 31, 63` are probed in order, as long as they
/// are within the slice. The function returns `(begin, end)` where `end` is
/// the first probed index holding a value strictly greater than `target`
/// (or `arr.len()` if no probe did), and `begin` is the probe before it
/// (or `0` if there was none).
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
    const PIVOTS: [usize; 6] = [1, 3, 7, 15, 31, 63];
    let len = arr.len();
    let mut lower = 0;
    for &pivot in PIVOTS.iter().take_while(|&&pivot| pivot < len) {
        if arr[pivot] > target {
            return (lower, pivot);
        }
        lower = pivot;
    }
    (lower, len)
}
/// Counts the elements of `block_docs` smaller than `target` in two steps:
/// an exponential search to narrow down a window, followed by a linear
/// search restricted to that window.
fn galloping(block_docs: &[u32], target: u32) -> usize {
    let (start, end) = exponential_search(block_docs, target);
    let window = &block_docs[start..end];
    start + linear_search(window, target)
}
/// Tantivy may rely on SIMD instructions to search for a specific document within
/// a given block.
///
/// The variant selects the implementation used by `search_in_block`.
#[derive(Clone, Copy, PartialEq)]
pub enum BlockSearcher {
/// SSE2 implementation. This variant only exists on `x86_64` targets.
#[cfg(target_arch = "x86_64")]
SSE2,
/// Portable implementation, available on every target.
Scalar,
}
impl BlockSearcher {
    /// Searches for the first index holding an element greater or equal to
    /// the target.
    ///
    /// The result should be equivalent to
    /// ```ignore
    /// block[..]
    ///     .iter()
    ///     .take_while(|&&val| val < target)
    ///     .count()
    /// ```
    ///
    /// The `start` argument is only a hint that the result is greater or
    /// equal to `start`. The implementation may or may not use
    /// it for optimization.
    ///
    /// # Assumption
    ///
    /// The array len is > start.
    /// The block is sorted.
    /// The target is assumed greater or equal to `arr[start]`.
    /// The target is assumed smaller or equal to the last element of the block.
    ///
    /// Currently the scalar implementation starts with an exponential search, and
    /// then runs a linear search in the resulting subarray.
    ///
    /// If SSE2 instructions are available in the `(platform, running CPU)`,
    /// then we use a different implementation that does an exhaustive linear search over
    /// the full block whenever the block is full (`len == 128`). It is surprisingly
    /// faster, most likely because of the lack of branch.
    pub(crate) fn search_in_block(
        self,
        block_docs: &AlignedBuffer,
        len: usize,
        start: usize,
        target: u32,
    ) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            use postings::compression::COMPRESSION_BLOCK_SIZE;
            // The SSE2 routine always scans a whole block, so it can only
            // be used when the block is full.
            let is_full_block = len == COMPRESSION_BLOCK_SIZE;
            if self == BlockSearcher::SSE2 && is_full_block {
                return sse2::linear_search_sse2_128(block_docs, target);
            }
        }
        let candidates = &block_docs.0[start..len];
        start + galloping(candidates, target)
    }
}
impl Default for BlockSearcher {
    /// Picks `BlockSearcher::SSE2` when compiled for `x86_64` and the running
    /// CPU reports SSE2 support, and `BlockSearcher::Scalar` otherwise.
    fn default() -> BlockSearcher {
        #[cfg(target_arch = "x86_64")]
        {
            let has_sse2 = is_x86_feature_detected!("sse2");
            if has_sse2 {
                return BlockSearcher::SSE2;
            }
        }
        BlockSearcher::Scalar
    }
}
#[cfg(test)]
mod tests {
    use super::exponential_search;
    use super::linear_search;
    use super::BlockSearcher;
    use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};

    #[test]
    fn test_linear_search() {
        let len: usize = 50;
        let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
        for target in 1..*arr.last().unwrap() {
            let res = linear_search(&arr[..], target);
            if res > 0 {
                assert!(arr[res - 1] < target);
            }
            if res < len {
                assert!(arr[res] >= target);
            }
        }
    }

    #[test]
    fn test_exponential_search() {
        assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
        assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
        assert_eq!(
            exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
            (3, 7)
        );
    }

    /// Checks `search_in_block` against the trivial implementation, for
    /// every valid `start` hint (i.e. every index strictly before the result).
    fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
        let cursor = search_in_block_trivial_but_slow(block, target);
        // Full blocks are allowed: they are the only ones reaching the SSE2 code path.
        assert!(block.len() <= COMPRESSION_BLOCK_SIZE);
        // The unused tail of the buffer is padded with `u32::max_value()`.
        let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
        output_buffer[..block.len()].copy_from_slice(block);
        for i in 0..cursor {
            assert_eq!(
                block_searcher.search_in_block(
                    &AlignedBuffer(output_buffer),
                    block.len(),
                    i,
                    target
                ),
                cursor
            );
        }
    }

    /// Runs `util_test_search_in_block` for every value of the block, and for
    /// the value right before it (except for the first element).
    fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
        use std::collections::HashSet;
        let mut targets = HashSet::new();
        for (i, val) in block.iter().cloned().enumerate() {
            if i > 0 {
                targets.insert(val - 1);
            }
            targets.insert(val);
        }
        for target in targets {
            util_test_search_in_block(block_searcher, block, target);
        }
    }

    /// Reference implementation `search_in_block` is compared against.
    fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
        block.iter().take_while(|&&val| val < target).count()
    }

    /// Tests every block length from 1 up to and including a full block.
    ///
    /// The full block (`len == COMPRESSION_BLOCK_SIZE`) has to be part of the
    /// range: it is the only length for which `search_in_block` dispatches to
    /// the SSE2 implementation.
    fn test_search_in_block_util(block_searcher: BlockSearcher) {
        for len in 1u32..=(COMPRESSION_BLOCK_SIZE as u32) {
            let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
            util_test_search_in_block_all(block_searcher, &v[..]);
        }
    }

    #[test]
    fn test_search_in_block_scalar() {
        test_search_in_block_util(BlockSearcher::Scalar);
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_search_in_block_sse2() {
        test_search_in_block_util(BlockSearcher::SSE2);
    }
}

View File

@@ -43,9 +43,14 @@ impl BlockEncoder {
} }
} }
/// Buffer holding one decoded block.
///
/// We ensure that the buffer is aligned in order to run the SSE2 linear
/// search on it: its aligned loads require a 128-bit (16-byte) alignment.
///
/// NOTE(review): `repr(align(N))` expresses `N` in bytes, so the alignment
/// requested below is 128 bytes, which is stricter than the 128 bits
/// mentioned above — confirm this is intended.
#[repr(align(128))]
pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
pub struct BlockDecoder { pub struct BlockDecoder {
bitpacker: BitPacker4x, bitpacker: BitPacker4x,
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1], output: AlignedBuffer,
pub output_len: usize, pub output_len: usize,
} }
@@ -55,11 +60,9 @@ impl BlockDecoder {
} }
pub fn with_val(val: u32) -> BlockDecoder { pub fn with_val(val: u32) -> BlockDecoder {
let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
output[COMPRESSION_BLOCK_SIZE] = 0u32;
BlockDecoder { BlockDecoder {
bitpacker: BitPacker4x::new(), bitpacker: BitPacker4x::new(),
output, output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
output_len: 0, output_len: 0,
} }
} }
@@ -72,23 +75,28 @@ impl BlockDecoder {
) -> usize { ) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE; self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker self.bitpacker
.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits) .decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
} }
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize { pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE; self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker self.bitpacker
.decompress(&compressed_data, &mut self.output, num_bits) .decompress(&compressed_data, &mut self.output.0, num_bits)
} }
#[inline] #[inline]
pub fn output_array(&self) -> &[u32] { pub fn output_array(&self) -> &[u32] {
&self.output[..self.output_len] &self.output.0[..self.output_len]
}
#[inline]
pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
(&self.output, self.output_len)
} }
#[inline] #[inline]
pub fn output(&self, idx: usize) -> u32 { pub fn output(&self, idx: usize) -> u32 {
self.output[idx] self.output.0[idx]
} }
} }
@@ -159,12 +167,12 @@ impl VIntDecoder for BlockDecoder {
num_els: usize, num_els: usize,
) -> usize { ) -> usize {
self.output_len = num_els; self.output_len = num_els;
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset) vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
} }
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize { fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
self.output_len = num_els; self.output_len = num_els;
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els]) vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
} }
} }
@@ -266,21 +274,17 @@ pub mod tests {
mod bench { mod bench {
use super::*; use super::*;
use rand::Rng;
use rand::SeedableRng; use rand::SeedableRng;
use rand::XorShiftRng; use rand::{Rng, XorShiftRng};
use test::Bencher; use test::Bencher;
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> { fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val]; let seed: &[u8; 16] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0..u32::max_value()) (0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
.filter(|_| rng.next_f32() < ratio)
.take(n)
.collect()
} }
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> { pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
generate_array_with_seed(n, ratio, 4) generate_array_with_seed(n, ratio, 4)
} }
@@ -297,24 +301,23 @@ mod bench {
fn bench_uncompress(b: &mut Bencher) { fn bench_uncompress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new(); let mut encoder = BlockEncoder::new();
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1); let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let (_, compressed) = encoder.compress_block_sorted(&data, 0u32); let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new(); let mut decoder = BlockDecoder::new();
b.iter(|| { b.iter(|| {
decoder.uncompress_block_sorted(compressed, 0u32); decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
}); });
} }
#[test] #[test]
fn test_all_docs_compression_numbits() { fn test_all_docs_compression_numbits() {
for num_bits in 0..33 { for expected_num_bits in 0u8.. {
let mut data = [0u32; 128]; let mut data = [0u32; 128];
if num_bits > 0 { if expected_num_bits > 0 {
data[0] = 1 << (num_bits - 1); data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
} }
let mut encoder = BlockEncoder::new(); let mut encoder = BlockEncoder::new();
let compressed = encoder.compress_block_unsorted(&data); let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
assert_eq!(compressed[0] as usize, num_bits); assert_eq!(compressed.len(), compressed_block_size(num_bits));
assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
} }
} }

View File

@@ -2,6 +2,7 @@
Postings module (also called inverted index) Postings module (also called inverted index)
*/ */
mod block_search;
pub(crate) mod compression; pub(crate) mod compression;
/// Postings module /// Postings module
/// ///
@@ -16,6 +17,8 @@ mod skip;
mod stacker; mod stacker;
mod term_info; mod term_info;
pub(crate) use self::block_search::BlockSearcher;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter; pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer}; pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
@@ -31,7 +34,6 @@ pub(crate) use self::stacker::compute_table_size;
pub use common::HasLen; pub use common::HasLen;
pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32; pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32;
pub(crate) type UnorderedTermId = u64; pub(crate) type UnorderedTermId = u64;
#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))] #[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))]
@@ -53,18 +55,21 @@ pub mod tests {
use fieldnorm::FieldNormReader; use fieldnorm::FieldNormReader;
use indexer::operation::AddOperation; use indexer::operation::AddOperation;
use indexer::SegmentWriter; use indexer::SegmentWriter;
use merge_policy::NoMergePolicy;
use query::Scorer; use query::Scorer;
use rand::{Rng, SeedableRng, XorShiftRng}; use rand::rngs::StdRng;
use schema::Field; use rand::{Rng, SeedableRng};
use schema::IndexRecordOption; use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT}; use schema::{Field, TextOptions};
use schema::{IndexRecordOption, TextFieldIndexing};
use std::iter; use std::iter;
use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
use DocId; use DocId;
use Score; use Score;
#[test] #[test]
pub fn test_position_write() { pub fn test_position_write() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -88,7 +93,7 @@ pub mod tests {
#[test] #[test]
pub fn test_skip_positions() { pub fn test_skip_positions() {
let mut schema_builder = SchemaBuilder::new(); let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT); let title = schema_builder.add_text_field("title", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
@@ -100,14 +105,11 @@ pub mod tests {
} }
index_writer.add_document(doc!(title => r#"abc be be be be abc"#)); index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher(); let searcher = index.reader().unwrap().searcher();
let inverted_index = searcher.segment_reader(0u32).inverted_index(title); let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
let term = Term::from_field_text(title, "abc"); let term = Term::from_field_text(title, "abc");
let mut positions = Vec::new(); let mut positions = Vec::new();
{ {
let mut postings = inverted_index let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions) .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
@@ -160,10 +162,56 @@ pub mod tests {
} }
} }
/// Checks that a token longer than `MAX_TOKEN_LEN` is left out of the term
/// dictionary, while a token of exactly `MAX_TOKEN_LEN` bytes is kept.
#[test]
pub fn test_drop_token_that_are_too_long() {
    // Exactly at the limit: expected to end up in the term dictionary.
    let max_len_token: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
    // One byte over the limit, followed by a short token that must survive.
    let mut oversized_then_hello: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
    oversized_then_hello.push_str(" hello");

    // The field uses the tokenizer registered below under "simple_no_truncation".
    let indexing = TextFieldIndexing::default()
        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
        .set_tokenizer("simple_no_truncation");
    let field_options = TextOptions::default().set_indexing_options(indexing);
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", field_options);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());
    index
        .tokenizers()
        .register("simple_no_truncation", SimpleTokenizer);
    let reader = index.reader().unwrap();
    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    // NOTE(review): presumably merging is disabled so that each commit below keeps
    // its own segment (ordinals 0 and 1) -- confirm against the merge policy docs.
    index_writer.set_merge_policy(Box::new(NoMergePolicy));

    // First commit: only "hello" may be indexed, the oversized token is dropped.
    {
        index_writer.add_document(doc!(text_field=>oversized_then_hello));
        index_writer.commit().unwrap();
        reader.reload().unwrap();
        let searcher = reader.searcher();
        let first_segment = searcher.segment_reader(0u32);
        let inverted_index = first_segment.inverted_index(text_field);
        assert_eq!(inverted_index.terms().num_terms(), 1);
        let mut term_bytes = vec![];
        assert!(inverted_index.terms().ord_to_term(0, &mut term_bytes));
        assert_eq!(&term_bytes, b"hello");
    }

    // Second commit: the token sitting exactly at the limit is indexed as is.
    {
        index_writer.add_document(doc!(text_field=>max_len_token.clone()));
        index_writer.commit().unwrap();
        reader.reload().unwrap();
        let searcher = reader.searcher();
        let second_segment = searcher.segment_reader(1u32);
        let inverted_index = second_segment.inverted_index(text_field);
        assert_eq!(inverted_index.terms().num_terms(), 1);
        let mut term_bytes = vec![];
        assert!(inverted_index.terms().ord_to_term(0, &mut term_bytes));
        assert_eq!(&term_bytes[..], max_len_token.as_bytes());
    }
}
#[test] #[test]
pub fn test_position_and_fieldnorm1() { pub fn test_position_and_fieldnorm1() {
let mut positions = Vec::new(); let mut positions = Vec::new();
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
@@ -220,12 +268,10 @@ pub mod tests {
} }
{ {
let term_a = Term::from_field_text(text_field, "abcdef"); let term_a = Term::from_field_text(text_field, "abcdef");
assert!( assert!(segment_reader
segment_reader .inverted_index(term_a.field())
.inverted_index(term_a.field()) .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) .is_none());
.is_none()
);
} }
{ {
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
@@ -276,12 +322,12 @@ pub mod tests {
#[test] #[test]
pub fn test_position_and_fieldnorm2() { pub fn test_position_and_fieldnorm2() {
let mut positions: Vec<u32> = Vec::new(); let mut positions: Vec<u32> = Vec::new();
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT); let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{ {
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_text(text_field, "g b b d c g c"); doc.add_text(text_field, "g b b d c g c");
@@ -294,9 +340,8 @@ pub mod tests {
} }
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
index.load_searchers().unwrap();
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
let searcher = index.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut postings = segment_reader let mut postings = segment_reader
.inverted_index(text_field) .inverted_index(text_field)
@@ -317,13 +362,13 @@ pub mod tests {
let num_docs = 300u32; let num_docs = 300u32;
let index = { let index = {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_u64_field("value", INT_INDEXED); let value_field = schema_builder.add_u64_field("value", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for i in 0..num_docs { for i in 0..num_docs {
let mut doc = Document::default(); let mut doc = Document::default();
doc.add_u64(value_field, 2); doc.add_u64(value_field, 2);
@@ -333,10 +378,9 @@ pub mod tests {
} }
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
index.load_searchers().unwrap();
index index
}; };
let searcher = index.searcher(); let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
// check that the basic usage works // check that the basic usage works
@@ -400,12 +444,11 @@ pub mod tests {
// delete some of the documents // delete some of the documents
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.delete_term(term_0); index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
index.load_searchers().unwrap(); let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
// make sure seeking still works // make sure seeking still works
@@ -450,33 +493,19 @@ pub mod tests {
// delete everything else // delete everything else
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.delete_term(term_1); index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
index.load_searchers().unwrap(); let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
// finally, check that it's empty // finally, check that it's empty
{ {
let mut segment_postings = segment_reader let searchable_segment_ids = index
.inverted_index(term_2.field()) .searchable_segment_ids()
.read_postings(&term_2, IndexRecordOption::Basic) .expect("could not get index segment ids");
.unwrap(); assert!(searchable_segment_ids.is_empty());
assert_eq!(searcher.num_docs(), 0);
assert_eq!(segment_postings.skip_next(0), SkipResult::Reached);
assert_eq!(segment_postings.doc(), 0);
assert!(segment_reader.is_deleted(0));
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End);
} }
} }
@@ -498,17 +527,16 @@ pub mod tests {
Term::from_field_text(field, "d") Term::from_field_text(field, "d")
}; };
pub static ref INDEX: Index = { pub static ref INDEX: Index = {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", STRING); let text_field = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build(); let schema = schema_builder.build();
let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let mut rng: XorShiftRng = XorShiftRng::from_seed(seed);
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000; let posting_list_size = 1_000_000;
{ {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for _ in 0..posting_list_size { for _ in 0..posting_list_size {
let mut doc = Document::default(); let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) { if rng.gen_bool(1f64 / 15f64) {
@@ -525,7 +553,6 @@ pub mod tests {
} }
assert!(index_writer.commit().is_ok()); assert!(index_writer.commit().is_ok());
} }
index.load_searchers().unwrap();
index index
}; };
} }
@@ -654,7 +681,7 @@ mod bench {
}); });
} }
fn bench_skip_next(p: f32, b: &mut Bencher) { fn bench_skip_next(p: f64, b: &mut Bencher) {
let searcher = INDEX.searcher(); let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let docs = tests::sample(segment_reader.num_docs(), p); let docs = tests::sample(segment_reader.num_docs(), p);

View File

@@ -1,6 +1,8 @@
use super::stacker::{Addr, MemoryArena, TermHashMap}; use super::stacker::{Addr, MemoryArena, TermHashMap};
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder}; use postings::recorder::{
BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
};
use postings::UnorderedTermId; use postings::UnorderedTermId;
use postings::{FieldSerializer, InvertedIndexSerializer}; use postings::{FieldSerializer, InvertedIndexSerializer};
use schema::IndexRecordOption; use schema::IndexRecordOption;
@@ -10,8 +12,8 @@ use std::io;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::ops::DerefMut; use std::ops::DerefMut;
use termdict::TermOrdinal; use termdict::TermOrdinal;
use tokenizer::Token;
use tokenizer::TokenStream; use tokenizer::TokenStream;
use tokenizer::{Token, MAX_TOKEN_LEN};
use DocId; use DocId;
use Result; use Result;
@@ -29,10 +31,12 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
IndexRecordOption::WithFreqsAndPositions => { IndexRecordOption::WithFreqsAndPositions => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed() SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
} }
}).unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()), })
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => { .unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
SpecializedPostingsWriter::<NothingRecorder>::new_boxed() FieldType::U64(_)
} | FieldType::I64(_)
| FieldType::Date(_)
| FieldType::HierarchicalFacet => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
FieldType::Bytes => { FieldType::Bytes => {
// FieldType::Bytes cannot actually be indexed. // FieldType::Bytes cannot actually be indexed.
// TODO fix during the indexer refactoring described in #276 // TODO fix during the indexer refactoring described in #276
@@ -48,6 +52,31 @@ pub struct MultiFieldPostingsWriter {
per_field_postings_writers: Vec<Box<PostingsWriter>>, per_field_postings_writers: Vec<Box<PostingsWriter>>,
} }
/// Splits a list of terms into one contiguous index range per field.
///
/// Returns `(field, start, stop)` triples, where `start..stop` is the half-open
/// range of indices in `term_offsets` whose terms belong to `field`. A new range
/// is opened every time the field of a term differs from the field of the term
/// right before it, so the input is expected to already be grouped by field
/// (for instance because it was sorted by key). The ranges are returned in
/// input order and together cover `0..term_offsets.len()`.
///
/// An empty input yields an empty partition.
fn make_field_partition(
    term_offsets: &[(&[u8], Addr, UnorderedTermId)],
) -> Vec<(Field, usize, usize)> {
    let mut field_offsets: Vec<(Field, usize, usize)> = Vec::new();
    // `None` until the first term has been seen. Using an `Option` rather than a
    // reserved "impossible" field id means no field value can be mistaken for
    // the initial state and have its first range skipped.
    let mut prev_field: Option<Field> = None;
    for (offset, (key, _, _)) in term_offsets.iter().enumerate() {
        let field = Term::wrap(key).field();
        if prev_field != Some(field) {
            // The range that was open so far stops where the new one starts.
            if let Some(previous_range) = field_offsets.last_mut() {
                previous_range.2 = offset;
            }
            // Provisional stop: stays at the end of the input unless another
            // field shows up later and closes this range.
            field_offsets.push((field, offset, term_offsets.len()));
            prev_field = Some(field);
        }
    }
    field_offsets
}
impl MultiFieldPostingsWriter { impl MultiFieldPostingsWriter {
/// Create a new `MultiFieldPostingsWriter` given /// Create a new `MultiFieldPostingsWriter` given
/// a schema and a heap. /// a schema and a heap.
@@ -93,38 +122,16 @@ impl MultiFieldPostingsWriter {
&self, &self,
serializer: &mut InvertedIndexSerializer, serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> { ) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
.term_index self.term_index.iter().collect();
.iter()
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
.collect();
term_offsets.sort_unstable_by_key(|&(k, _, _)| k); term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
let mut offsets: Vec<(Field, usize)> = vec![]; let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
let term_offsets_it = term_offsets HashMap::new();
.iter()
.cloned()
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut unordered_term_mappings: HashMap< let field_offsets = make_field_partition(&term_offsets);
Field,
HashMap<UnorderedTermId, TermOrdinal>,
> = HashMap::new();
let mut prev_field = Field(u32::max_value());
for (offset, field) in term_offsets_it {
if field != prev_field {
offsets.push((field, offset));
prev_field = field;
}
}
offsets.push((Field(0), term_offsets.len()));
for i in 0..(offsets.len() - 1) {
let (field, start) = offsets[i];
let (_, stop) = offsets[i + 1];
for (field, start, stop) in field_offsets {
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
match *field_entry.field_type() { match *field_entry.field_type() {
@@ -138,10 +145,11 @@ impl MultiFieldPostingsWriter {
.enumerate() .enumerate()
.map(|(term_ord, unord_term_id)| { .map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal) (unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
}).collect(); })
.collect();
unordered_term_mappings.insert(field, mapping); unordered_term_mappings.insert(field, mapping);
} }
FieldType::U64(_) | FieldType::I64(_) => {} FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => {}
FieldType::Bytes => {} FieldType::Bytes => {}
} }
@@ -202,8 +210,18 @@ pub trait PostingsWriter {
) -> u32 { ) -> u32 {
let mut term = Term::for_field(field); let mut term = Term::for_field(field);
let mut sink = |token: &Token| { let mut sink = |token: &Token| {
term.set_text(token.text.as_str()); // We skip all tokens with a len greater than u16.
self.subscribe(term_index, doc_id, token.position as u32, &term, heap); if token.text.len() <= MAX_TOKEN_LEN {
term.set_text(token.text.as_str());
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
} else {
info!(
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
MAX_TOKEN_LEN in the documentation for more information.",
token.text.len(),
MAX_TOKEN_LEN
);
}
}; };
token_stream.process(&mut sink) token_stream.process(&mut sink)
} }
@@ -213,7 +231,7 @@ pub trait PostingsWriter {
/// The `SpecializedPostingsWriter` is just here to remove dynamic /// The `SpecializedPostingsWriter` is just here to remove dynamic
/// dispatch to the recorder information. /// dispatch to the recorder information.
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> { pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
total_num_tokens: u64, total_num_tokens: u64,
_recorder_type: PhantomData<Rec>, _recorder_type: PhantomData<Rec>,
} }
@@ -245,8 +263,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
debug_assert!(term.as_slice().len() >= 4); debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1; self.total_num_tokens += 1;
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| { term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
if opt_recorder.is_some() { if let Some(mut recorder) = opt_recorder {
let mut recorder = opt_recorder.unwrap();
let current_doc = recorder.current_doc(); let current_doc = recorder.current_doc();
if current_doc != doc { if current_doc != doc {
recorder.close_doc(heap); recorder.close_doc(heap);
@@ -255,7 +272,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
recorder.record_position(position, heap); recorder.record_position(position, heap);
recorder recorder
} else { } else {
let mut recorder = Rec::new(heap); let mut recorder = Rec::new();
recorder.new_doc(doc, heap); recorder.new_doc(doc, heap);
recorder.record_position(position, heap); recorder.record_position(position, heap);
recorder recorder
@@ -270,10 +287,11 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
termdict_heap: &MemoryArena, termdict_heap: &MemoryArena,
heap: &MemoryArena, heap: &MemoryArena,
) -> io::Result<()> { ) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for &(term_bytes, addr, _) in term_addrs { for &(term_bytes, addr, _) in term_addrs {
let recorder: Rec = unsafe { termdict_heap.read(addr) }; let recorder: Rec = termdict_heap.read(addr);
serializer.new_term(&term_bytes[4..])?; serializer.new_term(&term_bytes[4..])?;
recorder.serialize(serializer, heap)?; recorder.serialize(&mut buffer_lender, serializer, heap)?;
serializer.close_term()?; serializer.close_term()?;
} }
Ok(()) Ok(())

View File

@@ -1,10 +1,50 @@
use super::stacker::{ExpUnrolledLinkedList, MemoryArena}; use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use common::{read_u32_vint, write_u32_vint};
use postings::FieldSerializer; use postings::FieldSerializer;
use std::{self, io}; use std::io;
use DocId; use DocId;
const EMPTY_ARRAY: [u32; 0] = [0u32; 0]; const POSITION_END: u32 = 0;
const POSITION_END: u32 = std::u32::MAX;
/// Pair of scratch buffers that can be borrowed over and over.
///
/// Each `lend_*` method empties the buffer(s) it hands out before returning
/// them, so the borrower always starts from an empty `Vec`. `Vec::clear` keeps
/// the allocated capacity, which lets successive borrowers reuse the same
/// allocation.
#[derive(Default)]
pub(crate) struct BufferLender {
    // Scratch space for bytes.
    buffer_u8: Vec<u8>,
    // Scratch space for `u32` values.
    buffer_u32: Vec<u32>,
}

impl BufferLender {
    /// Returns the byte buffer, emptied. The `u32` buffer is left untouched.
    pub fn lend_u8(&mut self) -> &mut Vec<u8> {
        let bytes = &mut self.buffer_u8;
        bytes.clear();
        bytes
    }

    /// Returns both buffers, each of them emptied, as `(bytes, u32s)`.
    pub fn lend_all(&mut self) -> (&mut Vec<u8>, &mut Vec<u32>) {
        // Destructuring yields two disjoint mutable borrows of the fields.
        let BufferLender {
            buffer_u8,
            buffer_u32,
        } = self;
        buffer_u8.clear();
        buffer_u32.clear();
        (buffer_u8, buffer_u32)
    }
}
/// Iterator yielding the `u32` values that `read_u32_vint` decodes, one after
/// the other, from a byte slice.
pub struct VInt32Reader<'a> {
    // Bytes that have not been decoded yet.
    data: &'a [u8],
}

impl<'a> VInt32Reader<'a> {
    /// Creates a reader positioned at the start of `data`.
    fn new(data: &'a [u8]) -> VInt32Reader<'a> {
        VInt32Reader { data }
    }
}

impl<'a> Iterator for VInt32Reader<'a> {
    type Item = u32;

    /// Decodes the next value, or returns `None` once no bytes are left.
    fn next(&mut self) -> Option<u32> {
        if self.data.is_empty() {
            return None;
        }
        // NOTE(review): `read_u32_vint` receives the slice by mutable reference and
        // presumably advances it past the bytes it consumed -- confirm in `common`.
        Some(read_u32_vint(&mut self.data))
    }
}
/// Recorder is in charge of recording relevant information about /// Recorder is in charge of recording relevant information about
/// the presence of a term in a document. /// the presence of a term in a document.
@@ -15,9 +55,9 @@ const POSITION_END: u32 = std::u32::MAX;
/// * the document id /// * the document id
/// * the term frequency /// * the term frequency
/// * the term positions /// * the term positions
pub trait Recorder: Copy { pub(crate) trait Recorder: Copy + 'static {
/// ///
fn new(heap: &mut MemoryArena) -> Self; fn new() -> Self;
/// Returns the current document /// Returns the current document
fn current_doc(&self) -> u32; fn current_doc(&self) -> u32;
/// Starts recording information about a new document /// Starts recording information about a new document
@@ -29,7 +69,12 @@ pub trait Recorder: Copy {
/// Close the document. It will help record the term frequency. /// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &mut MemoryArena); fn close_doc(&mut self, heap: &mut MemoryArena);
/// Pushes the postings information to the serializer. /// Pushes the postings information to the serializer.
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>; fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()>;
} }
/// Only records the doc ids /// Only records the doc ids
@@ -40,9 +85,9 @@ pub struct NothingRecorder {
} }
impl Recorder for NothingRecorder { impl Recorder for NothingRecorder {
fn new(heap: &mut MemoryArena) -> Self { fn new() -> Self {
NothingRecorder { NothingRecorder {
stack: ExpUnrolledLinkedList::new(heap), stack: ExpUnrolledLinkedList::new(),
current_doc: u32::max_value(), current_doc: u32::max_value(),
} }
} }
@@ -53,16 +98,23 @@ impl Recorder for NothingRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) { fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc; self.current_doc = doc;
self.stack.push(doc, heap); let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
} }
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {} fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
fn close_doc(&mut self, _heap: &mut MemoryArena) {} fn close_doc(&mut self, _heap: &mut MemoryArena) {}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> { fn serialize(
for doc in self.stack.iter(heap) { &self,
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?; buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()> {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(heap, buffer);
for doc in VInt32Reader::new(&buffer[..]) {
serializer.write_doc(doc as u32, 0u32, &[][..])?;
} }
Ok(()) Ok(())
} }
@@ -77,9 +129,9 @@ pub struct TermFrequencyRecorder {
} }
impl Recorder for TermFrequencyRecorder { impl Recorder for TermFrequencyRecorder {
fn new(heap: &mut MemoryArena) -> Self { fn new() -> Self {
TermFrequencyRecorder { TermFrequencyRecorder {
stack: ExpUnrolledLinkedList::new(heap), stack: ExpUnrolledLinkedList::new(),
current_doc: u32::max_value(), current_doc: u32::max_value(),
current_tf: 0u32, current_tf: 0u32,
} }
@@ -91,7 +143,7 @@ impl Recorder for TermFrequencyRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) { fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc; self.current_doc = doc;
self.stack.push(doc, heap); let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
} }
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) { fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
@@ -100,24 +152,24 @@ impl Recorder for TermFrequencyRecorder {
fn close_doc(&mut self, heap: &mut MemoryArena) { fn close_doc(&mut self, heap: &mut MemoryArena) {
debug_assert!(self.current_tf > 0); debug_assert!(self.current_tf > 0);
self.stack.push(self.current_tf, heap); let _ = write_u32_vint(self.current_tf, &mut self.stack.writer(heap));
self.current_tf = 0; self.current_tf = 0;
} }
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> { fn serialize(
// the last document has not been closed... &self,
// its term freq is self.current_tf. buffer_lender: &mut BufferLender,
let mut doc_iter = self serializer: &mut FieldSerializer,
.stack heap: &MemoryArena,
.iter(heap) ) -> io::Result<()> {
.chain(Some(self.current_tf).into_iter()); let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(heap, buffer);
while let Some(doc) = doc_iter.next() { let mut u32_it = VInt32Reader::new(&buffer[..]);
let term_freq = doc_iter while let Some(doc) = u32_it.next() {
.next() let term_freq = u32_it.next().unwrap_or(self.current_tf);
.expect("The IndexWriter recorded a doc without a term freq."); serializer.write_doc(doc as u32, term_freq, &[][..])?;
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
} }
Ok(()) Ok(())
} }
} }
@@ -128,11 +180,10 @@ pub struct TFAndPositionRecorder {
stack: ExpUnrolledLinkedList, stack: ExpUnrolledLinkedList,
current_doc: DocId, current_doc: DocId,
} }
impl Recorder for TFAndPositionRecorder { impl Recorder for TFAndPositionRecorder {
fn new(heap: &mut MemoryArena) -> Self { fn new() -> Self {
TFAndPositionRecorder { TFAndPositionRecorder {
stack: ExpUnrolledLinkedList::new(heap), stack: ExpUnrolledLinkedList::new(),
current_doc: u32::max_value(), current_doc: u32::max_value(),
} }
} }
@@ -143,33 +194,88 @@ impl Recorder for TFAndPositionRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) { fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc; self.current_doc = doc;
self.stack.push(doc, heap); let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
} }
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) { fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
self.stack.push(position, heap); let _ = write_u32_vint(position + 1u32, &mut self.stack.writer(heap));
} }
fn close_doc(&mut self, heap: &mut MemoryArena) { fn close_doc(&mut self, heap: &mut MemoryArena) {
self.stack.push(POSITION_END, heap); let _ = write_u32_vint(POSITION_END, &mut self.stack.writer(heap));
} }
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> { fn serialize(
let mut doc_positions = Vec::with_capacity(100); &self,
let mut positions_iter = self.stack.iter(heap); buffer_lender: &mut BufferLender,
while let Some(doc) = positions_iter.next() { serializer: &mut FieldSerializer,
let mut prev_position = 0; heap: &MemoryArena,
doc_positions.clear(); ) -> io::Result<()> {
for position in &mut positions_iter { let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
if position == POSITION_END { self.stack.read_to_end(heap, buffer_u8);
break; let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
} else { while let Some(doc) = u32_it.next() {
doc_positions.push(position - prev_position); let mut prev_position_plus_one = 1u32;
prev_position = position; buffer_positions.clear();
loop {
match u32_it.next() {
Some(POSITION_END) | None => {
break;
}
Some(position_plus_one) => {
let delta_position = position_plus_one - prev_position_plus_one;
buffer_positions.push(delta_position);
prev_position_plus_one = position_plus_one;
}
} }
} }
serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?; serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions)?;
} }
Ok(()) Ok(())
} }
} }
#[cfg(test)]
mod tests {

    use super::{write_u32_vint, BufferLender, VInt32Reader};

    /// A lent buffer must come back empty, even if the previous borrower left
    /// data in it.
    #[test]
    fn test_buffer_lender() {
        let mut lender = BufferLender::default();
        // Two rounds each: the second round sees the buffer dirtied by the first.
        for _ in 0..2 {
            let bytes = lender.lend_u8();
            assert!(bytes.is_empty());
            bytes.push(1u8);
        }
        for _ in 0..2 {
            let (_, words) = lender.lend_all();
            assert!(words.is_empty());
            words.push(1u32);
        }
    }

    /// Values written with `write_u32_vint` are read back unchanged through
    /// `VInt32Reader`.
    #[test]
    fn test_vint_u32() {
        let vals = [0, 1, 324_234_234, u32::max_value()];
        let mut encoded = vec![];
        for val in vals.iter().cloned() {
            assert!(write_u32_vint(val, &mut encoded).is_ok());
        }
        // Expected sizes: one byte each for 0 and 1, five bytes each for the
        // two large values.
        assert_eq!(encoded.len(), 1 + 1 + 5 + 5);
        let decoded: Vec<u32> = VInt32Reader::new(&encoded[..]).collect();
        assert_eq!(&decoded[..], &vals[..]);
    }
}

View File

@@ -2,22 +2,21 @@ use common::BitSet;
use common::HasLen; use common::HasLen;
use common::{BinarySerializable, VInt}; use common::{BinarySerializable, VInt};
use docset::{DocSet, SkipResult}; use docset::{DocSet, SkipResult};
use fst::Streamer;
use owned_read::OwnedRead; use owned_read::OwnedRead;
use positions::PositionReader; use positions::PositionReader;
use postings::compression::compressed_block_size; use postings::compression::{compressed_block_size, AlignedBuffer};
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE}; use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use postings::serializer::PostingsSerializer; use postings::serializer::PostingsSerializer;
use postings::BlockSearcher;
use postings::FreqReadingOption; use postings::FreqReadingOption;
use postings::Postings; use postings::Postings;
use postings::SkipReader; use postings::SkipReader;
use postings::USE_SKIP_INFO_LIMIT; use postings::USE_SKIP_INFO_LIMIT;
use schema::IndexRecordOption; use schema::IndexRecordOption;
use std::cmp::Ordering; use std::cmp::Ordering;
use tantivy_fst::Streamer;
use DocId; use DocId;
const EMPTY_ARR: [u8; 0] = [];
struct PositionComputer { struct PositionComputer {
// store the amount of position int // store the amount of position int
// before reading positions. // before reading positions.
@@ -62,6 +61,7 @@ pub struct SegmentPostings {
block_cursor: BlockSegmentPostings, block_cursor: BlockSegmentPostings,
cur: usize, cur: usize,
position_computer: Option<PositionComputer>, position_computer: Option<PositionComputer>,
block_searcher: BlockSearcher,
} }
impl SegmentPostings { impl SegmentPostings {
@@ -72,6 +72,7 @@ impl SegmentPostings {
block_cursor: empty_block_cursor, block_cursor: empty_block_cursor,
cur: COMPRESSION_BLOCK_SIZE, cur: COMPRESSION_BLOCK_SIZE,
position_computer: None, position_computer: None,
block_searcher: BlockSearcher::default(),
} }
} }
@@ -119,46 +120,33 @@ impl SegmentPostings {
block_cursor: segment_block_postings, block_cursor: segment_block_postings,
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
position_computer: positions_stream_opt.map(PositionComputer::new), position_computer: positions_stream_opt.map(PositionComputer::new),
block_searcher: BlockSearcher::default(),
} }
} }
} }
/// Narrows down the range of `arr` in which `target` has to be searched.
///
/// Probes `arr` at offsets that double at every step and returns a
/// half-open range `(start, end)`: `start` is the last probed index whose
/// value was not greater than `target` (or `0`), and `end` is either the
/// first probed index holding a value greater than `target`, or `arr.len()`
/// if the probing ran past the end of the slice.
///
/// In debug builds, asserts that `arr` is non-empty and that `target` lies
/// between its first and last elements.
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
    let len = arr.len();
    debug_assert!(target >= arr[0]);
    debug_assert!(target <= arr[len - 1]);
    let mut lower = 0;
    let mut step = 1;
    while lower + step < len {
        let probe = lower + step;
        if arr[probe] > target {
            return (lower, probe);
        }
        lower = probe;
        step *= 2;
    }
    (lower, len)
}
/// Returns the index of the first element of `block_docs` that is
/// greater than or equal to `target`.
///
/// The range is first narrowed with `exponential_search`, then a binary
/// search is run within it.
///
/// # Assumption
///
/// - `block_docs` is not empty.
/// - `target` is greater than or equal to the first element.
/// - `target` is smaller than or equal to the last element.
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
    let (lo, hi) = exponential_search(target, block_docs);
    // Whether the target is found or not, the returned position is the
    // one we want: the match itself, or its insertion point.
    let offset = match block_docs[lo..hi].binary_search(&target) {
        Ok(idx) | Err(idx) => idx,
    };
    lo.wrapping_add(offset)
}
impl DocSet for SegmentPostings { impl DocSet for SegmentPostings {
// Moves the cursor to the next document.
//
// `advance` must be called once before the first document can be read.
// Returns `false` once the postings are exhausted; the in-block cursor is
// then parked at `COMPRESSION_BLOCK_SIZE`.
#[inline]
fn advance(&mut self) -> bool {
    // When positions are tracked and the cursor currently points at a
    // document, register that document's term frequency as a skip.
    if self.cur < COMPRESSION_BLOCK_SIZE && self.position_computer.is_some() {
        let skipped = self.term_freq() as usize;
        if let Some(position_computer) = self.position_computer.as_mut() {
            position_computer.add_skip(skipped);
        }
    }
    self.cur += 1;
    if self.cur < self.block_cursor.block_len() {
        // Still inside the current block.
        return true;
    }
    // The current block is used up: move on to the next one.
    if self.block_cursor.advance() {
        self.cur = 0;
        true
    } else {
        self.cur = COMPRESSION_BLOCK_SIZE;
        false
    }
}
fn skip_next(&mut self, target: DocId) -> SkipResult { fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() { if !self.advance() {
return SkipResult::End; return SkipResult::End;
@@ -181,7 +169,6 @@ impl DocSet for SegmentPostings {
// skip blocks until one that might contain the target // skip blocks until one that might contain the target
// check if we need to go to the next block // check if we need to go to the next block
let need_positions = self.position_computer.is_some();
let mut sum_freqs_skipped: u32 = 0; let mut sum_freqs_skipped: u32 = 0;
if !self if !self
.block_cursor .block_cursor
@@ -195,7 +182,7 @@ impl DocSet for SegmentPostings {
// we are not in the right block. // we are not in the right block.
// //
// First compute all of the freqs skipped from the current block. // First compute all of the freqs skipped from the current block.
if need_positions { if self.position_computer.is_some() {
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum(); sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
match self.block_cursor.skip_to(target) { match self.block_cursor.skip_to(target) {
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => { BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
@@ -214,26 +201,21 @@ impl DocSet for SegmentPostings {
self.cur = 0; self.cur = 0;
} }
// we're in the right block now, start with an exponential search let cur = self.cur;
let block_docs = self.block_cursor.docs();
debug_assert!(target >= self.doc()); // we're in the right block now, start with an exponential search
let (output, len) = self.block_cursor.docs_aligned();
let new_cur = self let new_cur = self
.cur .block_searcher
.wrapping_add(search_within_block(&block_docs[self.cur..], target)); .search_in_block(&output, len, cur, target);
if need_positions { if let Some(position_computer) = self.position_computer.as_mut() {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur] sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
.iter() position_computer.add_skip(sum_freqs_skipped as usize);
.sum::<u32>();
self.position_computer
.as_mut()
.unwrap()
.add_skip(sum_freqs_skipped as usize);
} }
self.cur = new_cur; self.cur = new_cur;
// `doc` is now the first element >= `target` // `doc` is now the first element >= `target`
let doc = block_docs[new_cur]; let doc = output.0[new_cur];
debug_assert!(doc >= target); debug_assert!(doc >= target);
if doc == target { if doc == target {
SkipResult::Reached SkipResult::Reached
@@ -242,40 +224,25 @@ impl DocSet for SegmentPostings {
} }
} }
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
if self.position_computer.is_some() {
let term_freq = self.term_freq() as usize;
self.position_computer.as_mut().unwrap().add_skip(term_freq);
}
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
}
true
}
fn size_hint(&self) -> u32 {
self.len() as u32
}
/// Return the current document's `DocId`. /// Return the current document's `DocId`.
///
/// # Panics
///
/// Will panic if called without having called advance before.
#[inline] #[inline]
fn doc(&self) -> DocId { fn doc(&self) -> DocId {
let docs = self.block_cursor.docs(); let docs = self.block_cursor.docs();
debug_assert!( debug_assert!(
self.cur < docs.len(), self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling .doc()." "Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
); );
docs[self.cur] docs[self.cur]
} }
fn size_hint(&self) -> u32 {
self.len() as u32
}
fn append_to_bitset(&mut self, bitset: &mut BitSet) { fn append_to_bitset(&mut self, bitset: &mut BitSet) {
// finish the current block // finish the current block
if self.advance() { if self.advance() {
@@ -299,17 +266,33 @@ impl HasLen for SegmentPostings {
} }
impl Postings for SegmentPostings { impl Postings for SegmentPostings {
/// Returns the frequency associated to the current document.
/// If the schema is set up so that no frequency have been encoded,
/// this method should always return 1.
///
/// # Panics
///
/// Will panic if called without having called advance before.
fn term_freq(&self) -> u32 { fn term_freq(&self) -> u32 {
debug_assert!(
// Here we do not use the len of `freqs()`
// because it is actually ok to request for the freq of doc
// even if no frequency were encoded for the field.
//
// In that case we hit the block just as if the frequency had been
// decoded. The block is simply prefilled by the value 1.
self.cur < COMPRESSION_BLOCK_SIZE,
"Have you forgotten to call `.advance()` at least once before calling \
`.term_freq()`."
);
self.block_cursor.freq(self.cur) self.block_cursor.freq(self.cur)
} }
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) { fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
if self.position_computer.is_some() { let term_freq = self.term_freq() as usize;
output.resize(self.term_freq() as usize, 0u32); if let Some(position_comp) = self.position_computer.as_mut() {
self.position_computer output.resize(term_freq, 0u32);
.as_mut() position_comp.positions_with_offset(offset, &mut output[..]);
.unwrap()
.positions_with_offset(offset, &mut output[..])
} else { } else {
output.clear(); output.clear();
} }
@@ -374,7 +357,7 @@ impl BlockSegmentPostings {
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data); let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
let skip_reader = match skip_data_opt { let skip_reader = match skip_data_opt {
Some(skip_data) => SkipReader::new(skip_data, record_option), Some(skip_data) => SkipReader::new(skip_data, record_option),
None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option), None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
}; };
let doc_freq = doc_freq as usize; let doc_freq = doc_freq as usize;
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE; let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
@@ -408,7 +391,7 @@ impl BlockSegmentPostings {
if let Some(skip_data) = skip_data_opt { if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data); self.skip_reader.reset(skip_data);
} else { } else {
self.skip_reader.reset(OwnedRead::new(&EMPTY_ARR[..])) self.skip_reader.reset(OwnedRead::new(&[][..]))
} }
self.doc_offset = 0; self.doc_offset = 0;
self.doc_freq = doc_freq as usize; self.doc_freq = doc_freq as usize;
@@ -431,6 +414,10 @@ impl BlockSegmentPostings {
self.doc_decoder.output_array() self.doc_decoder.output_array()
} }
pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
self.doc_decoder.output_aligned()
}
/// Return the document at index `idx` of the block. /// Return the document at index `idx` of the block.
#[inline] #[inline]
pub fn doc(&self, idx: usize) -> u32 { pub fn doc(&self, idx: usize) -> u32 {
@@ -533,7 +520,8 @@ impl BlockSegmentPostings {
} else { } else {
BlockSegmentPostingsSkipResult::Terminated BlockSegmentPostingsSkipResult::Terminated
} }
}).unwrap_or(BlockSegmentPostingsSkipResult::Terminated); })
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
} }
BlockSegmentPostingsSkipResult::Terminated BlockSegmentPostingsSkipResult::Terminated
} }
@@ -620,20 +608,20 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::search_within_block;
use super::BlockSegmentPostings; use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult; use super::BlockSegmentPostingsSkipResult;
use super::SegmentPostings; use super::SegmentPostings;
use common::HasLen; use common::HasLen;
use core::Index; use core::Index;
use docset::DocSet; use docset::DocSet;
use fst::Streamer; use postings::postings::Postings;
use schema::IndexRecordOption; use schema::IndexRecordOption;
use schema::SchemaBuilder; use schema::Schema;
use schema::Term; use schema::Term;
use schema::INT_INDEXED; use schema::INDEXED;
use tantivy_fst::Streamer;
use DocId; use DocId;
use SkipResult;
#[test] #[test]
fn test_empty_segment_postings() { fn test_empty_segment_postings() {
@@ -643,6 +631,18 @@ mod tests {
assert_eq!(postings.len(), 0); assert_eq!(postings.len(), 0);
} }
#[test]
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
fn test_panic_if_doc_called_before_advance() {
SegmentPostings::empty().doc();
}
#[test]
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
fn test_panic_if_freq_called_before_advance() {
SegmentPostings::empty().term_freq();
}
#[test] #[test]
fn test_empty_block_segment_postings() { fn test_empty_block_segment_postings() {
let mut postings = BlockSegmentPostings::empty(); let mut postings = BlockSegmentPostings::empty();
@@ -650,49 +650,9 @@ mod tests {
assert_eq!(postings.doc_freq(), 0); assert_eq!(postings.doc_freq(), 0);
} }
/// Reference implementation used to cross-check `search_within_block`:
/// a linear scan for the first index whose value is `>= target`.
///
/// Panics if no such element exists.
fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
    block.iter().position(|&val| val >= target).unwrap()
}
/// Asserts that the fast search and the linear reference search agree
/// for `target` on `block`.
fn util_test_search_within_block(block: &[u32], target: u32) {
    let fast = search_within_block(block, target);
    let reference = search_within_block_trivial_but_slow(block, target);
    assert_eq!(fast, reference);
}
/// Runs `util_test_search_within_block` for every value of `block` and,
/// for every element but the first, for the value just below it.
fn util_test_search_within_block_all(block: &[u32]) {
    use std::collections::HashSet;
    // Every element is a target...
    let mut targets: HashSet<u32> = block.iter().cloned().collect();
    // ...and so is the predecessor of every element except the first one.
    targets.extend(block.iter().skip(1).map(|&val| val - 1));
    for target in targets {
        util_test_search_within_block(block, target);
    }
}
/// Checks the block search on blocks of even numbers `0, 2, 4, ...`
/// for every block length from 1 to 127.
#[test]
fn test_search_within_block() {
    (1u32..128u32).for_each(|len| {
        let evens: Vec<u32> = (0..len).map(|i| i * 2).collect();
        util_test_search_within_block_all(&evens[..]);
    });
}
#[test] #[test]
fn test_block_segment_postings() { fn test_block_segment_postings() {
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>()); let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32; let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty // checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty()); assert!(block_segments.docs().is_empty());
@@ -706,14 +666,44 @@ mod tests {
} }
} }
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings { #[test]
let mut schema_builder = SchemaBuilder::default(); fn test_skip_right_at_new_block() {
let int_field = schema_builder.add_u64_field("id", INT_INDEXED); let mut doc_ids = (0..128).collect::<Vec<u32>>();
doc_ids.push(129);
doc_ids.push(130);
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(128), SkipResult::OverStep);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(129), SkipResult::Reached);
assert_eq!(docset.doc(), 129);
assert!(docset.advance());
assert_eq!(docset.doc(), 130);
assert!(!docset.advance());
}
{
let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.skip_next(131), SkipResult::End);
}
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut last_doc = 0u32; let mut last_doc = 0u32;
for doc in docs { for &doc in docs {
for _ in last_doc..doc { for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64)); index_writer.add_document(doc!(int_field=>1u64));
} }
@@ -721,8 +711,7 @@ mod tests {
last_doc = doc + 1; last_doc = doc + 1;
} }
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field); let inverted_index = segment_reader.inverted_index(int_field);
let term = Term::from_field_u64(int_field, 0u64); let term = Term::from_field_u64(int_field, 0u64);
@@ -733,7 +722,7 @@ mod tests {
#[test] #[test]
fn test_block_segment_postings_skip() { fn test_block_segment_postings_skip() {
for i in 0..4 { for i in 0..4 {
let mut block_postings = build_block_postings(vec![3]); let mut block_postings = build_block_postings(&[3]);
assert_eq!( assert_eq!(
block_postings.skip_to(i), block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32) BlockSegmentPostingsSkipResult::Success(0u32)
@@ -743,7 +732,7 @@ mod tests {
BlockSegmentPostingsSkipResult::Terminated BlockSegmentPostingsSkipResult::Terminated
); );
} }
let mut block_postings = build_block_postings(vec![3]); let mut block_postings = build_block_postings(&[3]);
assert_eq!( assert_eq!(
block_postings.skip_to(4u32), block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated BlockSegmentPostingsSkipResult::Terminated
@@ -756,7 +745,7 @@ mod tests {
for i in 0..1300 { for i in 0..1300 {
docs.push((i * i / 100) + i); docs.push((i * i / 100) + i);
} }
let mut block_postings = build_block_postings(docs.clone()); let mut block_postings = build_block_postings(&docs[..]);
for i in vec![0, 424, 10000] { for i in vec![0, 424, 10000] {
assert_eq!( assert_eq!(
block_postings.skip_to(i), block_postings.skip_to(i),
@@ -778,11 +767,11 @@ mod tests {
#[test] #[test]
fn test_reset_block_segment_postings() { fn test_reset_block_segment_postings() {
let mut schema_builder = SchemaBuilder::default(); let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INT_INDEXED); let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// create two postings list, one containg even number, // create two postings list, one containg even number,
// the other containing odd numbers. // the other containing odd numbers.
for i in 0..6 { for i in 0..6 {
@@ -790,8 +779,7 @@ mod tests {
index_writer.add_document(doc); index_writer.add_document(doc);
} }
index_writer.commit().unwrap(); index_writer.commit().unwrap();
index.load_searchers().unwrap(); let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0); let segment_reader = searcher.segment_reader(0);
let mut block_segments; let mut block_segments;

View File

@@ -14,7 +14,7 @@ use termdict::{TermDictionaryBuilder, TermOrdinal};
use DocId; use DocId;
use Result; use Result;
/// `PostingsSerializer` is in charge of serializing /// `InvertedIndexSerializer` is in charge of serializing
/// postings on disk, in the /// postings on disk, in the
/// * `.idx` (inverted index) /// * `.idx` (inverted index)
/// * `.pos` (positions file) /// * `.pos` (positions file)
@@ -54,8 +54,8 @@ pub struct InvertedIndexSerializer {
} }
impl InvertedIndexSerializer { impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment /// Open a new `InvertedIndexSerializer` for the given segment
fn new( fn create(
terms_write: CompositeWrite<WritePtr>, terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>, postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>, positions_write: CompositeWrite<WritePtr>,
@@ -74,7 +74,7 @@ impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment /// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> { pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
use SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS}; use SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
InvertedIndexSerializer::new( InvertedIndexSerializer::create(
CompositeWrite::wrap(segment.open_write(TERMS)?), CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?), CompositeWrite::wrap(segment.open_write(POSTINGS)?),
CompositeWrite::wrap(segment.open_write(POSITIONS)?), CompositeWrite::wrap(segment.open_write(POSITIONS)?),
@@ -99,7 +99,7 @@ impl InvertedIndexSerializer {
let positions_write = self.positions_write.for_field(field); let positions_write = self.positions_write.for_field(field);
let positionsidx_write = self.positionsidx_write.for_field(field); let positionsidx_write = self.positionsidx_write.for_field(field);
let field_type: FieldType = (*field_entry.field_type()).clone(); let field_type: FieldType = (*field_entry.field_type()).clone();
FieldSerializer::new( FieldSerializer::create(
&field_type, &field_type,
term_dictionary_write, term_dictionary_write,
postings_write, postings_write,
@@ -130,7 +130,7 @@ pub struct FieldSerializer<'a> {
} }
impl<'a> FieldSerializer<'a> { impl<'a> FieldSerializer<'a> {
fn new( fn create(
field_type: &FieldType, field_type: &FieldType,
term_dictionary_write: &'a mut CountingWriter<WritePtr>, term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>, postings_write: &'a mut CountingWriter<WritePtr>,
@@ -152,7 +152,7 @@ impl<'a> FieldSerializer<'a> {
_ => (false, false), _ => (false, false),
}; };
let term_dictionary_builder = let term_dictionary_builder =
TermDictionaryBuilder::new(term_dictionary_write, &field_type)?; TermDictionaryBuilder::create(term_dictionary_write, &field_type)?;
let postings_serializer = let postings_serializer =
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled); PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
let positions_serializer_opt = if position_enabled { let positions_serializer_opt = if position_enabled {
@@ -175,7 +175,7 @@ impl<'a> FieldSerializer<'a> {
let positions_idx = self let positions_idx = self
.positions_serializer_opt .positions_serializer_opt
.as_ref() .as_ref()
.map(|positions_serializer| positions_serializer.positions_idx()) .map(PositionSerializer::positions_idx)
.unwrap_or(0u64); .unwrap_or(0u64);
TermInfo { TermInfo {
doc_freq: 0, doc_freq: 0,

View File

@@ -1,28 +1,37 @@
use super::{Addr, MemoryArena}; use super::{Addr, MemoryArena};
use common::is_power_of_2; use postings::stacker::memory_arena::load;
use postings::stacker::memory_arena::store;
use std::io;
use std::mem; use std::mem;
const MAX_BLOCK_LEN: u32 = 1u32 << 15; const MAX_BLOCK_LEN: u32 = 1u32 << 15;
const FIRST_BLOCK: usize = 16;
const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::<Addr>();
const FIRST_BLOCK: u32 = 4u32; enum CapacityResult {
Available(u32),
NeedAlloc(u32),
}
#[inline] fn len_to_capacity(len: u32) -> CapacityResult {
pub fn jump_needed(len: u32) -> Option<usize> {
match len { match len {
0...3 => None, 0...15 => CapacityResult::Available(FIRST_BLOCK as u32 - len),
4...MAX_BLOCK_LEN => { 16...MAX_BLOCK_LEN => {
if is_power_of_2(len as usize) { let cap = 1 << (32u32 - (len - 1u32).leading_zeros());
Some(len as usize) let available = cap - len;
if available == 0 {
CapacityResult::NeedAlloc(len)
} else { } else {
None CapacityResult::Available(available)
} }
} }
n => { n => {
if n % MAX_BLOCK_LEN == 0 { let available = n % MAX_BLOCK_LEN;
Some(MAX_BLOCK_LEN as usize) if available == 0 {
CapacityResult::NeedAlloc(MAX_BLOCK_LEN)
} else { } else {
None CapacityResult::Available(MAX_BLOCK_LEN - available)
} }
} }
} }
@@ -52,82 +61,119 @@ pub fn jump_needed(len: u32) -> Option<usize> {
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy)]
pub struct ExpUnrolledLinkedList { pub struct ExpUnrolledLinkedList {
len: u32, len: u32,
head: Addr,
tail: Addr, tail: Addr,
inlined_data: [u8; INLINED_BLOCK_LEN as usize],
}
/// Write handle over an `ExpUnrolledLinkedList`.
///
/// It borrows both the list and the `MemoryArena` for the duration of the
/// write, and is obtained through `ExpUnrolledLinkedList::writer`.
pub struct ExpUnrolledLinkedListWriter<'a> {
// The list being appended to (its `len` and `tail` are updated on write).
eull: &'a mut ExpUnrolledLinkedList,
// The arena in which blocks beyond the inlined one are allocated.
heap: &'a mut MemoryArena,
}
/// Returns the writable slice located at the current end of `eull`,
/// allocating a new block in `heap` when the current one is full.
///
/// The first `FIRST_BLOCK` bytes live inline in `eull.inlined_data`; the
/// bytes after them in that array hold the address of the first arena block.
/// Every arena block is allocated with `size_of::<Addr>()` extra bytes, in
/// which the address of the following block gets written.
///
/// The caller is responsible for bumping `eull.len` and `eull.tail` by the
/// number of bytes it actually writes into the returned slice.
fn ensure_capacity<'a>(
    eull: &'a mut ExpUnrolledLinkedList,
    heap: &'a mut MemoryArena,
) -> &'a mut [u8] {
    let inline_len = FIRST_BLOCK as u32;
    if eull.len < inline_len {
        // There is still room in the inlined block.
        return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK];
    }
    if eull.len == inline_len {
        // The inlined block is exactly full: allocate the first arena block
        // and record its address right after the inlined data.
        let first_heap_block: Addr = heap.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>());
        store(&mut eull.inlined_data[FIRST_BLOCK..], first_heap_block);
        eull.tail = first_heap_block;
        return heap.slice_mut(first_heap_block, FIRST_BLOCK);
    }
    let writable = match len_to_capacity(eull.len) {
        CapacityResult::Available(remaining) => remaining,
        CapacityResult::NeedAlloc(block_len) => {
            let next_block: Addr =
                heap.allocate_space(block_len as usize + mem::size_of::<Addr>());
            // Link the block that just filled up to the new one.
            heap.write_at(eull.tail, next_block);
            eull.tail = next_block;
            block_len
        }
    };
    heap.slice_mut(eull.tail, writable as usize)
}
impl<'a> ExpUnrolledLinkedListWriter<'a> {
    /// Appends all of `buf` to the list.
    ///
    /// The data is copied chunk by chunk: each iteration asks
    /// `ensure_capacity` for the writable slice at the end of the list,
    /// fills as much of it as possible, and advances the list's `len` and
    /// `tail` accordingly.
    pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
        // `ensure_capacity` may allocate when no capacity is left, so it must
        // never be called with nothing to write; the loop condition ensures
        // this, including for an empty `buf`.
        while !buf.is_empty() {
            let written = {
                let dest = ensure_capacity(self.eull, self.heap);
                let chunk_len = dest.len().min(buf.len());
                dest[..chunk_len].copy_from_slice(&buf[..chunk_len]);
                chunk_len
            };
            self.eull.len += written as u32;
            self.eull.tail = self.eull.tail.offset(written as u32);
            buf = &buf[written..];
        }
    }
}
impl<'a> io::Write for ExpUnrolledLinkedListWriter<'a> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
// There is no use case to only write the capacity.
// This is not IO after all, so we write the whole
// buffer even if the contract of `.write` is looser.
self.extend_from_slice(buf);
Ok(buf.len())
}
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
self.extend_from_slice(buf);
Ok(())
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
} }
impl ExpUnrolledLinkedList { impl ExpUnrolledLinkedList {
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList { pub fn new() -> ExpUnrolledLinkedList {
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
ExpUnrolledLinkedList { ExpUnrolledLinkedList {
len: 0u32, len: 0u32,
head: addr, tail: Addr::null_pointer(),
tail: addr, inlined_data: [0u8; INLINED_BLOCK_LEN as usize],
} }
} }
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> { #[inline(always)]
ExpUnrolledLinkedListIterator { pub fn writer<'a>(&'a mut self, heap: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
heap, ExpUnrolledLinkedListWriter { eull: self, heap }
addr: self.head,
len: self.len,
consumed: 0,
}
} }
/// Appends a new element to the current stack. pub fn read_to_end(&self, heap: &MemoryArena, output: &mut Vec<u8>) {
/// let len = self.len as usize;
/// If the current block end is reached, a new block is allocated. if len <= FIRST_BLOCK {
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) { output.extend_from_slice(&self.inlined_data[..len]);
self.len += 1; return;
if let Some(new_block_len) = jump_needed(self.len) {
// We need to allocate another block.
// We also allocate an extra `u32` to store the pointer
// to the future next block.
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
let new_block_addr: Addr = heap.allocate_space(new_block_size);
unsafe {
// logic
heap.write(self.tail, new_block_addr)
};
self.tail = new_block_addr;
} }
unsafe { output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]);
// logic let mut cur = FIRST_BLOCK;
heap.write(self.tail, val); let mut addr = load(&self.inlined_data[FIRST_BLOCK..]);
self.tail = self.tail.offset(mem::size_of::<u32>() as u32); loop {
} let cap = match len_to_capacity(cur as u32) {
} CapacityResult::Available(capacity) => capacity,
} CapacityResult::NeedAlloc(capacity) => capacity,
} as usize;
pub struct ExpUnrolledLinkedListIterator<'a> { let data = heap.slice(addr, cap);
heap: &'a MemoryArena, if cur + cap >= len {
addr: Addr, output.extend_from_slice(&data[..(len - cur)]);
len: u32, return;
consumed: u32, }
} output.extend_from_slice(data);
cur += cap;
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> { addr = heap.read(addr.offset(cap as u32));
type Item = u32;
fn next(&mut self) -> Option<u32> {
if self.consumed == self.len {
None
} else {
self.consumed += 1;
let addr: Addr = if jump_needed(self.consumed).is_some() {
unsafe {
// logic
self.heap.read(self.addr)
}
} else {
self.addr
};
self.addr = addr.offset(mem::size_of::<u32>() as u32);
Some(unsafe {
// logic
self.heap.read(addr)
})
} }
} }
} }
@@ -136,46 +182,134 @@ impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
mod tests { mod tests {
use super::super::MemoryArena; use super::super::MemoryArena;
use super::jump_needed; use super::len_to_capacity;
use super::*; use super::*;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
#[test]
#[test] #[test]
fn test_stack() { fn test_stack() {
let mut heap = MemoryArena::new(); let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new(&mut heap); let mut stack = ExpUnrolledLinkedList::new();
stack.push(1u32, &mut heap); stack.writer(&mut heap).extend_from_slice(&[1u8]);
stack.push(2u32, &mut heap); stack.writer(&mut heap).extend_from_slice(&[2u8]);
stack.push(4u32, &mut heap); stack.writer(&mut heap).extend_from_slice(&[3u8, 4u8]);
stack.push(8u32, &mut heap); stack.writer(&mut heap).extend_from_slice(&[5u8]);
{ {
let mut it = stack.iter(&heap); let mut buffer = Vec::new();
assert_eq!(it.next().unwrap(), 1u32); stack.read_to_end(&heap, &mut buffer);
assert_eq!(it.next().unwrap(), 2u32); assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]);
assert_eq!(it.next().unwrap(), 4u32);
assert_eq!(it.next().unwrap(), 8u32);
assert!(it.next().is_none());
} }
} }
#[test] #[test]
fn test_jump_if_needed() { fn test_stack_long() {
let mut block_len = 4u32; let mut heap = MemoryArena::new();
let mut i = 0; let mut stack = ExpUnrolledLinkedList::new();
while i < 10_000_000 { let source: Vec<u32> = (0..100).collect();
assert!(jump_needed(i + block_len - 1).is_none()); for &el in &source {
assert!(jump_needed(i + block_len + 1).is_none()); assert!(stack
assert!(jump_needed(i + block_len).is_some()); .writer(&mut heap)
let new_block_len = jump_needed(i + block_len).unwrap(); .write_u32::<LittleEndian>(el)
i += block_len; .is_ok());
block_len = new_block_len as u32;
} }
let mut buffer = Vec::new();
stack.read_to_end(&heap, &mut buffer);
let mut result = vec![];
let mut remaining = &buffer[..];
while !remaining.is_empty() {
result.push(LittleEndian::read_u32(&remaining[..4]));
remaining = &remaining[4..];
}
assert_eq!(&result[..], &source[..]);
}
#[test]
fn test_stack_interlaced() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new();
let mut stack2 = ExpUnrolledLinkedList::new();
let mut vec1: Vec<u8> = vec![];
let mut vec2: Vec<u8> = vec![];
for i in 0..9 {
assert!(stack.writer(&mut heap).write_u32::<LittleEndian>(i).is_ok());
assert!(vec1.write_u32::<LittleEndian>(i).is_ok());
if i % 2 == 0 {
assert!(stack2
.writer(&mut heap)
.write_u32::<LittleEndian>(i)
.is_ok());
assert!(vec2.write_u32::<LittleEndian>(i).is_ok());
}
}
let mut res1 = vec![];
let mut res2 = vec![];
stack.read_to_end(&heap, &mut res1);
stack2.read_to_end(&heap, &mut res2);
assert_eq!(&vec1[..], &res1[..]);
assert_eq!(&vec2[..], &res2[..]);
}
#[test]
fn test_jump_if_needed() {
let mut available = 16u32;
for i in 0..10_000_000 {
match len_to_capacity(i) {
CapacityResult::NeedAlloc(cap) => {
assert_eq!(available, 0, "Failed len={}: Expected 0 got {}", i, cap);
available = cap;
}
CapacityResult::Available(cap) => {
assert_eq!(
available, cap,
"Failed len={}: Expected {} Got {}",
i, available, cap
);
}
}
available -= 1;
}
}
// Collects the first ten lengths for which `len_to_capacity` answers
// `NeedAlloc`, together with the capacity it announces, and pins them to the
// expected doubling sequence 16, 32, ..., 8192 (capacity equal to the length).
#[test]
fn test_jump_if_needed_progression() {
let mut v = vec![];
// Unbounded range: the loop stops itself once ten allocations were recorded.
for i in 0.. {
if v.len() >= 10 {
break;
}
match len_to_capacity(i) {
CapacityResult::NeedAlloc(cap) => {
v.push((i, cap));
}
// Lengths that still fit in the current block are of no interest here.
_ => {}
}
}
// Each pair is (length at which an allocation is required, capacity announced).
assert_eq!(
&v[..],
&[
(16, 16),
(32, 32),
(64, 64),
(128, 128),
(256, 256),
(512, 512),
(1024, 1024),
(2048, 2048),
(4096, 4096),
(8192, 8192)
]
);
} }
} }
#[cfg(all(test, feature = "unstable"))] #[cfg(all(test, feature = "unstable"))]
mod bench { mod bench {
use super::super::MemoryArena;
use super::ExpUnrolledLinkedList; use super::ExpUnrolledLinkedList;
use tantivy_memory_arena::MemoryArena; use byteorder::{NativeEndian, WriteBytesExt};
use test::Bencher; use test::Bencher;
const NUM_STACK: usize = 10_000; const NUM_STACK: usize = 10_000;
@@ -199,20 +333,19 @@ mod bench {
#[bench] #[bench]
fn bench_push_stack(bench: &mut Bencher) { fn bench_push_stack(bench: &mut Bencher) {
let heap = MemoryArena::new();
bench.iter(|| { bench.iter(|| {
let mut heap = MemoryArena::new();
let mut stacks = Vec::with_capacity(100); let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK { for _ in 0..NUM_STACK {
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>(); let mut stack = ExpUnrolledLinkedList::new();
stacks.push(stack); stacks.push(stack);
} }
for s in 0..NUM_STACK { for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE { for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK; let t = s * 392017 % NUM_STACK;
stacks[t].push(i, &heap); let _ = stacks[t].writer(&mut heap).write_u32::<NativeEndian>(i);
} }
} }
heap.clear();
}); });
} }
} }

View File

@@ -37,7 +37,7 @@ const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
/// page of memory. /// page of memory.
/// ///
/// The last 20 bits are an address within this page of memory. /// The last 20 bits are an address within this page of memory.
#[derive(Clone, Copy, Debug)] #[derive(Copy, Clone, Debug)]
pub struct Addr(u32); pub struct Addr(u32);
impl Addr { impl Addr {
@@ -69,32 +69,16 @@ impl Addr {
} }
} }
/// Trait required for an object to be `storable`. pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
/// assert_eq!(dest.len(), std::mem::size_of::<Item>());
/// # Warning unsafe {
/// ptr::write_unaligned(dest.as_mut_ptr() as *mut Item, val);
/// Most of the time you should not implement this trait, }
/// and only use the `MemoryArena` with object implementing `Copy`.
///
/// `ArenaStorable` is used in `tantivy` to force
/// a `Copy` object and a `slice` of data to be stored contiguously.
pub trait ArenaStorable {
/// Number of bytes the value occupies once written; `MemoryArena::store`
/// allocates exactly this much before writing.
fn num_bytes(&self) -> usize;
/// Writes `self` into `arena` starting at `addr`, consuming the value.
/// Unsafe: the implementation may write through raw pointers, so the
/// destination must have been allocated by the caller.
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
} }
impl<V> ArenaStorable for V pub fn load<Item: Copy + 'static>(data: &[u8]) -> Item {
where assert_eq!(data.len(), std::mem::size_of::<Item>());
V: Copy, unsafe { ptr::read_unaligned(data.as_ptr() as *const Item) }
{
fn num_bytes(&self) -> usize {
mem::size_of::<V>()
}
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
ptr::write_unaligned(dst_ptr, self);
}
} }
/// The `MemoryArena` /// The `MemoryArena`
@@ -126,47 +110,9 @@ impl MemoryArena {
self.pages.len() * PAGE_SIZE self.pages.len() * PAGE_SIZE
} }
/// Writes a slice at the given address, assuming the pub fn write_at<Item: Copy + 'static>(&mut self, addr: Addr, val: Item) {
/// memory was allocated beforehands. let dest = self.slice_mut(addr, std::mem::size_of::<Item>());
/// store(dest, val);
/// # Panics
///
/// May panic or corrupt the heap if he space was not
/// properly allocated beforehands.
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
    let src = data.as_ref();
    // Resolve the page first, then the destination window inside it.
    let page = &mut self.pages[addr.page_id()];
    let dest = page.get_mut_slice(addr.page_local_addr(), src.len());
    dest.copy_from_slice(src);
}
/// Borrows the `len` bytes that start at `addr`.
///
/// # Panics
///
/// Panics if `addr` refers to a page that does not exist, or if the
/// requested range runs past the end of that page.
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
    let page = &self.pages[addr.page_id()];
    page.get_slice(addr.page_local_addr(), len)
}
/// Raw mutable pointer to the byte located at `addr`.
///
/// # Safety
///
/// The page-local part of `addr` is added to the page's base pointer
/// without any bounds check, so it must lie inside the page.
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
    let page = &mut self.pages[addr.page_id()];
    page.get_mut_ptr(addr.page_local_addr())
}
/// Allocates room for `val`, writes it there and returns its address.
///
/// The allocation is sized by `val.num_bytes()`.
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
    let addr = self.allocate_space(val.num_bytes());
    // The destination was reserved on the line above with the size the
    // value itself reported.
    unsafe { self.write(addr, val) };
    addr
}
/// Writes `val` at `addr` by handing over to `ArenaStorable::write_into`.
///
/// Unsafe because nothing here checks that `addr` was allocated with enough
/// room for `val`; `store` is the variant that allocates first.
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
val.write_into(self, addr)
} }
/// Read an item in the heap at the given `address`. /// Read an item in the heap at the given `address`.
@@ -174,9 +120,21 @@ impl MemoryArena {
/// # Panics /// # Panics
/// ///
/// If the address is erroneous /// If the address is erroneous
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item { pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr()); load(self.slice(addr, mem::size_of::<Item>()))
ptr::read_unaligned(ptr as *const Item) }
/// Borrows the `len` bytes that start at `addr`.
///
/// # Panics
///
/// Panics if the page named by `addr` does not exist or if the range
/// does not fit inside that page.
pub fn slice(&self, addr: Addr, len: usize) -> &[u8] {
    let page = &self.pages[addr.page_id()];
    page.slice(addr.page_local_addr(), len)
}
/// Borrows everything from `addr` up to the end of the page it lives in.
///
/// # Panics
///
/// Panics if the page named by `addr` does not exist or if the page-local
/// offset lies beyond the page.
pub fn slice_from(&self, addr: Addr) -> &[u8] {
    let page = &self.pages[addr.page_id()];
    page.slice_from(addr.page_local_addr())
}
/// Mutably borrows the `len` bytes that start at `addr`.
///
/// Panics if the page named by `addr` does not exist or if the range does
/// not fit inside that page (both are plain index operations).
#[inline(always)]
pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] {
self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len)
} }
/// Allocates `len` bytes and returns the allocated address. /// Allocates `len` bytes and returns the allocated address.
@@ -197,14 +155,10 @@ struct Page {
impl Page { impl Page {
fn new(page_id: usize) -> Page { fn new(page_id: usize) -> Page {
let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
unsafe {
data.set_len(PAGE_SIZE);
} // avoid initializing page
Page { Page {
page_id, page_id,
len: 0, len: 0,
data: data.into_boxed_slice(), data: vec![0u8; PAGE_SIZE].into_boxed_slice(),
} }
} }
@@ -213,12 +167,16 @@ impl Page {
len + self.len <= PAGE_SIZE len + self.len <= PAGE_SIZE
} }
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] { fn slice(&self, local_addr: usize, len: usize) -> &[u8] {
&mut self.data[local_addr..][..len] &self.slice_from(local_addr)[..len]
} }
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] { fn slice_from(&self, local_addr: usize) -> &[u8] {
&self.data[local_addr..][..len] &self.data[local_addr..]
}
/// Mutable view of `len` bytes of this page starting at `local_addr`.
/// Both slicing steps are bounds-checked and panic when out of range.
fn slice_mut(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
&mut self.data[local_addr..][..len]
} }
fn allocate_space(&mut self, len: usize) -> Option<Addr> { fn allocate_space(&mut self, len: usize) -> Option<Addr> {
@@ -230,16 +188,6 @@ impl Page {
None None
} }
} }
/// Raw read-only pointer to the byte at offset `addr` of this page.
///
/// # Safety
///
/// No bounds check is performed: `addr` must not exceed the length of the
/// page's buffer, as required by pointer `add`.
#[inline(always)]
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
    let base: *const u8 = self.data.as_ptr();
    base.add(addr)
}
/// Raw mutable pointer to the byte at offset `addr` of this page.
///
/// # Safety
///
/// No bounds check is performed: `addr` must not exceed the length of the
/// page's buffer, as required by pointer `add`.
#[inline(always)]
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
    let base: *mut u8 = self.data.as_mut_ptr();
    base.add(addr)
}
} }
#[cfg(test)] #[cfg(test)]
@@ -254,13 +202,13 @@ mod tests {
let b = b"happy tax payer"; let b = b"happy tax payer";
let addr_a = arena.allocate_space(a.len()); let addr_a = arena.allocate_space(a.len());
arena.write_bytes(addr_a, a); arena.slice_mut(addr_a, a.len()).copy_from_slice(a);
let addr_b = arena.allocate_space(b.len()); let addr_b = arena.allocate_space(b.len());
arena.write_bytes(addr_b, b); arena.slice_mut(addr_b, b.len()).copy_from_slice(b);
assert_eq!(arena.read_slice(addr_a, a.len()), a); assert_eq!(arena.slice(addr_a, a.len()), a);
assert_eq!(arena.read_slice(addr_b, b.len()), b); assert_eq!(arena.slice(addr_b, b.len()), b);
} }
#[derive(Clone, Copy, Debug, Eq, PartialEq)] #[derive(Clone, Copy, Debug, Eq, PartialEq)]
@@ -283,9 +231,15 @@ mod tests {
b: 221, b: 221,
c: 12, c: 12,
}; };
let addr_a = arena.store(a);
let addr_b = arena.store(b); let num_bytes = std::mem::size_of::<MyTest>();
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a); let addr_a = arena.allocate_space(num_bytes);
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b); arena.write_at(addr_a, a);
let addr_b = arena.allocate_space(num_bytes);
arena.write_at(addr_b, b);
assert_eq!(arena.read::<MyTest>(addr_a), a);
assert_eq!(arena.read::<MyTest>(addr_b), b);
} }
} }

View File

@@ -1,9 +1,7 @@
mod expull; mod expull;
mod memory_arena; mod memory_arena;
mod murmurhash2;
mod term_hashmap; mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList; pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena}; pub use self::memory_arena::{Addr, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::term_hashmap::{compute_table_size, TermHashMap}; pub use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -1,87 +0,0 @@
use std::ptr;
/// Seed mixed into every hash.
const SEED: u32 = 3_242_157_231u32;
/// MurmurHash2 multiplication constant.
const M: u32 = 0x5bd1_e995;

/// 32-bit MurmurHash2 of `key`, seeded with `SEED`.
///
/// The key is consumed four bytes at a time, each block being interpreted as
/// a native-endian `u32`; the one to three trailing bytes, if any, are mixed
/// in afterwards. The key does not need to be aligned.
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
    // The algorithm mixes the length in as a 32-bit value; truncation of
    // longer lengths is intentional here.
    let mut h: u32 = SEED ^ (key.len() as u32);

    // `chunks_exact` yields exactly the full 4-byte blocks and keeps the
    // leftover bytes aside, so block loop and tail always agree on the split.
    let mut blocks = key.chunks_exact(4);
    for block in blocks.by_ref() {
        let mut k = u32::from_ne_bytes([block[0], block[1], block[2], block[3]]);
        k = k.wrapping_mul(M);
        k ^= k >> 24;
        k = k.wrapping_mul(M);
        h = h.wrapping_mul(M);
        h ^= k;
    }

    // Handle the last few bytes of the input array: byte `i` lands in bits
    // `8 * i ..`, then a single multiplication closes the tail.
    let tail = blocks.remainder();
    if !tail.is_empty() {
        for (i, &byte) in tail.iter().enumerate() {
            h ^= u32::from(byte) << (8 * i);
        }
        h = h.wrapping_mul(M);
    }

    // Final avalanche.
    h ^= h >> 13;
    h = h.wrapping_mul(M);
    h ^ (h >> 15)
}
#[cfg(test)]
mod test {
    use super::murmurhash2;
    use std::collections::HashSet;

    /// Equal byte ranges taken from two different strings hash identically.
    #[test]
    fn test_murmur() {
        let s1 = "abcdef";
        let s2 = "abcdeg";
        for start in 0..5 {
            let left = s1[start..5].as_bytes();
            let right = s2[start..5].as_bytes();
            assert_eq!(murmurhash2(left), murmurhash2(right));
        }
    }

    /// Pins the output for a handful of keys to known reference values.
    #[test]
    fn test_murmur_against_reference_impl() {
        let cases: [(&str, u32); 7] = [
            ("", 3632506080),
            ("a", 455683869),
            ("ab", 2448092234),
            ("abc", 2066295634),
            ("abcd", 2588571162),
            ("abcde", 2988696942),
            ("abcdefghijklmnop", 2350868870),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(murmurhash2(input.as_bytes()), expected);
        }
    }

    /// Ten thousand distinct short keys must yield ten thousand distinct hashes.
    #[test]
    fn test_murmur_collisions() {
        let hashes: HashSet<u32> = (0..10_000)
            .map(|i| murmurhash2(format!("hash{}", i).as_bytes()))
            .collect();
        assert_eq!(hashes.len(), 10_000);
    }
}

View File

@@ -1,37 +1,15 @@
use super::murmurhash2; extern crate murmurhash32;
use super::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash32::murmurhash2;
use super::{Addr, MemoryArena};
use byteorder::{ByteOrder, NativeEndian};
use postings::stacker::memory_arena::store;
use postings::UnorderedTermId;
use std::iter; use std::iter;
use std::mem; use std::mem;
use std::slice; use std::slice;
pub type BucketId = usize;
/// A borrowed key paired with the value to be stored alongside it.
struct KeyBytesValue<'a, V> {
    /// Raw bytes of the key.
    key: &'a [u8],
    /// Value associated with `key`.
    value: V,
}

impl<'a, V> KeyBytesValue<'a, V> {
    /// Bundles `key` and `value` without copying the key.
    fn new(key: &'a [u8], value: V) -> Self {
        Self { key, value }
    }
}
/// Stores a key/value pair as one contiguous record:
/// a `u16` key length, the key bytes, then the value.
impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
where
    V: ArenaStorable,
{
    /// Size of the whole record: length prefix + key + value.
    fn num_bytes(&self) -> usize {
        let prefix_len = 0u16.num_bytes();
        let key_len = self.key.len();
        prefix_len + key_len + self.value.num_bytes()
    }

    /// Writes the record at `addr`, in layout order.
    unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
        let key_len = self.key.len();
        // Length prefix (the key length is narrowed to `u16`).
        arena.write(addr, key_len as u16);
        // Key bytes follow the two-byte prefix.
        let key_addr = addr.offset(2);
        arena.write_bytes(key_addr, self.key);
        // The value comes right after the key.
        let value_addr = addr.offset(2 + key_len as u32);
        arena.write(value_addr, self.value);
    }
}
/// Returns the actual memory size in bytes /// Returns the actual memory size in bytes
/// required to create a table of size $2^num_bits$. /// required to create a table of size $2^num_bits$.
pub fn compute_table_size(num_bits: usize) -> usize { pub fn compute_table_size(num_bits: usize) -> usize {
@@ -49,6 +27,7 @@ pub fn compute_table_size(num_bits: usize) -> usize {
struct KeyValue { struct KeyValue {
key_value_addr: Addr, key_value_addr: Addr,
hash: u32, hash: u32,
unordered_term_id: UnorderedTermId,
} }
impl Default for KeyValue { impl Default for KeyValue {
@@ -56,6 +35,7 @@ impl Default for KeyValue {
KeyValue { KeyValue {
key_value_addr: Addr::null_pointer(), key_value_addr: Addr::null_pointer(),
hash: 0u32, hash: 0u32,
unordered_term_id: UnorderedTermId::default(),
} }
} }
} }
@@ -80,6 +60,7 @@ pub struct TermHashMap {
pub heap: MemoryArena, pub heap: MemoryArena,
mask: usize, mask: usize,
occupied: Vec<usize>, occupied: Vec<usize>,
len: usize,
} }
struct QuadraticProbing { struct QuadraticProbing {
@@ -106,14 +87,13 @@ pub struct Iter<'a> {
} }
impl<'a> Iterator for Iter<'a> { impl<'a> Iterator for Iter<'a> {
type Item = (&'a [u8], Addr, BucketId); type Item = (&'a [u8], Addr, UnorderedTermId);
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| { self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket]; let kv = self.hashmap.table[bucket];
let (key, offset): (&'a [u8], Addr) = let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
unsafe { self.hashmap.get_key_value(kv.key_value_addr) }; (key, offset, kv.unordered_term_id)
(key, offset, bucket as BucketId)
}) })
} }
} }
@@ -128,6 +108,7 @@ impl TermHashMap {
heap, heap,
mask: table_size - 1, mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2), occupied: Vec::with_capacity(table_size / 2),
len: 0,
} }
} }
@@ -143,20 +124,34 @@ impl TermHashMap {
self.table.len() < self.occupied.len() * 3 self.table.len() < self.occupied.len() * 3
} }
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) { #[inline(always)]
let key_bytes_len = self.heap.read::<u16>(addr) as usize; fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let key_addr = addr.offset(2u32); let data = self.heap.slice_from(addr);
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len); let key_bytes_len = NativeEndian::read_u16(data) as usize;
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32); let key_bytes: &[u8] = &data[2..][..key_bytes_len];
(key_bytes, val_addr) (key_bytes, addr.offset(2u32 + key_bytes_len as u32))
} }
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) { #[inline(always)]
/// Decodes the record at `addr` and returns the address of its value when
/// the stored key equals `target_key`; returns `None` for any other key.
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
    match self.get_key_value(addr) {
        (stored_key, value_addr) if stored_key == target_key => Some(value_addr),
        _ => None,
    }
}
fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) -> UnorderedTermId {
self.occupied.push(bucket); self.occupied.push(bucket);
let unordered_term_id = self.len as UnorderedTermId;
self.len += 1;
self.table[bucket] = KeyValue { self.table[bucket] = KeyValue {
key_value_addr, key_value_addr,
hash, hash,
unordered_term_id,
}; };
unordered_term_id
} }
pub fn iter(&self) -> Iter { pub fn iter(&self) -> Iter {
@@ -196,64 +191,53 @@ impl TermHashMap {
/// will be in charge of returning a default value. /// will be in charge of returning a default value.
/// If the key already as an associated value, then it will be passed /// If the key already as an associated value, then it will be passed
/// `Some(previous_value)`. /// `Some(previous_value)`.
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId pub fn mutate_or_create<S, V, TMutator>(
&mut self,
key: S,
mut updater: TMutator,
) -> UnorderedTermId
where where
S: AsRef<[u8]>, S: AsRef<[u8]>,
V: Copy, V: Copy + 'static,
TMutator: FnMut(Option<V>) -> V, TMutator: FnMut(Option<V>) -> V,
{ {
if self.is_saturated() { if self.is_saturated() {
self.resize(); self.resize();
} }
let key_bytes: &[u8] = key.as_ref(); let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2::murmurhash2(key.as_ref()); let hash = murmurhash2(key.as_ref());
let mut probe = self.probe(hash); let mut probe = self.probe(hash);
loop { loop {
let bucket = probe.next_probe(); let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket]; let kv: KeyValue = self.table[bucket];
if kv.is_empty() { if kv.is_empty() {
// The key does not exists yet.
let val = updater(None); let val = updater(None);
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val)); let num_bytes =
self.set_bucket(hash, key_addr, bucket); std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
return bucket as BucketId; let key_addr = self.heap.allocate_space(num_bytes);
{
let data = self.heap.slice_mut(key_addr, num_bytes);
NativeEndian::write_u16(data, key_bytes.len() as u16);
let stop = 2 + key_bytes.len();
data[2..stop].copy_from_slice(key_bytes);
store(&mut data[stop..], val);
}
return self.set_bucket(hash, key_addr, bucket);
} else if kv.hash == hash { } else if kv.hash == hash {
let (key_matches, val_addr) = { if let Some(val_addr) =
let (stored_key, val_addr): (&[u8], Addr) = self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
unsafe { self.get_key_value(kv.key_value_addr) }; {
(stored_key == key_bytes, val_addr) let v = self.heap.read(val_addr);
}; let new_v = updater(Some(v));
if key_matches { self.heap.write_at(val_addr, new_v);
unsafe { return kv.unordered_term_id;
// logic
let v = self.heap.read(val_addr);
let new_v = updater(Some(v));
self.heap.write(val_addr, new_v);
};
return bucket as BucketId;
} }
} }
} }
} }
} }
#[cfg(all(test, feature = "unstable"))]
mod bench {
    use super::murmurhash2::murmurhash2;
    use test::Bencher;

    /// Hashes three 16-byte keys per iteration and XORs the results together
    /// so the hashes are actually used.
    #[bench]
    fn bench_murmurhash2(b: &mut Bencher) {
        let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
        b.iter(|| {
            keys.iter()
                .fold(0u32, |acc, key| acc ^ murmurhash2(key.as_bytes()))
        });
    }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
@@ -285,10 +269,7 @@ mod tests {
let mut vanilla_hash_map = HashMap::new(); let mut vanilla_hash_map = HashMap::new();
let mut iter_values = hash_map.iter(); let mut iter_values = hash_map.iter();
while let Some((key, addr, _)) = iter_values.next() { while let Some((key, addr, _)) = iter_values.next() {
let val: u32 = unsafe { let val: u32 = hash_map.heap.read(addr);
// test
hash_map.heap.read(addr)
};
vanilla_hash_map.insert(key.to_owned(), val); vanilla_hash_map.insert(key.to_owned(), val);
} }
assert_eq!(vanilla_hash_map.len(), 2); assert_eq!(vanilla_hash_map.len(), 2);

Some files were not shown because too many files have changed in this diff Show More