Compare commits

...

229 Commits

Author SHA1 Message Date
Paul Masurel
1658be3792 Various changes. Need to cherrypick some of them and put them into master 2017-12-25 10:35:10 +09:00
Paul Masurel
23fad88b35 NOBUG common crawl, streamdict works with 64 bits (hopefully) 2017-12-21 22:44:50 +09:00
Paul Masurel
79132e803a NOBUG Switched to 64 bits addr 2017-12-21 11:06:46 +09:00
Paul Masurel
9e132b7dde NOBUG QueryParser does not need to be mut. Code cleanup 2017-12-16 15:43:35 +09:00
Paul Masurel
1e55189db1 NOBUG rustfmt 2017-12-14 19:30:31 +09:00
Paul Masurel
8b1b389a76 NOBUG Clippy 2017-12-14 19:25:12 +09:00
Paul Masurel
46f3ec87a5 Removed packed memory layout. 2017-12-14 18:37:04 +09:00
Paul Masurel
f24e5f405e NOBUG intellij misc lint 2017-12-14 18:23:35 +09:00
Paul Masurel
2589be3984 BUGFIX Serialization of schema got broken after serde's update 2017-12-14 17:37:20 +09:00
Paul Masurel
a02a9294e4 removed doc in travis 2017-11-27 13:53:58 +09:00
Paul Masurel
8023445b63 docs 2017-11-26 11:52:03 +09:00
Paul Masurel
05ce093f97 doc 2017-11-26 11:43:11 +09:00
Paul Masurel
6937e23a56 fixing doctest 2017-11-26 11:06:34 +09:00
Paul Masurel
974c321153 cargo fmt 2017-11-26 11:02:02 +09:00
Paul Masurel
f30ec9b36b Merge branch 'master' of github.com:tantivy-search/tantivy
Conflicts:
	src/analyzer/mod.rs
	src/schema/index_record_option.rs
	src/tokenizer/lower_caser.rs
	src/tokenizer/tokenizer.rs
2017-11-26 10:54:05 +09:00
Paul Masurel
acd7c1ea2d Added comments 2017-11-26 10:44:49 +09:00
Paul Masurel
aaeeda2bc5 Editing rustdoc 2017-11-25 13:23:32 +09:00
Paul Masurel
ac4d433fad Renamed analyzer to tokenizer 2017-11-24 16:50:32 +09:00
Paul Masurel
a298c084e6 Analyzer's Analyzer::token_stream does not need to me &mut self 2017-11-22 20:37:34 +09:00
Paul Masurel
185a72b341 Closes #224. Fixes documentation about STORED in the example. (#225) 2017-11-16 08:22:54 +09:00
Paul Masurel
bb41ae76f9 Closes #224. Fixes documentation about STORED in the example. 2017-11-16 08:16:17 +09:00
Jain Jacob
927dd1ee6f Updates crate gcc to cc v1 (#217)
* Bump cc to v1

* Changes gcc::Config to cc::Build. Resolves #216
2017-10-06 16:18:44 +09:00
Paul Masurel
2c9302290f #191 Analyzer 2017-09-20 22:56:55 +09:00
Paul Masurel
426cc436da Test passing 2017-09-10 17:48:41 +09:00
Paul Masurel
68d42c9cf2 Added raw tokenizer, using the right analyzer in query parser. 2017-09-10 16:58:50 +09:00
Paul Masurel
ca49d6130f Test not passing 2017-09-09 17:32:47 +09:00
Paul Masurel
3588ca0561 Integrated with the merge branch 2017-09-09 15:27:19 +09:00
Paul Masurel
7c6cdcd876 Merge branch 'master' of github.com:tantivy-search/tantivy 2017-09-02 16:03:06 +09:00
Paul Masurel
71366b9a56 issue/197 Remove logic that prevents leak from crossbeam MsQueue. (#212)
Closes #197
2017-09-02 15:55:23 +09:00
Paul Masurel
a3247ebcfb issue/197 Remove logic that prevents leak from csossbeam MsQueue. 2017-09-02 15:53:07 +09:00
Paul Masurel
3ec13a8719 Readded fix for non-simd 2017-08-28 23:18:56 +09:00
Paul Masurel
f8593c76d5 Merge branch 'imhotep-new-codec'
Conflicts:
	src/common/bitpacker.rs
	src/compression/pack/compression_pack_nosimd.rs
	src/indexer/log_merge_policy.rs
2017-08-28 19:30:01 +09:00
Paul Masurel
f8710bd4b0 Format 2017-08-28 18:22:41 +09:00
Paul Masurel
8d05b8f7b2 Added comments. Renamed field reader 2017-08-28 17:00:12 +09:00
Paul Masurel
fc25516b7a Added unit test. 2017-08-28 11:15:37 +09:00
Paul Masurel
5b1e71947f Stream working, all test passing 2017-08-27 20:20:38 +09:00
Paul Masurel
69351fb4a5 Toward a new codec 2017-08-27 18:44:37 +09:00
Paul Masurel
3d0082d020 Delta encoded. Range and get are broken 2017-08-26 19:59:51 +09:00
Paul Masurel
8e450c770a Better error handling. Some doc. 2017-08-26 18:40:30 +09:00
Paul Masurel
a757902aed Merge branch 'feature/streamdict-simd' into imhotep 2017-08-22 18:58:57 +09:00
Paul Masurel
b3a8074826 removed println 2017-08-22 18:58:17 +09:00
Paul Masurel
4289625348 Merged with the new codec branch 2017-08-22 18:26:09 +09:00
Paul Masurel
850f10c1fe Exposing Field 2017-08-22 18:21:35 +09:00
raphael claude
d7f9bfdfc5 fix segments sorting in log_merge_policy (#211)
bug: segments were sorted on their indices (first field in the tuples)
fix: sort on the segments size
2017-08-20 08:59:54 +09:00
Paul Masurel
d0d5db4515 Streamdict using SIMD instruction. 2017-08-19 12:03:04 +09:00
Paul Masurel
303fc7e820 Better unit test for termdict. Checking the TermInfo 2017-08-17 12:08:39 +09:00
Paul Masurel
744edb2c5c NOBUG Avoid serializing position offset when useless. Test passing 2017-08-16 14:06:00 +09:00
Paul Masurel
2d70efb7b0 Removed trait boundary on termdict 2017-08-15 14:43:05 +09:00
Paul Masurel
eb5b2ffdcc Cleanups 2017-08-15 13:57:22 +09:00
Paul Masurel
38513014d5 Reenable unit test.
Consuming CompositeWrite on Close.
2017-08-14 23:35:09 +09:00
Paul Masurel
9cb7a0f6e6 Unit tests passing 2017-08-13 19:38:25 +09:00
Paul Masurel
8d466b8a76 half way through removing FastFieldsReader 2017-08-13 18:39:45 +09:00
Paul Masurel
413d0e1719 NOBUG test passing 2017-08-13 17:57:11 +09:00
Paul Masurel
0eb3c872fd Using composite file for all of the inverted index component 2017-08-12 19:34:23 +09:00
Paul Masurel
f9203228be Using composite file in fast field. 2017-08-12 18:45:59 +09:00
Paul Masurel
8f377b92d0 introducing a field serializer 2017-08-11 18:11:32 +09:00
Paul Masurel
1e89f86267 blop 2017-08-08 13:55:09 +09:00
Paul Masurel
d1f61a50c1 issue/207 Lazily decompressing positions. 2017-08-06 20:29:21 +09:00
Dru Sellers
2bb85ed575 Minor Doc Changes (#206)
* Various small documentation tweaks

* walking through the docs

* Update lib.rs

* Update lib.rs

* Update mod.rs
2017-08-06 09:22:03 +09:00
Paul Masurel
236fa74767 Positions almost working. 2017-08-05 23:17:35 +09:00
Paul Masurel
63b35dd87b removing freq handler. 2017-08-05 18:09:19 +09:00
Paul Masurel
efb910f4e8 Added CompressedIntStream 2017-08-05 16:44:01 +09:00
Paul Masurel
aff7e64d4e test 2017-08-04 22:07:14 +09:00
Paul Masurel
92a3f3981f issue/204 trying to fix nosimd branch. test not passing 2017-08-04 21:19:18 +09:00
king6cong
447a9361d8 Remove submodule information in README as subtree is now used 2017-08-03 13:52:16 +09:00
Paul Masurel
5f59139484 NOBUG simplified code. 2017-08-02 20:49:47 +09:00
Paul Masurel
27c373d26d NOBUG Updated changelog and bumped version 2017-07-24 18:52:45 +09:00
Paul Masurel
80ae136646 issue/198 Getting living_file after getting the list of managed files. 2017-07-24 18:46:41 +09:00
Paul Masurel
52b1398702 NOBUG version 0.4.0 -> 0.4.1 2017-07-19 19:07:54 +09:00
Paul Masurel
7b9cd09a6e Closes #199. Unindexed fields are indexed as untokenized 2017-07-19 18:41:22 +09:00
Paul Masurel
4c423ad2ca Merge branch 'master' of github.com:tantivy-search/tantivy 2017-07-19 17:01:32 +09:00
Paul Masurel
9f542d5252 NOBUG Fix spelling of "encountered". (as reported by @dazzag24) 2017-07-19 16:59:50 +09:00
Paul Masurel
77d8e81ae4 issue/17 Slightly more explicit error message 2017-07-19 11:08:42 +09:00
Paul Masurel
76e07b9705 NOBUG Small fixes. 2017-07-14 18:09:54 +09:00
Paul Masurel
ea4e9fdaf1 NOBUG updated README 2017-07-14 14:09:13 +09:00
Paul Masurel
e418bee693 NOBUG Garbage collection after end merge. 2017-07-14 12:09:47 +09:00
Paul Masurel
af4f1a86bc Merge remote-tracking branch 'origin/exp/hash_intable' 2017-07-13 20:50:54 +09:00
Paul Masurel
753b639454 NOBUG splitting the per-thread memory between the table and the heap 2017-07-13 17:11:39 +09:00
Paul Masurel
5907a47547 NOBUG Added whitespaces. 2017-07-13 15:14:12 +09:00
Paul Masurel
586a6e62a2 NOBUG Added Changelog for 4.0 2017-07-13 15:06:09 +09:00
Paul Masurel
fdae0eff5a NOBUG Remove range step_by 2017-07-13 14:05:33 +09:00
Paul Masurel
6eea407f20 Removing usage of step_by 2017-06-23 17:46:39 +09:00
Paul Masurel
1ba51d4dc4 NOBUG removed using range.step_by 2017-06-22 22:10:53 +09:00
Paul Masurel
6e742d5145 NOBUG removing batch add docs 2017-06-22 11:35:22 +09:00
Paul Masurel
1843259e91 NOBUG Simplified addr definitions 2017-06-22 11:27:32 +09:00
Paul Masurel
4ebacb7297 BytesRef is now wrapping an addr 2017-06-21 22:32:05 +09:00
Paul Masurel
fb75e60c6e issue/136 Added hashmaps. 2017-06-21 15:47:55 +09:00
Paul Masurel
04b15c6c11 Merge branch 'master' into exp/hash_intable
Conflicts:
	src/datastruct/stacker/hashmap.rs
2017-06-21 11:40:49 +09:00
Paul Masurel
b05b5f5487 issue/191 Added an analyzer manager. 2017-06-20 10:02:26 +09:00
Paul Masurel
4fe96483bc fill_buffer 2017-06-14 23:32:58 +09:00
Paul Masurel
09e27740e2 Added fill_buffer in DocSet 2017-06-14 18:28:30 +09:00
Paul Masurel
e51feea574 Removed cargo fmt from travis. 2017-06-14 13:45:11 +09:00
Paul Masurel
93e7f28cc0 Added unit test 2017-06-14 10:46:06 +09:00
Paul Masurel
8875b9794a Added API to get range from fastfield 2017-06-13 23:16:50 +09:00
Paul Masurel
f26874557e Remove the concept of pipeline. Made a BoableAnalyzer 2017-06-10 20:06:00 +09:00
Paul Masurel
a7d10b65ae Added support for Japanese. 2017-06-09 22:25:03 +09:00
Paul Masurel
e120e3b7aa issue/191 Added proper analyzer 2017-06-07 23:21:36 +09:00
Paul Masurel
90fcfb3f43 issue/188 Using murmurhash 2017-06-07 09:30:34 +09:00
Paul Masurel
e547e8abad Closes #184
Resizing the `Vec` was a bad idea, as for some stacker operation,
we may have a living reference to an object in the current heap.
2017-06-06 23:16:28 +09:00
Paul Masurel
5aa4565424 Tiny cleaning 2017-06-05 23:40:08 +09:00
Paul Masurel
3637620187 Merge branch 'master' of github.com:tantivy-search/tantivy 2017-06-02 21:03:37 +09:00
Laurentiu Nicola
a94679d74d Use four terms in the intersection bench 2017-05-31 08:31:33 +09:00
Laurentiu Nicola
a35a8638cc Comment nit 2017-05-31 08:31:33 +09:00
Paul Masurel
97a051996f issue 171. Hopefully bugfix? 2017-05-31 08:31:33 +09:00
Laurentiu Nicola
69525cb3c7 Add extra intersection test 2017-05-31 08:31:33 +09:00
Laurentiu Nicola
63867a7150 Fix document generation for posting benchmarks 2017-05-31 08:31:33 +09:00
Paul Masurel
19c073385a Better intersection and added size_hint 2017-05-31 08:31:33 +09:00
Paul Masurel
0521844e56 Format, small changes in VInt 2017-05-31 08:31:20 +09:00
Paul Masurel
8d4778f94d issue/181 BinarySerializable does not return the len + Generics over Read+Write 2017-05-31 08:31:20 +09:00
Paul Masurel
1d5464351d generic read 2017-05-31 08:31:20 +09:00
Paul Masurel
522ebdc674 made ResultExt public 2017-05-31 08:31:20 +09:00
Paul Masurel
4a805733db another hash 2017-05-30 15:36:48 +09:00
Paul Masurel
568d149db8 Merge branch 'master' into exp/hash_intable 2017-05-30 08:27:33 +09:00
Paul Masurel
4cfc9806c0 made ResultExt public 2017-05-30 08:22:17 +09:00
Paul Masurel
37042e3ccb Send and Sync impl now useless 2017-05-29 18:53:49 +09:00
Paul Masurel
b316cd337a Optimization in bitpacker 2017-05-29 18:53:49 +09:00
Paul Masurel
c04991e5ad Removed pointer in fastfield 2017-05-29 18:53:49 +09:00
Paul Masurel
c59b712eeb Added hash info in the table 2017-05-29 18:47:20 +09:00
Ashley Mannix
da61baed3b run fmt 2017-05-29 18:29:39 +09:00
Ashley Mannix
b6140d2962 drop some patch bounds 2017-05-29 18:29:39 +09:00
Ashley Mannix
6a9a71bb1b re-export ErrorKind 2017-05-29 18:29:39 +09:00
Ashley Mannix
e8fc4c77e2 fix delete error msg 2017-05-29 18:29:39 +09:00
Ashley Mannix
80837601ea remove error::* imports 2017-05-29 18:29:39 +09:00
Ashley Mannix
2b2703cf51 run cargo fmt 2017-05-29 18:29:39 +09:00
Ashley Mannix
d79018a7f8 fix build warnings 2017-05-29 18:29:39 +09:00
Ashley Mannix
d8a7c428f7 impl std error for directory errors 2017-05-29 18:29:39 +09:00
Ashley Mannix
45595234cc fix error match 2017-05-29 18:29:39 +09:00
Ashley Mannix
1bcebdd29e initial error-chain 2017-05-29 18:29:39 +09:00
Paul Masurel
ed0333a404 Optimized streamer 2017-05-28 19:58:28 +09:00
Paul Masurel
ac0b1a21eb Term as a wrapper
Small changes

Plastic
2017-05-25 23:49:54 +09:00
Paul Masurel
6bbc789d84 Fmt fix 2017-05-25 23:49:54 +09:00
Paul Masurel
87152daef3 issue/174 Added doc, and made field private 2017-05-25 23:49:54 +09:00
Paul Masurel
e0fce4782a Added documentation 2017-05-25 23:49:54 +09:00
Paul Masurel
a633c2a49a Avoid exposing common. Exposes u64 to i64 conversion instead. 2017-05-25 23:49:54 +09:00
Paul Masurel
51623d593e Avoid exposign schema from segment_reader 2017-05-25 23:49:54 +09:00
Paul Masurel
29bf740ddf Exposing the remaining API 2017-05-25 23:49:54 +09:00
Paul Masurel
511bd25a31 trailing whitespace 2017-05-25 18:17:37 +09:00
Paul Masurel
66e14ac1b1 clippy 2017-05-25 18:17:37 +09:00
Paul Masurel
09e94072ba Cargo fmt 2017-05-25 18:17:37 +09:00
Paul Masurel
6c68136d31 Reorganized code 2017-05-25 18:17:37 +09:00
Paul Masurel
aaf1b2c6b6 Reorganized code and added documentation. 2017-05-25 18:17:37 +09:00
Paul Masurel
8a6af2aefa Added unit test and bugfix 2017-05-25 18:17:37 +09:00
Paul Masurel
7a6e62976b Added stream dictionary code, merge unit test 2017-05-25 18:17:37 +09:00
Paul Masurel
2712930bd6 Added the feature 2017-05-25 18:17:37 +09:00
Paul Masurel
cb05f8c098 Prevent execution of the code in the macro doc 2017-05-22 10:55:45 +09:00
Paul Masurel
c0c9d04ca9 Added extra doc 2017-05-22 10:55:45 +09:00
Paul Masurel
7ea5e740e0 Using the $crate thing to make the macro usable in and outside tantivy 2017-05-22 10:55:45 +09:00
Paul Masurel
2afa6c372a issue/168 Make doc! macro usable outside tantivy 2017-05-22 10:55:45 +09:00
Paul Masurel
c7db8866b5 Merge branch 'facets' 2017-05-21 22:57:01 +09:00
Paul Masurel
02d992324a simplified facets. 2017-05-21 22:56:43 +09:00
Paul Masurel
4ab511ffc6 Merging 2017-05-21 22:15:02 +09:00
Paul Masurel
f318172ea4 Merge branch 'issue/162' 2017-05-21 20:04:03 +09:00
Paul Masurel
581449a824 issue/162 Docs and unit tests 2017-05-21 18:58:04 +09:00
Maciej Dziardziel
272589a381 faceting for fast numerical fields 2017-05-21 12:04:29 +03:00
Laurentiu Nicola
73d54c6379 Inline block_len 2017-05-21 10:44:49 +03:00
Paul Masurel
3e4606de5d Simplifying, and reordering the members 2017-05-21 16:31:52 +09:00
Laurentiu Nicola
020779f61b Make things faster 2017-05-20 20:56:37 +03:00
Laurentiu Nicola
835936585f Don't search whole blocks, but only the remaining part 2017-05-20 18:45:41 +03:00
Paul Masurel
bdd05e97d1 Added bench for segment postings 2017-05-20 23:38:53 +09:00
Paul Masurel
2be5f08cd6 issue/162 Added block iteration API 2017-05-20 11:46:40 +09:00
Paul Masurel
3f49d65a87 issue/162 Create block postings 2017-05-20 00:46:23 +09:00
Paul Masurel
f9baf4bcc8 Merge branch 'issue/155'
Conflicts:
	src/indexer/merger.rs
	src/indexer/segment_writer.rs
2017-05-19 20:14:36 +09:00
Paul Masurel
7ee93fbed5 Cleaning 2017-05-19 20:08:04 +09:00
Paul Masurel
57a5547ae8 Comments and cleaning up API 2017-05-19 11:20:27 +09:00
Paul Masurel
c57ab6a335 Renamed fstmap to termdict 2017-05-19 09:26:18 +09:00
Paul Masurel
02bfa9be52 Moving to termdict 2017-05-19 08:43:52 +09:00
Paul Masurel
b3f62b8acc Better API 2017-05-18 23:35:39 +09:00
Paul Masurel
2a08c247af Clippy 2017-05-18 23:20:41 +09:00
Paul Masurel
d2926b6ee0 Format 2017-05-18 23:09:20 +09:00
Paul Masurel
0272167c2e Code cleaning 2017-05-18 23:06:02 +09:00
Laurentiu Nicola
a9cf0bde16 Format code 2017-05-18 22:07:49 +09:00
Laurentiu Nicola
5a457df45d VInt encode values in IntFastFieldWriter
Closes #131
2017-05-18 22:07:49 +09:00
Paul Masurel
ca76fd5ba0 Uncommenting unit test 2017-05-18 20:41:56 +09:00
Paul Masurel
e79a316e41 Issue 155 - Trying to avoid term lookup when merging terms
+ Adds a proper Streamer interface
2017-05-18 20:12:00 +09:00
Paul Masurel
733f54d80e Making clippy happy. 2017-05-17 19:07:39 +09:00
Paul Masurel
7b2b181652 Merge branch 'master' into issue/136
Conflicts:
	src/datastruct/stacker/hashmap.rs
	src/datastruct/stacker/heap.rs
	src/datastruct/stacker/mod.rs
	src/indexer/index_writer.rs
	src/indexer/merger.rs
	src/indexer/segment_updater.rs
	src/indexer/segment_writer.rs
	src/postings/postings_writer.rs
	src/postings/recorder.rs
	src/schema/term.rs
2017-05-17 18:40:09 +09:00
Laurentiu Nicola
b3f39f2343 Remove unneeded suppressions, make clippy lints explicit 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
a13122d392 use explicit drop instead of suppression 2017-05-17 15:50:07 +09:00
Paul Masurel
113917c521 Making clippy happy.
+ Simplifying bitpacking by adding a 7 byte padding.
+ Bugfix in a unit test.
2017-05-17 15:50:07 +09:00
Laurentiu Nicola
1352b95b07 clippy: fix never_loop warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
c0538dbe9a clippy: fix mut_from_ref warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
0d5ea98132 clippy: fix inline_always warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
0404df3fd5 Fix typo in docstring 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
a67caee141 clippy: fix len_zero warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
f5fb29422a clippy: fix while_let_loop warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
4e48bbf0ea clippy: fix needless_lifetimes warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
6fea510869 clippy: fix redundant_closure warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
39958ec476 clippy: fix single_match warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
36f51e289e clippy: fix match_same_arms warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
5c83153035 clippy: fix or_fun_call warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
8e407bb314 clippy: fix needless_borrow warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
103ba6ba35 clippy: fix match_ref_pats warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
3965b26cd2 clippy: fix useless_let_if_seq warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
1cd0b378fb clippy: fix map_clone warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
92f383fa51 clippy: fix let_unit_value warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
6ae34d2a77 clippy: fix toplevel_ref_arg warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
1af1f7e0d1 clippy: fix if_let_redundant_pattern_matching warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
feec2e2620 clippy: fix needless_bool warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
3e2ad7542d clippy: fix needless_return warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
ac02c76b1e clippy: fix doc_markdown warnings 2017-05-17 15:50:07 +09:00
Paul Masurel
e5c7c0b8b9 Update CHANGELOG.md 2017-05-16 21:13:33 +09:00
Laurentiu Nicola
49dbe4722f Add a test for SegmentPostings::skip_len 2017-05-16 21:12:43 +09:00
Laurentiu Nicola
f64ff77424 Use an exponential search 2017-05-16 21:12:43 +09:00
Laurentiu Nicola
2bf93e9e51 Avoid rebuilding simdcomp when running tests 2017-05-16 08:37:43 +09:00
Laurentiu Nicola
3dde748b25 Make rustfmt happy 2017-05-16 00:49:05 +03:00
Laurentiu Nicola
1dabe26395 Add comment about block_len 2017-05-15 21:26:28 +03:00
Laurentiu Nicola
5590537739 Disable early exit 2017-05-15 21:18:06 +03:00
Laurentiu Nicola
ccf0f9cb2f Merge branch 'master' of github.com:tantivy-search/tantivy into issue/130 2017-05-15 18:54:16 +03:00
Laurentiu Nicola
e21913ecdc Use binary search for SegmentPostings::skip_next 2017-05-15 18:33:43 +03:00
Laurentiu Nicola
2cc826adc7 Add a bench for SegmentPostings::SkipNext 2017-05-15 18:33:43 +03:00
Laurentiu Nicola
4d90d8fc1d Move the random sampling helpers to the tests module 2017-05-15 18:33:43 +03:00
Paul Masurel
0606a8ae73 Bugfix in travis yml 2017-05-16 00:22:11 +09:00
Paul Masurel
03564214e7 Added check for rustfmt in travis 2017-05-15 22:46:43 +09:00
Paul Masurel
4c8f9742f8 format 2017-05-15 22:30:18 +09:00
Paul Masurel
a23b7a1815 Test the size of complete 0..128 block 2017-05-15 19:09:52 +09:00
Paul Masurel
6f89a86b14 Added simple search in travis CI 2017-05-15 12:10:23 +09:00
Laurentiu Nicola
b2beac1203 Check the result of wait_merging_threads 2017-05-15 08:00:25 +09:00
Paul Masurel
8cd5a2d81d Fixed logging deleted files twice 2017-05-15 00:25:49 +09:00
Paul Masurel
b26c22ada0 Merge branch 'issue/148' 2017-05-15 00:02:51 +09:00
Laurentiu Nicola
8a35259300 Avoid clone() call 2017-05-14 23:28:17 +09:00
Paul Masurel
db56167a5d Display backtrace 2017-05-14 23:28:17 +09:00
Paul Masurel
ab66ffed4e Closes #147 2017-05-14 23:28:17 +09:00
Laurentiu Nicola
e04f2f0b08 issue/148 Wait for the index writer threads to shut down in simple_search 2017-05-14 16:35:24 +03:00
Paul Masurel
7a5df33c85 issue/148 Wrapping MsQueue to drop all of its concent on Drop 2017-05-14 16:25:33 +03:00
Laurentiu Nicola
ee0873dd07 Avoid clone() call 2017-05-13 16:11:58 +03:00
Paul Masurel
695c8828b8 Display backtrace 2017-05-13 18:51:38 +09:00
Paul Masurel
4ff7dc7a4f Closes #147 2017-05-13 18:46:50 +09:00
Paul Masurel
69832bfd03 NOBUG Disabling running examples in CI as it is not working. 2017-05-12 14:35:50 +09:00
Paul Masurel
9cd7458978 NOBUG Hiding methods making it possible to build a incorrect Term. 2017-05-11 21:12:59 +09:00
156 changed files with 10162 additions and 5626 deletions

5
.gitignore vendored
View File

@@ -5,4 +5,7 @@ target/release
Cargo.lock
benchmark
.DS_Store
cpp/simdcomp/bitpackingbenchmark
cpp/simdcomp/bitpackingbenchmark
*.bk
.idea
trace.dat

View File

@@ -26,8 +26,7 @@ script:
- |
travis-cargo build &&
travis-cargo test &&
travis-cargo bench &&
travis-cargo doc
travis-cargo bench
- cargo run --example simple_search
after_success:
- bash ./script/build-doc.sh

View File

@@ -1,13 +1,47 @@
Tantivy 0.4.3
==========================
- Bugfix race condition when deleting files. (#198)
Tantivy 0.4.2
==========================
- Prevent usage of AVX2 instructions (#201)
Tantivy 0.4.1
==========================
- Bugfix for non-indexed fields. (#199)
Tantivy 0.4.0
==========================
- Raise the limit of number of fields (previously 256 fields)
- Removed u32 fields. They are replaced by u64 and i64 fields (#65)
- Raise the limit of number of fields (previously 256 fields) (@fulmicoton)
- Removed u32 fields. They are replaced by u64 and i64 fields (#65) (@fulmicoton)
- Optimized skip in SegmentPostings (#130) (@lnicola)
- Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
- QueryParser:
- Using error-chain (@KodrAus)
- QueryParser: (@fulmicoton)
- Explicit error returned when searched for a term that is not indexed
- Searching for a int term via the query parser was broken `(age:1)`
- Searching for a non-indexed field returns an explicit Error
- Phrase queries for non-tokenized fields are not tokenized by the query parser.
- Faster/Better indexing (@fulmicoton)
- using murmurhash2
- faster merging
- more memory efficient fast field writer (@lnicola )
- better handling of collisions
- lesser memory usage
- Added API, most notably to iterate over ranges of terms (@fulmicoton)
- Bugfix for an issue that prevented segment files from being unmapped on index drop (@fulmicoton)
- Made the doc! macro public (@fulmicoton)
- Added an alternative implementation of the streaming dictionary (@fulmicoton)
Tantivy 0.3.1
==========================
@@ -15,6 +49,7 @@ Tantivy 0.3.1
- Expose a method to trigger files garbage collection
Tantivy 0.3
==========================
@@ -36,6 +71,7 @@ You should not expect backward compatibility before
tantivy 1.0.
New Features
------------

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.4.0-alpha"
version = "0.5.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
license = "MIT"
@@ -16,6 +16,7 @@ keywords = ["search", "information", "retrieval"]
byteorder = "1.0"
memmap = "0.4"
lazy_static = "0.2.1"
tinysegmenter = "0.1.0"
regex = "0.2"
fst = "0.1.37"
atomicwrites = "0.1.3"
@@ -26,7 +27,7 @@ tempdir = "0.3"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
bincode = "0.7.0-alpha7"
bincode = "0.8"
libc = {version = "0.2.20", optional=true}
num_cpus = "1.2"
itertools = "0.5.9"
@@ -36,9 +37,13 @@ time = "0.1"
uuid = { version = "0.5", features = ["v4", "serde"] }
chan = "0.1"
version = "2"
crossbeam = "0.2"
futures = "0.1.9"
futures-cpupool = "0.1.2"
crossbeam = "0.3"
futures = "0.1"
futures-cpupool = "0.1"
error-chain = "0.8"
owning_ref = "0.3"
stable_deref_trait = "1.0.0"
rust-stemmers = "0.1.0"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -48,7 +53,7 @@ rand = "0.3"
env_logger = "0.4"
[build-dependencies]
gcc = {version = "0.3", optional=true}
cc = {version = "1.0.0", optional=true}
[profile.release]
opt-level = 3
@@ -59,7 +64,8 @@ debug-assertions = false
[features]
default = ["simdcompression"]
simdcompression = ["libc", "gcc"]
simdcompression = ["libc", "cc"]
streamdict = []
[badges]

View File

@@ -19,10 +19,10 @@ It is strongly inspired by Lucene's design.
- Basic query language
- Phrase queries
- Incremental indexing
- Multithreaded indexing (indexing English Wikipedia takes 4 minutes on my desktop)
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- mmap based
- optional SIMD integer compression
- u32 fast fields (equivalent of doc values in Lucene)
- u64 and i64 fast fields (equivalent of doc values in Lucene)
- LZ4 compressed document store
- Cheesy logo with a horse
@@ -38,12 +38,10 @@ It will walk you through getting a wikipedia search engine up and running in a f
- [For the last released version](https://docs.rs/tantivy/)
- [For the last master branch](https://tantivy-search.github.io/tantivy/tantivy/index.html)
# Compiling
# Compiling
Tantivy requires Rust Nightly because it requires the features [`box_syntax`](https://doc.rust-lang.org/stable/book/box-syntax-and-patterns.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), and [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md).
By default, `tantivy` uses a git submodule called `simdcomp`.
After cloning the repository, you will need to initialize and update
the submodules. The project can then be built using `cargo`.
The project can then be built using `cargo`.
git clone git@github.com:tantivy-search/tantivy.git
cd tantivy
@@ -54,9 +52,9 @@ Alternatively, if you are trying to compile `tantivy` without simd compression,
you can disable this functionality. In this case, this submodule is not required
and you can compile tantivy by using the `--no-default-features` flag.
cargo build --no-default-features
cargo build --no-default-features
# Contribute
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

View File

@@ -22,4 +22,4 @@ build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
- REM SET RUST_LOG=tantivy,test & cargo run --example simple_search
- REM SET RUST_BACKTRACE=1 & cargo run --example simple_search

View File

@@ -1,9 +1,9 @@
#[cfg(feature = "simdcompression")]
mod build {
extern crate gcc;
extern crate cc;
pub fn build() {
let mut config = gcc::Config::new();
let mut config = cc::Build::new();
config
.include("./cpp/simdcomp/include")
.file("cpp/simdcomp/src/avxbitpacking.c")
@@ -46,6 +46,8 @@ mod build {
if !cfg!(debug_assertions) && cfg!(target_env = "msvc") {
println!("cargo:rustc-link-lib=dylib=simdcomp");
}
println!("cargo:rerun-if-changed=cpp");
}
}

View File

@@ -30,10 +30,12 @@
</div>
<div class="content"><div class='highlight'><pre><span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> rustc_serialize;
<span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> tantivy;
<div class="content"><div class='highlight'><pre><span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> tantivy;
<span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> tempdir;
<span class="hljs-meta">#[macro_use]</span>
<span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> serde_json;
<span class="hljs-keyword">use</span> std::path::Path;
<span class="hljs-keyword">use</span> tempdir::TempDir;
<span class="hljs-keyword">use</span> tantivy::Index;
@@ -108,8 +110,8 @@ be indexed”.</p>
<a class="pilcrow" href="#section-5">&#182;</a>
</div>
<p>Our first field is title.
We want full-text search for it, and we want to be able
to retrieve the document after the search.</p>
We want full-text search for it, and we also want
to be able to retrieve the document after the search.</p>
<p>TEXT | STORED is some syntactic sugar to describe
that.</p>
<p><code>TEXT</code> means the field should be tokenized and indexed,
@@ -132,9 +134,12 @@ documents that were selected during the search phase.</p>
<div class="pilwrap ">
<a class="pilcrow" href="#section-6">&#182;</a>
</div>
<p>Our first field is body.
We want full-text search for it, and we want to be able
to retrieve the body after the search.</p>
<p>Our second field is body.
We want full-text search for it, but we do not
need to be able to retrieve it
for our application. </p>
<p>We can make our index lighter
by omitting the <code>STORED</code> flag.</p>
</div>
@@ -158,7 +163,7 @@ with our schema in the directory.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> index = <span class="hljs-built_in">try!</span>(Index::create(index_path, schema.clone()));</pre></div></div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> index = Index::create(index_path, schema.clone())?;</pre></div></div>
</li>
@@ -178,7 +183,7 @@ heap for the indexer can increase its throughput.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> index_writer = <span class="hljs-built_in">try!</span>(index.writer(<span class="hljs-number">50_000_000</span>));</pre></div></div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> index_writer = index.writer(<span class="hljs-number">50_000_000</span>)?;</pre></div></div>
</li>
@@ -214,9 +219,11 @@ one by one in a Document object.</p>
<span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> old_man_doc = Document::<span class="hljs-keyword">default</span>();
old_man_doc.add_text(title, <span class="hljs-string">"The Old Man and the Sea"</span>);
old_man_doc.add_text(body,
<span class="hljs-string">"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."</span>);</pre></div></div>
old_man_doc.add_text(
body,
<span class="hljs-string">"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."</span>,
);</pre></div></div>
</li>
@@ -243,16 +250,25 @@ one by one in a Document object.</p>
<a class="pilcrow" href="#section-12">&#182;</a>
</div>
<h3 id="create-a-document-directly-from-json-">Create a document directly from json.</h3>
<p>Alternatively, we can use our schema to parse
a document object directly from json.</p>
<p>Alternatively, we can use our schema to parse a
document object directly from json.
The document is a string, but we use the <code>json</code> macro
from <code>serde_json</code> for the convenience of multi-line support.</p>
</div>
<div class="content"><div class='highlight'><pre>
<span class="hljs-keyword">let</span> mice_and_men_doc = <span class="hljs-built_in">try!</span>(schema.parse_document(r#<span class="hljs-string">"{
"</span>title<span class="hljs-string">": "</span>Of Mice and Men<span class="hljs-string">",
"</span>body<span class="hljs-string">": "</span>few miles south of Soledad, the Salinas River drops <span class="hljs-keyword">in</span> close to the hillside bank and runs deep and green. The water is warm too, <span class="hljs-keyword">for</span> it has slipped twinkling over the yellow sands <span class="hljs-keyword">in</span> the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying <span class="hljs-keyword">in</span> their lower leaf junctures the debris of the winters flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool<span class="hljs-string">"
}"</span>#));
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> json = json!({
<span class="hljs-string">"title"</span>: <span class="hljs-string">"Of Mice and Men"</span>,
<span class="hljs-string">"body"</span>: <span class="hljs-string">"A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"</span>
});
<span class="hljs-keyword">let</span> mice_and_men_doc = schema.parse_document(&amp;json.to_string())?;
index_writer.add_document(mice_and_men_doc);</pre></div></div>
@@ -271,10 +287,15 @@ The following document has two titles.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> frankenstein_doc = <span class="hljs-built_in">try!</span>(schema.parse_document(r#<span class="hljs-string">"{
"</span>title<span class="hljs-string">": ["</span>Frankenstein<span class="hljs-string">", "</span>The Modern Promotheus<span class="hljs-string">"],
"</span>body<span class="hljs-string">": "</span>You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence <span class="hljs-keyword">in</span> the success of my undertaking.<span class="hljs-string">"
}"</span>#));
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> json = json!({
<span class="hljs-string">"title"</span>: [<span class="hljs-string">"Frankenstein"</span>, <span class="hljs-string">"The Modern Prometheus"</span>],
<span class="hljs-string">"body"</span>: <span class="hljs-string">"You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."</span>
});
<span class="hljs-keyword">let</span> frankenstein_doc = schema.parse_document(&amp;json.to_string())?;
index_writer.add_document(frankenstein_doc);</pre></div></div>
</li>
@@ -313,7 +334,7 @@ the existence of new documents.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-built_in">try!</span>(index_writer.commit());</pre></div></div>
<div class="content"><div class='highlight'><pre> index_writer.commit()?;</pre></div></div>
</li>
@@ -349,7 +370,7 @@ after every commit().</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-built_in">try!</span>(index.load_searchers());</pre></div></div>
<div class="content"><div class='highlight'><pre> index.load_searchers()?;</pre></div></div>
</li>
@@ -384,7 +405,7 @@ in both title and body.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> query_parser = QueryParser::new(index.schema(), <span class="hljs-built_in">vec!</span>[title, body]);</pre></div></div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> query_parser = QueryParser::for_index(index, <span class="hljs-built_in">vec!</span>[title, body]);</pre></div></div>
</li>
@@ -401,7 +422,7 @@ A ticket has been opened regarding this problem.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> query = <span class="hljs-built_in">try!</span>(query_parser.parse_query(<span class="hljs-string">"sea whale"</span>));</pre></div></div>
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> query = query_parser.parse_query(<span class="hljs-string">"sea whale"</span>)?;</pre></div></div>
</li>
@@ -451,7 +472,7 @@ is the role of the TopCollector.</p>
</div>
<div class="content"><div class='highlight'><pre> <span class="hljs-built_in">try!</span>(searcher.search(&amp;*query, &amp;<span class="hljs-keyword">mut</span> top_collector));</pre></div></div>
<div class="content"><div class='highlight'><pre> searcher.search(&amp;*query, &amp;<span class="hljs-keyword">mut</span> top_collector)?;</pre></div></div>
</li>
@@ -488,9 +509,27 @@ a title.</p>
<div class="content"><div class='highlight'><pre>
<span class="hljs-keyword">for</span> doc_address <span class="hljs-keyword">in</span> doc_addresses {
<span class="hljs-keyword">let</span> retrieved_doc = <span class="hljs-built_in">try!</span>(searcher.doc(&amp;doc_address));
<span class="hljs-keyword">let</span> retrieved_doc = searcher.doc(&amp;doc_address)?;
<span class="hljs-built_in">println!</span>(<span class="hljs-string">"{}"</span>, schema.to_json(&amp;retrieved_doc));
}
}</pre></div></div>
</li>
<li id="section-26">
<div class="annotation">
<div class="pilwrap ">
<a class="pilcrow" href="#section-26">&#182;</a>
</div>
<p>Wait for indexing and merging threads to shut down.
Usually this isn't needed, but in <code>main</code> we try to
delete the temporary directory and that fails on
Windows if the files are still open.</p>
</div>
<div class="content"><div class='highlight'><pre> index_writer.wait_merging_threads()?;
<span class="hljs-literal">Ok</span>(())
}</pre></div></div>

View File

@@ -1,6 +1,9 @@
extern crate tantivy;
extern crate tempdir;
#[macro_use]
extern crate serde_json;
use std::path::Path;
use tempdir::TempDir;
use tantivy::Index;
@@ -33,8 +36,8 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
let mut schema_builder = SchemaBuilder::default();
// Our first field is title.
// We want full-text search for it, and we want to be able
// to retrieve the document after the search.
// We want full-text search for it, and we also want
// to be able to retrieve the document after the search.
//
// TEXT | STORED is some syntactic sugar to describe
// that.
@@ -48,9 +51,13 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// documents that were selected during the search phase.
schema_builder.add_text_field("title", TEXT | STORED);
// Our first field is body.
// We want full-text search for it, and we want to be able
// to retrieve the body after the search.
// Our second field is body.
// We want full-text search for it, but we do not
// need to be able to retrieve it
// for our application.
//
// We can make our index lighter
// by omitting the `STORED` flag.
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
@@ -63,8 +70,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = try!(Index::create(index_path, schema.clone()));
let index = Index::create(index_path, schema.clone())?;
// To insert document we need an index writer.
@@ -74,7 +80,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
//
// Here we use a buffer of 50MB per thread. Using a bigger
// heap for the indexer can increase its throughput.
let mut index_writer = try!(index.writer(50_000_000));
let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
@@ -89,32 +95,48 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.");
old_man_doc.add_text(
body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.",
);
// ... and add it to the `IndexWriter`.
index_writer.add_document(old_man_doc);
// ### Create a document directly from json.
//
// Alternatively, we can use our schema to parse
// a document object directly from json.
let mice_and_men_doc = try!(schema.parse_document(r#"{
// Alternatively, we can use our schema to parse a
// document object directly from json.
// The document is a string, but we use the `json` macro
// from `serde_json` for the convenience of multi-line support.
let json = json!({
"title": "Of Mice and Men",
"body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winters flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool"
}"#));
"body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
});
let mice_and_men_doc = schema.parse_document(&json.to_string())?;
index_writer.add_document(mice_and_men_doc);
// Multi-valued fields are allowed; they are
// expressed in JSON by an array.
// The following document has two titles.
let frankenstein_doc = try!(schema.parse_document(r#"{
"title": ["Frankenstein", "The Modern Promotheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
}"#));
let json = json!({
"title": ["Frankenstein", "The Modern Prometheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
});
let frankenstein_doc = schema.parse_document(&json.to_string())?;
index_writer.add_document(frankenstein_doc);
// This is an example, so we will only index 3 documents
@@ -135,7 +157,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// the existence of new documents.
//
// This call is blocking.
try!(index_writer.commit());
index_writer.commit()?;
// If `.commit()` returns correctly, then all of the
// documents that have been added are guaranteed to be
@@ -151,7 +173,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// Let's search our index. Start by reloading
// searchers in the index. This should be done
// after every commit().
try!(index.load_searchers());
index.load_searchers()?;
// Afterwards create one (or more) searchers.
//
@@ -163,12 +185,12 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// Here, if the user does not specify which
// field they want to search, tantivy will search
// in both title and body.
let query_parser = QueryParser::new(index.schema(), vec![title, body]);
let query_parser = QueryParser::for_index(&index, vec![title, body]);
// QueryParser may fail if the query is not in the right
// format. For user facing applications, this can be a problem.
// A ticket has been opened regarding this problem.
let query = try!(query_parser.parse_query("sea whale"));
let query = query_parser.parse_query("sea whale")?;
// A query defines a set of documents, as
@@ -186,7 +208,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
let mut top_collector = TopCollector::with_limit(10);
// We can now perform our query.
try!(searcher.search(&*query, &mut top_collector));
searcher.search(&*query, &mut top_collector)?;
// Our top collector now contains the 10
// most relevant doc ids...
@@ -200,9 +222,15 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// a title.
for doc_address in doc_addresses {
let retrieved_doc = try!(searcher.doc(&doc_address));
let retrieved_doc = searcher.doc(&doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
// Wait for indexing and merging threads to shut down.
// Usually this isn't needed, but in `main` we try to
// delete the temporary directory and that fails on
// Windows if the files are still open.
index_writer.wait_merging_threads()?;
Ok(())
}

View File

@@ -1,86 +0,0 @@
extern crate regex;
use std::str::Chars;
use std::ascii::AsciiExt;
/// Cursor over the lowercased alphanumeric tokens of a borrowed string.
///
/// Tokens are handed out one at a time through `StreamingIterator::next`,
/// each one borrowing the internal `term_buffer`.
pub struct TokenIter<'a> {
/// Characters of the input text that have not been consumed yet.
chars: Chars<'a>,
/// Scratch buffer holding the token being built; it is cleared at the
/// start of every `next` call.
term_buffer: String,
}
/// Appends the lowercase form of `c` to `term_buffer`.
///
/// Token characters are selected with the Unicode-aware
/// `char::is_alphanumeric`, so the lowercasing has to be Unicode-aware as
/// well: otherwise non-ASCII uppercase letters (e.g. `É`) would be stored
/// unchanged and would not match their lowercase spelling.
///
/// ASCII characters take the cheap single-`char` path; any other character
/// goes through the full Unicode mapping, which may expand to several
/// `char`s.
fn append_char_lowercase(c: char, term_buffer: &mut String) {
    if c.is_ascii() {
        term_buffer.push(c.to_ascii_lowercase());
    } else {
        term_buffer.extend(c.to_lowercase());
    }
}
/// Iterator-like trait whose `next` borrows the iterator for `'a`.
///
/// Because `self` is borrowed for the same lifetime that parameterizes the
/// trait, the item type `T` may be a reference into the iterator itself
/// (as `TokenIter` does with `&'a str`).
pub trait StreamingIterator<'a, T> {
/// Advances the iterator and returns the next item, if any.
fn next(&'a mut self) -> Option<T>;
}
impl<'a, 'b> TokenIter<'b> {
    /// Extends the token already started in `term_buffer` with the run of
    /// alphanumeric characters that follows, then returns the buffer.
    ///
    /// The first non-alphanumeric character that ends the run is consumed
    /// as well. The result is always `Some`.
    fn consume_token(&'a mut self) -> Option<&'a str> {
        // `take_while` also swallows the delimiter that stops the run,
        // exactly like breaking out of a `for` loop over the chars would.
        let token_tail = self.chars.by_ref().take_while(|c| c.is_alphanumeric());
        for letter in token_tail {
            append_char_lowercase(letter, &mut self.term_buffer);
        }
        Some(self.term_buffer.as_str())
    }
}
impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
    /// Returns the next lowercased token, or `None` once the remaining
    /// input contains no alphanumeric character.
    #[inline]
    fn next(&'a mut self) -> Option<&'a str> {
        self.term_buffer.clear();
        // Skip separators until the first character of the next token.
        let first_letter = self.chars.by_ref().find(|c| c.is_alphanumeric());
        match first_letter {
            Some(letter) => {
                append_char_lowercase(letter, &mut self.term_buffer);
                self.consume_token()
            }
            None => None,
        }
    }
}
/// Tokenizer that cuts text on non-alphanumeric characters and lowercases
/// every token.
pub struct SimpleTokenizer;

impl SimpleTokenizer {
    /// Returns a streaming cursor over the tokens of `text`.
    ///
    /// No work is done here; tokens are produced as `next` is called on
    /// the returned `TokenIter`.
    pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
        let chars = text.chars();
        let term_buffer = String::new();
        TokenIter {
            chars: chars,
            term_buffer: term_buffer,
        }
    }
}
// Punctuation and whitespace separate tokens and never show up in them.
#[test]
fn test_tokenizer() {
    let mut tokens = SimpleTokenizer.tokenize("hello, happy tax payer!");
    for expected in &["hello", "happy", "tax", "payer"] {
        assert_eq!(tokens.next().unwrap(), *expected);
    }
    assert_eq!(tokens.next(), None);
}
// An empty input yields no token at all.
#[test]
fn test_tokenizer_empty() {
    let mut tokens = SimpleTokenizer.tokenize("");
    assert!(tokens.next().is_none());
}

View File

@@ -5,10 +5,9 @@ use SegmentReader;
use DocId;
use Score;
/// Collector that does nothing.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
pub struct DoNothingCollector;
impl Collector for DoNothingCollector {
#[inline]
@@ -24,10 +23,10 @@ impl Collector for DoNothingCollector {
/// are known at compile time.
pub struct ChainedCollector<Left: Collector, Right: Collector> {
left: Left,
right: Right
right: Right,
}
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
/// Adds a collector
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
ChainedCollector {
@@ -38,7 +37,11 @@ impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
}
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
try!(self.left.set_segment(segment_local_id, segment));
try!(self.right.set_segment(segment_local_id, segment));
Ok(())
@@ -58,7 +61,6 @@ pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
}
}
#[cfg(test)]
mod tests {
@@ -70,9 +72,7 @@ mod tests {
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = chain()
.push(&mut top_collector)
.push(&mut count_collector);
let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);
@@ -80,4 +80,4 @@ mod tests {
assert_eq!(count_collector.count(), 3);
assert!(top_collector.at_capacity());
}
}
}

View File

@@ -6,7 +6,7 @@ use SegmentReader;
use SegmentLocalId;
/// `CountCollector` collector only counts how many
/// documents match the query.
/// documents match the query.
pub struct CountCollector {
count: usize,
}
@@ -14,20 +14,18 @@ pub struct CountCollector {
impl CountCollector {
/// Returns the count of documents that were
/// collected.
pub fn count(&self,) -> usize {
pub fn count(&self) -> usize {
self.count
}
}
impl Default for CountCollector {
fn default() -> CountCollector {
CountCollector {count: 0,
}
CountCollector { count: 0 }
}
}
impl Collector for CountCollector {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}

View File

@@ -0,0 +1,113 @@
use std::cmp::Eq;
use std::collections::HashMap;
use std::hash::Hash;
use collector::Collector;
use fastfield::FastFieldReader;
use schema::Field;
use DocId;
use Result;
use Score;
use SegmentReader;
use SegmentLocalId;
/// Facet collector for i64/u64 fast field.
///
/// Counts, for every distinct value of the fast field `field`, how many
/// collected documents carry that value.
pub struct FacetCollector<T>
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
/// Number of collected documents per fast field value.
counters: HashMap<T::ValueType, u64>,
/// Fast field whose values are aggregated.
field: Field,
/// Fast field reader of the segment being visited; `None` until
/// `set_segment` has been called.
ff_reader: Option<T>,
}
impl<T> FacetCollector<T>
where
    T: FastFieldReader,
    T::ValueType: Eq + Hash,
{
    /// Builds a facet collector that aggregates the values of `field`.
    ///
    /// The collector starts with no counters and no fast field reader;
    /// the reader is installed by `set_segment`.
    pub fn new(field: Field) -> FacetCollector<T> {
        let counters = HashMap::new();
        FacetCollector {
            field,
            counters,
            ff_reader: None,
        }
    }
}
impl<T> Collector for FacetCollector<T>
where
    T: FastFieldReader,
    T::ValueType: Eq + Hash,
{
    /// Opens the fast field reader of `self.field` for the new segment.
    ///
    /// Fails if the segment cannot provide that reader.
    fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
        let segment_ff_reader: T = reader.get_fast_field_reader(self.field)?;
        self.ff_reader = Some(segment_ff_reader);
        Ok(())
    }

    /// Increments the counter of the fast field value held by `doc`.
    ///
    /// # Panics
    ///
    /// Panics if no segment has been set yet.
    fn collect(&mut self, doc: DocId, _: Score) {
        let ff_reader = self.ff_reader
            .as_ref()
            .expect("collect() was called before set_segment. This should never happen.");
        let facet_value = ff_reader.get(doc);
        let counter = self.counters.entry(facet_value).or_insert(0);
        *counter += 1;
    }
}
#[cfg(test)]
mod tests {

    use collector::{chain, FacetCollector};
    use query::QueryParser;
    use fastfield::{I64FastFieldReader, U64FastFieldReader};
    use schema::{self, FAST, STRING};
    use Index;

    // Indexes 10 documents where `num_i64` is `i % 3` and `num_u64` is
    // `i % 2`, then checks that the facet counters of both fields hold
    // the expected number of documents for every value.
    #[test]
    fn test_facet_collector_results() {
        let mut schema_builder = schema::SchemaBuilder::new();
        let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
        let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
        let text_field = schema_builder.add_text_field("text", STRING);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema.clone());
        {
            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
            {
                for i in 0u64..10u64 {
                    index_writer.add_document(doc!(
                        num_field_i64 => ((i as i64) % 3i64) as i64,
                        num_field_u64 => (i % 2u64) as u64,
                        text_field => "text"
                    ));
                }
            }
            assert_eq!(index_writer.commit().unwrap(), 10u64);
        }
        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let mut ffvf_i64: FacetCollector<I64FastFieldReader> = FacetCollector::new(num_field_i64);
        let mut ffvf_u64: FacetCollector<U64FastFieldReader> = FacetCollector::new(num_field_u64);
        {
            // Run a query matching every document through both collectors.
            let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
            let query_parser = QueryParser::for_index(&index, vec![text_field]);
            let query = query_parser.parse_query("text:text").unwrap();
            query.search(&searcher, &mut facet_collectors).unwrap();
        }
        // i % 2 over 0..10: five even and five odd documents.
        assert_eq!(ffvf_u64.counters[&0], 5);
        assert_eq!(ffvf_u64.counters[&1], 5);
        // i % 3 over 0..10: {0, 3, 6, 9}, {1, 4, 7} and {2, 5, 8}.
        assert_eq!(ffvf_i64.counters[&0], 4);
        assert_eq!(ffvf_i64.counters[&1], 3);
        assert_eq!(ffvf_i64.counters[&2], 3);
    }
}

View File

@@ -1,3 +1,7 @@
/*!
Defines how the documents matching a search query should be processed.
*/
use SegmentReader;
use SegmentLocalId;
use DocId;
@@ -13,14 +17,17 @@ pub use self::multi_collector::MultiCollector;
mod top_collector;
pub use self::top_collector::TopCollector;
mod facet_collector;
pub use self::facet_collector::FacetCollector;
mod chained_collector;
pub use self::chained_collector::chain;
/// Collectors are in charge of collecting and retaining relevant
/// Collectors are in charge of collecting and retaining relevant
/// information from the document found and scored by the query.
///
///
/// For instance,
/// For instance,
///
/// - keeping track of the top 10 best documents
/// - computing a breakdown over a fast field
@@ -29,7 +36,7 @@ pub use self::chained_collector::chain;
/// Queries are in charge of pushing the `DocSet` to the collector.
///
/// As they work on multiple segments, they first inform
/// the collector of a change in a segment and then
/// the collector of a change in a segment and then
/// call the `collect` method to push the document to the collector.
///
/// Temporally, our collector will receive calls
@@ -46,16 +53,23 @@ pub use self::chained_collector::chain;
///
/// Segments are not guaranteed to be visited in any specific order.
pub trait Collector {
/// `set_segment` is called before beginning to enumerate
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()>;
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
}
impl<'a, C: Collector> Collector for &'a mut C {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
@@ -64,7 +78,6 @@ impl<'a, C: Collector> Collector for &'a mut C {
}
}
#[cfg(test)]
pub mod tests {
@@ -77,7 +90,7 @@ pub mod tests {
use fastfield::U64FastFieldReader;
use fastfield::FastFieldReader;
use schema::Field;
/// Stores all of the doc ids.
/// This collector is only used for tests.
/// It is unusable in practice, as it does not store
@@ -90,7 +103,7 @@ pub mod tests {
impl TestCollector {
/// Returns the exhaustive list of documents.
pub fn docs(self,) -> Vec<DocId> {
pub fn docs(self) -> Vec<DocId> {
self.docs
}
}
@@ -106,7 +119,6 @@ pub mod tests {
}
impl Collector for TestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
@@ -117,10 +129,7 @@ pub mod tests {
self.docs.push(doc + self.offset);
}
}
/// Collects in order all of the fast fields for all of the
/// doc in the `DocSet`
///
@@ -140,11 +149,11 @@ pub mod tests {
}
}
pub fn vals(self,) -> Vec<u64> {
pub fn vals(self) -> Vec<u64> {
self.vals
}
}
impl Collector for FastFieldTestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
@@ -157,7 +166,6 @@ pub mod tests {
}
}
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {

View File

@@ -5,9 +5,8 @@ use Result;
use SegmentReader;
use SegmentLocalId;
/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector types is unknown
/// It should only be used for use cases where the Collector types are unknown
/// at compile time.
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
pub struct MultiCollector<'a> {
@@ -23,9 +22,12 @@ impl<'a> MultiCollector<'a> {
}
}
impl<'a> Collector for MultiCollector<'a> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
for collector in &mut self.collectors {
try!(collector.set_segment(segment_local_id, segment));
}
@@ -39,8 +41,6 @@ impl<'a> Collector for MultiCollector<'a> {
}
}
#[cfg(test)]
mod tests {
@@ -52,7 +52,8 @@ mod tests {
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = MultiCollector::from(vec!(&mut top_collector, &mut count_collector));
let mut collectors =
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);

View File

@@ -12,8 +12,7 @@ use Score;
#[derive(Clone, Copy)]
struct GlobalScoredDoc {
score: Score,
doc_address: DocAddress
doc_address: DocAddress,
}
impl PartialOrd for GlobalScoredDoc {
@@ -25,10 +24,10 @@ impl PartialOrd for GlobalScoredDoc {
impl Ord for GlobalScoredDoc {
#[inline]
fn cmp(&self, other: &GlobalScoredDoc) -> Ordering {
other.score.partial_cmp(&self.score)
.unwrap_or(
other.doc_address.cmp(&self.doc_address)
)
other
.score
.partial_cmp(&self.score)
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
}
}
@@ -40,7 +39,6 @@ impl PartialEq for GlobalScoredDoc {
impl Eq for GlobalScoredDoc {}
/// The Top Collector keeps track of the K documents
/// with the best scores.
///
@@ -53,7 +51,6 @@ pub struct TopCollector {
}
impl TopCollector {
/// Creates a top collector, with a number of documents equal to "limit".
///
/// # Panics
@@ -68,9 +65,9 @@ impl TopCollector {
segment_id: 0,
}
}
/// Returns K best documents sorted in decreasing order.
///
///
/// Calling this method triggers the sort.
/// The result of the sort is not cached.
pub fn docs(&self) -> Vec<DocAddress> {
@@ -81,30 +78,27 @@ impl TopCollector {
}
/// Returns K best ScoredDocument sorted in decreasing order.
///
///
/// Calling this method triggers the sort.
/// The result of the sort is not cached.
pub fn score_docs(&self) -> Vec<(Score, DocAddress)> {
let mut scored_docs: Vec<GlobalScoredDoc> = self.heap
.iter()
.cloned()
.collect();
let mut scored_docs: Vec<GlobalScoredDoc> = self.heap.iter().cloned().collect();
scored_docs.sort();
scored_docs.into_iter()
.map(|GlobalScoredDoc {score, doc_address}| (score, doc_address))
scored_docs
.into_iter()
.map(|GlobalScoredDoc { score, doc_address }| (score, doc_address))
.collect()
}
/// Return true iff at least K documents have gone through
/// the collector.
#[inline]
pub fn at_capacity(&self, ) -> bool {
pub fn at_capacity(&self) -> bool {
self.heap.len() >= self.limit
}
}
impl Collector for TopCollector {
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
self.segment_id = segment_id;
Ok(())
@@ -113,25 +107,26 @@ impl Collector for TopCollector {
fn collect(&mut self, doc: DocId, score: Score) {
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.
let limit_doc: GlobalScoredDoc = *self.heap.peek().expect("Top collector with size 0 is forbidden");
let limit_doc: GlobalScoredDoc = *self.heap
.peek()
.expect("Top collector with size 0 is forbidden");
if limit_doc.score < score {
let mut mut_head = self.heap.peek_mut().expect("Top collector with size 0 is forbidden");
let mut mut_head = self.heap
.peek_mut()
.expect("Top collector with size 0 is forbidden");
mut_head.score = score;
mut_head.doc_address = DocAddress(self.segment_id, doc);
mut_head.doc_address = DocAddress(self.segment_id, doc);
}
}
else {
} else {
let wrapped_doc = GlobalScoredDoc {
score: score,
doc_address: DocAddress(self.segment_id, doc)
doc_address: DocAddress(self.segment_id, doc),
};
self.heap.push(wrapped_doc);
}
}
}
#[cfg(test)]
mod tests {
@@ -147,13 +142,12 @@ mod tests {
top_collector.collect(3, 0.2);
top_collector.collect(5, 0.3);
assert!(!top_collector.at_capacity());
let score_docs: Vec<(Score, DocId)> = top_collector.score_docs()
let score_docs: Vec<(Score, DocId)> = top_collector
.score_docs()
.into_iter()
.map(|(score, doc_address)| (score, doc_address.doc()))
.collect();
assert_eq!(score_docs, vec!(
(0.8, 1), (0.3, 5), (0.2, 3),
));
assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]);
}
#[test]
@@ -171,9 +165,7 @@ mod tests {
.into_iter()
.map(|(score, doc_address)| (score, doc_address.doc()))
.collect();
assert_eq!(score_docs, vec!(
(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)
));
assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]);
}
{
let docs: Vec<DocId> = top_collector
@@ -181,10 +173,8 @@ mod tests {
.into_iter()
.map(|doc_address| doc_address.doc())
.collect();
assert_eq!(docs, vec!(7, 1, 5, 3));
assert_eq!(docs, vec![7, 1, 5, 3]);
}
}
#[test]

View File

@@ -2,23 +2,23 @@ use std::io::Write;
use std::io;
use common::serialize::BinarySerializable;
use std::mem;
use std::ops::Deref;
/// Computes the number of bits that will be used for bitpacking.
///
/// In general the target is the minimum number of bits
/// In general the target is the minimum number of bits
/// required to express the amplitude given in argument.
///
/// e.g. If the amplitude is 10, we can store all ints on simply 4bits.
///
///
/// The logic is slightly more convoluted here as for optimization
/// reasons, we want to ensure that a value spans over at most 8 bytes
/// of aligned bytes.
///
/// Spawning over 9 bytes is possible for instance, if we do
///
/// Spanning over 9 bytes is possible for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int will start on bit
/// 63 (which belongs to byte 7) and ends at byte 15;
/// 63 (which belongs to byte 7) and ends at byte 15;
/// Hence 9 bytes (from byte 7 to byte 15 included).
///
/// To avoid this, we force the number of bits to 64bits
@@ -32,8 +32,7 @@ pub fn compute_num_bits(amplitude: u64) -> u8 {
let amplitude = (64u32 - amplitude.leading_zeros()) as u8;
if amplitude <= 64 - 8 {
amplitude
}
else {
} else {
64
}
}
@@ -42,115 +41,144 @@ pub struct BitPacker {
mini_buffer: u64,
mini_buffer_written: usize,
num_bits: usize,
written_size: usize,
}
impl BitPacker {
pub fn new(num_bits: usize) -> BitPacker {
BitPacker {
mini_buffer: 0u64,
mini_buffer_written: 0,
num_bits: num_bits,
written_size: 0,
num_bits,
}
}
pub fn write<TWrite: Write>(&mut self, val: u64, output: &mut TWrite) -> io::Result<()> {
let val_u64 = val as u64;
if self.mini_buffer_written + self.num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
self.written_size += self.mini_buffer.serialize(output)?;
self.mini_buffer.serialize(output)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + (self.num_bits as usize) - 64;
}
else {
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += self.num_bits;
if self.mini_buffer_written == 64 {
self.written_size += self.mini_buffer.serialize(output)?;
self.mini_buffer.serialize(output)?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}
}
}
Ok(())
}
fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()>{
pub(crate) fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
output.write_all(&arr[..num_bytes])?;
self.written_size += num_bytes;
self.mini_buffer_written = 0;
}
Ok(())
}
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<usize> {
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?;
Ok(self.written_size)
// Padding the write file to simplify reads.
output.write_all(&[0u8; 7])?;
Ok(())
}
}
pub struct BitUnpacker {
pub struct BitUnpacker<Data>
where
Data: Deref<Target = [u8]>,
{
num_bits: usize,
mask: u64,
data_ptr: *const u8,
data_len: usize,
data: Data,
}
impl BitUnpacker {
pub fn new(data: &[u8], num_bits: usize) -> BitUnpacker {
let mask: u64 =
if num_bits == 64 {
!0u64
}
else {
(1u64 << num_bits) - 1u64
};
impl<Data> BitUnpacker<Data>
where
Data: Deref<Target = [u8]>,
{
pub fn new(data: Data, num_bits: usize) -> BitUnpacker<Data> {
let mask: u64 = if num_bits == 64 {
!0u64
} else {
(1u64 << num_bits) - 1u64
};
BitUnpacker {
num_bits: num_bits,
mask: mask,
data_ptr: data.as_ptr(),
data_len: data.len()
num_bits,
mask,
data,
}
}
pub fn get(&self, idx: usize) -> u64 {
if self.num_bits == 0 {
return 0;
}
let addr = (idx * self.num_bits) / 8;
let bit_shift = idx * self.num_bits - addr * 8;
let val_unshifted_unmasked: u64;
if addr + 8 <= self.data_len {
val_unshifted_unmasked = unsafe { * (self.data_ptr.offset(addr as isize) as *const u64) };
}
else {
let mut arr = [0u8; 8];
if addr < self.data_len {
for i in 0..self.data_len - addr {
arr[i] = unsafe { *self.data_ptr.offset( (addr + i) as isize) };
let data: &[u8] = &*self.data;
let num_bits = self.num_bits;
let mask = self.mask;
let addr_in_bits = idx * num_bits;
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
if cfg!(feature = "simdcompression") {
// for simdcompression,
// the bitpacker is only used for fastfields,
// and we expect them to be always padded.
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 =
unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)
} else {
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
unsafe { *(data[addr..].as_ptr() as *const u64) }
} else {
let mut buffer = [0u8; 8];
for i in addr..data.len() {
buffer[i - addr] += data[i];
}
}
val_unshifted_unmasked = unsafe { mem::transmute::<[u8; 8], u64>(arr) };
unsafe { *(buffer[..].as_ptr() as *const u64) }
};
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & mask)
}
}
pub fn get_range(&self, start: u32, output: &mut [u64]) {
if self.num_bits == 0 {
for val in output.iter_mut() {
*val = 0;
}
} else {
let data: &[u8] = &*self.data;
let num_bits = self.num_bits;
let mask = self.mask;
let mut addr_in_bits = (start as usize) * num_bits;
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
let val_unshifted_unmasked: u64 =
unsafe { *(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;
}
}
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & self.mask)
}
}
#[cfg(test)]
mod test {
use super::{BitPacker, BitUnpacker, compute_num_bits};
use super::{compute_num_bits, BitPacker, BitUnpacker};
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
@@ -162,31 +190,30 @@ mod test {
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
fn test_bitpacker_util(len: usize, num_bits: usize) {
fn create_fastfield_bitpacker(len: usize, num_bits: usize) -> (BitUnpacker<Vec<u8>>, Vec<u64>) {
let mut data = Vec::new();
let mut bitpacker = BitPacker::new(num_bits);
let max_val: u64 = (1 << num_bits) - 1;
let vals: Vec<u64> = (0u64..len as u64).map(|i| {
if max_val == 0 {
0
}
else {
i % max_val
}
}).collect();
let vals: Vec<u64> = (0u64..len as u64)
.map(|i| if max_val == 0 { 0 } else { i % max_val })
.collect();
for &val in &vals {
bitpacker.write(val, &mut data).unwrap();
}
let num_bytes = bitpacker.close(&mut data).unwrap();
assert_eq!(num_bytes, (num_bits * len + 7) / 8);
assert_eq!(data.len(), num_bytes);
let bitunpacker = BitUnpacker::new(&data, num_bits);
bitpacker.close(&mut data).unwrap();
assert_eq!(data.len(), (num_bits * len + 7) / 8 + 7);
let bitunpacker = BitUnpacker::new(data, num_bits);
(bitunpacker, vals)
}
fn test_bitpacker_util(len: usize, num_bits: usize) {
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i), *val);
}
}
#[test]
fn test_bitpacker() {
test_bitpacker_util(10, 3);
@@ -195,4 +222,17 @@ mod test {
test_bitpacker_util(6, 14);
test_bitpacker_util(1000, 14);
}
}
#[test]
fn test_bitpacker_range() {
let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
let buffer_len = 100;
let mut buffer = vec![0u64; buffer_len];
for start in vec![0, 10, 20, 100, 1_000] {
bitunpacker.get_range(start as u32, &mut buffer[..]);
for i in 0..buffer_len {
assert_eq!(buffer[i], vals[start + i]);
}
}
}
}

View File

@@ -0,0 +1,184 @@
use std::io::Write;
use common::CountingWriter;
use std::collections::HashMap;
use schema::Field;
use common::VInt;
use directory::WritePtr;
use std::io;
use directory::ReadOnlySource;
use common::BinarySerializable;
/// Writer half of a `CompositeFile`.
///
/// The payloads of the different fields are written one after
/// the other into a single underlying writer. The offset at which
/// each field starts is recorded, and emitted as a footer when
/// the composite file is closed.
pub struct CompositeWrite<W = WritePtr> {
    write: CountingWriter<W>,
    offsets: HashMap<Field, usize>,
}

impl<W: Write> CompositeWrite<W> {
    /// Creates a new composite writer on top of the given writer.
    pub fn wrap(w: W) -> CompositeWrite<W> {
        let write = CountingWriter::wrap(w);
        let offsets = HashMap::new();
        CompositeWrite { write, offsets }
    }

    /// Starts a new field, and returns the writer in which
    /// the payload of this field has to be written.
    ///
    /// # Panics
    ///
    /// Panics if the field has already been started.
    pub fn for_field(&mut self, field: Field) -> &mut CountingWriter<W> {
        assert!(!self.offsets.contains_key(&field));
        // The field starts wherever the previous one stopped.
        let start_offset = self.write.written_bytes();
        self.offsets.insert(field, start_offset);
        &mut self.write
    }

    /// Closes the composite file.
    ///
    /// The footer consists of the number of fields, followed by
    /// one `(delta-encoded start offset, field)` pair per field,
    /// and finally the length of the footer itself as a `u32`.
    pub fn close(mut self) -> io::Result<()> {
        let footer_offset = self.write.written_bytes();
        VInt(self.offsets.len() as u64).serialize(&mut self.write)?;

        // Entries are sorted by offset so that the offsets
        // can be written as deltas.
        let mut entries: Vec<(usize, Field)> = self.offsets
            .iter()
            .map(|(&field, &offset)| (offset, field))
            .collect();
        entries.sort();

        let mut prev_offset = 0;
        for (offset, field) in entries {
            let delta = offset - prev_offset;
            VInt(delta as u64).serialize(&mut self.write)?;
            field.serialize(&mut self.write)?;
            prev_offset = offset;
        }

        let footer_len = (self.write.written_bytes() - footer_offset) as u32;
        footer_len.serialize(&mut self.write)?;
        self.write.flush()
    }
}
/// A composite file is an abstraction to store a
/// file partitioned by field.
///
/// The file needs to be written field by field.
/// A footer describes the start and stop offsets
/// for each field.
#[derive(Clone)]
pub struct CompositeFile {
    data: ReadOnlySource,
    offsets_index: HashMap<Field, (usize, usize)>,
}

impl CompositeFile {
    /// Opens a composite file stored in a given
    /// `ReadOnlySource`.
    ///
    /// The last 4 bytes of the source hold the length of the footer
    /// as a `u32`. The footer itself starts with the number of fields,
    /// followed by one `(delta-encoded start offset, field)` pair per
    /// field. A field stops where the next one starts; the last field
    /// stops where the footer starts.
    ///
    /// # Errors
    ///
    /// Returns an `io::ErrorKind::InvalidData` error if the source is
    /// too short to hold the footer it announces, or if a field offset
    /// points past the beginning of the footer. Errors raised while
    /// deserializing the footer are propagated as is.
    pub fn open(data: &ReadOnlySource) -> io::Result<CompositeFile> {
        fn corrupted(msg: &'static str) -> io::Error {
            io::Error::new(io::ErrorKind::InvalidData, msg)
        }

        let end = data.len();
        // Checked arithmetic: a truncated file must yield an error,
        // not an integer underflow.
        let footer_len_offset = end.checked_sub(4)
            .ok_or_else(|| corrupted("Composite file is too short to contain a footer length"))?;
        let footer_len_data = data.slice_from(footer_len_offset);
        let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
        let footer_start = footer_len_offset.checked_sub(footer_len)
            .ok_or_else(|| corrupted("Composite file footer length exceeds the file length"))?;
        let footer_data = data.slice(footer_start, footer_len_offset);
        let mut footer_buffer = footer_data.as_slice();
        let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;

        let mut fields = vec![];
        let mut offsets = vec![];
        let mut offset: usize = 0;
        for _ in 0..num_fields {
            let delta = VInt::deserialize(&mut footer_buffer)?.0 as usize;
            // Offsets are stored as deltas. A valid start offset can
            // never lie beyond the beginning of the footer.
            offset = match offset.checked_add(delta) {
                Some(offset) if offset <= footer_start => offset,
                _ => return Err(corrupted("Composite file field offset is out of bounds")),
            };
            let field = Field::deserialize(&mut footer_buffer)?;
            offsets.push(offset);
            fields.push(field);
        }
        // Sentinel: the last field stops where the footer starts.
        offsets.push(footer_start);

        let offsets_index: HashMap<Field, (usize, usize)> = fields
            .into_iter()
            .zip(offsets.windows(2))
            .map(|(field, bounds)| (field, (bounds[0], bounds[1])))
            .collect();

        Ok(CompositeFile {
            data: data.slice_to(footer_start),
            offsets_index,
        })
    }

    /// Returns a composite file that stores
    /// no fields.
    pub fn empty() -> CompositeFile {
        CompositeFile {
            offsets_index: HashMap::new(),
            data: ReadOnlySource::empty(),
        }
    }

    /// Returns the `ReadOnlySource` associated
    /// to a given `Field` and stored in a `CompositeFile`.
    ///
    /// Returns `None` if the composite file holds no
    /// data for this field.
    pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
        self.offsets_index
            .get(&field)
            .map(|&(from, to)| self.data.slice(from, to))
    }
}
#[cfg(test)]
mod test {
use std::io::Write;
use super::{CompositeFile, CompositeWrite};
use directory::{Directory, RAMDirectory};
use schema::Field;
use common::VInt;
use common::BinarySerializable;
use std::path::Path;
// Round-trip test: writes one `VInt` for each of two fields into a
// composite file, then reopens the file and checks that each field
// slice holds exactly the `VInt` that was written for it.
#[test]
fn test_composite_file() {
let path = Path::new("test_path");
let mut directory = RAMDirectory::create();
// Write phase.
{
let w = directory.open_write(path).unwrap();
let mut composite_write = CompositeWrite::wrap(w);
{
let mut write_0 = composite_write.for_field(Field(0u32));
VInt(32431123u64).serialize(&mut write_0).unwrap();
write_0.flush().unwrap();
}
{
let mut write_4 = composite_write.for_field(Field(4u32));
VInt(2).serialize(&mut write_4).unwrap();
write_4.flush().unwrap();
}
// Closing writes the footer indexing the field offsets.
composite_write.close().unwrap();
}
// Read phase.
{
let r = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&r).unwrap();
{
let file0 = composite_file.open_read(Field(0u32)).unwrap();
let mut file0_buf = file0.as_slice();
let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0;
// Nothing must be left in the slice once the `VInt` is consumed.
assert_eq!(file0_buf.len(), 0);
assert_eq!(payload_0, 32431123u64);
}
{
let file4 = composite_file.open_read(Field(4u32)).unwrap();
let mut file4_buf = file4.as_slice();
let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0;
assert_eq!(file4_buf.len(), 0);
assert_eq!(payload_4, 2u64);
}
}
}
}

View File

@@ -0,0 +1,55 @@
use std::io::Write;
use std::io;
/// A `Write` adapter that keeps track of the number of bytes
/// that were written into the writer it wraps.
pub struct CountingWriter<W> {
    underlying: W,
    written_bytes: usize,
}

impl<W: Write> CountingWriter<W> {
    /// Wraps the given writer. The byte count starts at zero.
    pub fn wrap(underlying: W) -> CountingWriter<W> {
        CountingWriter { underlying, written_bytes: 0 }
    }

    /// Returns the number of bytes written so far
    /// through this `CountingWriter`.
    pub fn written_bytes(&self) -> usize {
        self.written_bytes
    }

    /// Flushes, then returns the wrapped writer together with
    /// the total number of bytes that were written.
    pub fn finish(mut self) -> io::Result<(W, usize)> {
        self.flush()?;
        let CountingWriter { underlying, written_bytes } = self;
        Ok((underlying, written_bytes))
    }
}

impl<W: Write> Write for CountingWriter<W> {
    /// Forwards the write to the wrapped writer, and only counts
    /// the bytes the wrapped writer reports as written.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.underlying.write(buf).map(|num_written| {
            self.written_bytes += num_written;
            num_written
        })
    }

    fn flush(&mut self) -> io::Result<()> {
        self.underlying.flush()
    }
}
#[cfg(test)]
mod test {
use super::CountingWriter;
use std::io::Write;
// Writes 10 bytes through a `CountingWriter` wrapping a `Vec<u8>` and
// checks that both the reported count and the wrapped buffer agree.
#[test]
fn test_counting_writer() {
let buffer: Vec<u8> = vec![];
let mut counting_writer = CountingWriter::wrap(buffer);
let bytes = (0u8..10u8).collect::<Vec<u8>>();
counting_writer.write_all(&bytes).unwrap();
// `finish` hands back the wrapped writer and the byte count.
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
assert_eq!(len, 10);
assert_eq!(w.len(), 10);
}
}

View File

@@ -1,13 +1,18 @@
mod serialize;
mod timer;
mod vint;
mod counting_writer;
mod composite_file;
pub mod bitpacker;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::serialize::BinarySerializable;
pub use self::timer::Timing;
pub use self::timer::TimerTree;
pub use self::timer::OpenTimer;
pub use self::vint::VInt;
pub use self::counting_writer::CountingWriter;
use std::io;
/// Create a default io error given a string.
@@ -15,47 +20,50 @@ pub fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
/// Has length trait
pub trait HasLen {
/// Return length
fn len(&self,) -> usize;
fn len(&self) -> usize;
/// Returns true iff empty.
fn is_empty(&self,) -> bool {
fn is_empty(&self) -> bool {
self.len() == 0
}
}
const HIGHEST_BIT: u64 = 1 << 63;
/// Maps a `i64` to `u64`
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64`
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline(always)]
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
/// Reverse the mapping given by
/// `i64_to_u64`.
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline(always)]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
#[cfg(test)]
mod test {
@@ -76,4 +84,4 @@ mod test {
test_i64_converter_helper(i);
}
}
}
}

View File

@@ -6,110 +6,103 @@ use std::io::Read;
use std::io;
use common::VInt;
pub trait BinarySerializable : fmt::Debug + Sized {
fn serialize(&self, writer: &mut Write) -> io::Result<usize>;
fn deserialize(reader: &mut Read) -> io::Result<Self>;
pub trait BinarySerializable: fmt::Debug + Sized {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()>;
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
}
impl BinarySerializable for () {
fn serialize(&self, _: &mut Write) -> io::Result<usize> {
Ok(0)
fn serialize<W: Write>(&self, _: &mut W) -> io::Result<()> {
Ok(())
}
fn deserialize(_: &mut Read) -> io::Result<Self> {
fn deserialize<R: Read>(_: &mut R) -> io::Result<Self> {
Ok(())
}
}
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let mut total_size = try!(VInt(self.len() as u64).serialize(writer));
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
for it in self {
total_size += try!(it.serialize(writer));
it.serialize(writer)?;
}
Ok(total_size)
Ok(())
}
fn deserialize(reader: &mut Read) -> io::Result<Vec<T>> {
let num_items = try!(VInt::deserialize(reader)).val();
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = try!(T::deserialize(reader));
let item = T::deserialize(reader)?;
items.push(item);
}
Ok(items)
}
}
impl<Left: BinarySerializable, Right: BinarySerializable> BinarySerializable for (Left, Right) {
fn serialize(&self, write: &mut Write) -> io::Result<usize> {
Ok(try!(self.0.serialize(write)) + try!(self.1.serialize(write)))
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
self.0.serialize(write)?;
self.1.serialize(write)
}
fn deserialize(reader: &mut Read) -> io::Result<Self> {
Ok( (try!(Left::deserialize(reader)), try!(Right::deserialize(reader))) )
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
Ok((Left::deserialize(reader)?, Right::deserialize(reader)?))
}
}
impl BinarySerializable for u32 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u32::<Endianness>(*self)
.map(|_| 4)
}
fn deserialize(reader: &mut Read) -> io::Result<u32> {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u32> {
reader.read_u32::<Endianness>()
}
}
impl BinarySerializable for u64 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u64::<Endianness>(*self)
.map(|_| 8)
}
fn deserialize(reader: &mut Read) -> io::Result<u64> {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
reader.read_u64::<Endianness>()
}
}
impl BinarySerializable for i64 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_i64::<Endianness>(*self)
.map(|_| 8)
}
fn deserialize(reader: &mut Read) -> io::Result<i64> {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
reader.read_i64::<Endianness>()
}
}
impl BinarySerializable for u8 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
try!(writer.write_u8(*self));
Ok(1)
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u8(*self)
}
fn deserialize(reader: &mut Read) -> io::Result<u8> {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u8> {
reader.read_u8()
}
}
impl BinarySerializable for String {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
let mut size = try!(VInt(data.len() as u64).serialize(writer));
size += data.len();
try!(writer.write_all(data));
Ok(size)
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize(reader: &mut Read) -> io::Result<String> {
let string_length = try!(VInt::deserialize(reader)).val() as usize;
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
try!(reader.take(string_length as u64).read_to_string(&mut result));
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(result)
}
}
#[cfg(test)]
mod test {
@@ -118,12 +111,10 @@ mod test {
fn serialize_test<T: BinarySerializable + Eq>(v: T, num_bytes: usize) {
let mut buffer: Vec<u8> = Vec::new();
if num_bytes != 0 {
assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
v.serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), num_bytes);
}
else {
} else {
v.serialize(&mut buffer).unwrap();
}
let mut cursor = &buffer[..];
@@ -147,15 +138,15 @@ mod test {
#[test]
fn test_serialize_string() {
serialize_test(String::from(""), 1);
serialize_test(String::from("ぽよぽよ"), 1 + 3*4);
serialize_test(String::from("富士さん見える。"), 1 + 3*8);
serialize_test(String::from("ぽよぽよ"), 1 + 3 * 4);
serialize_test(String::from("富士さん見える。"), 1 + 3 * 8);
}
#[test]
fn test_serialize_vec() {
let v: Vec<u8> = Vec::new();
serialize_test(v, 1);
serialize_test(vec!(1u32, 3u32), 1 + 4*2);
serialize_test(vec![1u32, 3u32], 1 + 4 * 2);
}
#[test]

View File

@@ -10,7 +10,7 @@ pub struct OpenTimer<'a> {
impl<'a> OpenTimer<'a> {
/// Starts timing a new named subtask
///
/// The timer is stopped automatically
/// The timer is stopped automatically
/// when the `OpenTimer` is dropped.
pub fn open(&mut self, name: &'static str) -> OpenTimer {
OpenTimer {
@@ -23,10 +23,13 @@ impl<'a> OpenTimer<'a> {
}
impl<'a> Drop for OpenTimer<'a> {
fn drop(&mut self,) {
self.timer_tree.timings.push(Timing {
fn drop(&mut self) {
self.timer_tree.timings.push(Timing {
name: self.name,
duration: self.start.to(PreciseTime::now()).num_microseconds().unwrap(),
duration: self.start
.to(PreciseTime::now())
.num_microseconds()
.unwrap(),
depth: self.depth,
});
}
@@ -47,12 +50,11 @@ pub struct TimerTree {
}
impl TimerTree {
/// Returns the total time elapsed in microseconds
pub fn total_time(&self,) -> i64 {
/// Returns the total time elapsed in microseconds
pub fn total_time(&self) -> i64 {
self.timings.last().unwrap().duration
}
/// Open a new named subtask
pub fn open(&mut self, name: &'static str) -> OpenTimer {
OpenTimer {
@@ -72,7 +74,6 @@ impl Default for TimerTree {
}
}
#[cfg(test)]
mod tests {

View File

@@ -3,59 +3,55 @@ use std::io;
use std::io::Write;
use std::io::Read;
/// Wrapper over a `u64` that serializes as a variable int.
/// Wrapper over a `u64` that serializes as a variable int.
#[derive(Debug, Eq, PartialEq)]
pub struct VInt(pub u64);
impl VInt {
pub fn val(&self,) -> u64 {
pub fn val(&self) -> u64 {
self.0
}
}
impl BinarySerializable for VInt {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let mut remaining = self.0;
let mut written: usize = 0;
let mut buffer = [0u8; 10];
let mut i = 0;
loop {
let next_byte: u8 = (remaining % 128u64) as u8;
remaining /= 128u64;
if remaining == 0u64 {
buffer[written] = next_byte | 128u8;
written += 1;
break;
}
else {
buffer[written] = next_byte;
written += 1;
buffer[i] = next_byte | 128u8;
return writer.write_all(&buffer[0..i + 1]);
} else {
buffer[i] = next_byte;
}
i += 1;
}
try!(writer.write_all(&buffer[0..written]));
Ok(written)
}
fn deserialize(reader: &mut Read) -> io::Result<Self> {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let mut bytes = reader.bytes();
let mut result = 0u64;
let mut shift = 0u64;
loop {
match bytes.next() {
Some(Ok(b)) => {
result += ((b % 128u8) as u64) << shift;
result += u64::from(b % 128u8) << shift;
if b & 128u8 != 0u8 {
break;
}
shift += 7;
}
_ => {
return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer"))
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Reach end of buffer",
))
}
}
}
Ok(VInt(result))
}
}

View File

@@ -1,159 +0,0 @@
use super::{BlockEncoder, BlockDecoder};
use super::NUM_DOCS_PER_BLOCK;
use compression::{VIntEncoder, VIntDecoder};
/// Compresses whole arrays of `u32`.
///
/// The input is cut into blocks of `NUM_DOCS_PER_BLOCK` values, each
/// compressed with the block encoder. The values left over after the
/// last full block are compressed with the vint encoder.
pub struct CompositeEncoder {
    block_encoder: BlockEncoder,
    output: Vec<u8>,
}

impl CompositeEncoder {
    /// Creates an encoder with a preallocated output buffer.
    pub fn new() -> CompositeEncoder {
        let block_encoder = BlockEncoder::new();
        let output = Vec::with_capacity(500_000);
        CompositeEncoder { block_encoder, output }
    }

    /// Compresses `vals` using the `*_sorted` variants of the block
    /// and vint encoders.
    ///
    /// The last value of each block is passed as the offset of the
    /// following block. The returned slice is valid until the next
    /// call to a `compress_*` method.
    pub fn compress_sorted(&mut self, vals: &[u32]) -> &[u8] {
        self.output.clear();
        let num_vals_in_blocks = (vals.len() / NUM_DOCS_PER_BLOCK) * NUM_DOCS_PER_BLOCK;
        let (full_blocks, remainder) = vals.split_at(num_vals_in_blocks);
        let mut offset = 0u32;
        for block in full_blocks.chunks(NUM_DOCS_PER_BLOCK) {
            let block_compressed = self.block_encoder.compress_block_sorted(block, offset);
            self.output.extend_from_slice(block_compressed);
            offset = block[NUM_DOCS_PER_BLOCK - 1];
        }
        let vint_compressed = self.block_encoder.compress_vint_sorted(remainder, offset);
        self.output.extend_from_slice(vint_compressed);
        &self.output
    }

    /// Compresses `vals` using the `*_unsorted` variants of the block
    /// and vint encoders.
    ///
    /// The returned slice is valid until the next call to a
    /// `compress_*` method.
    pub fn compress_unsorted(&mut self, vals: &[u32]) -> &[u8] {
        self.output.clear();
        let num_vals_in_blocks = (vals.len() / NUM_DOCS_PER_BLOCK) * NUM_DOCS_PER_BLOCK;
        let (full_blocks, remainder) = vals.split_at(num_vals_in_blocks);
        for block in full_blocks.chunks(NUM_DOCS_PER_BLOCK) {
            let block_compressed = self.block_encoder.compress_block_unsorted(block);
            self.output.extend_from_slice(block_compressed);
        }
        let vint_compressed = self.block_encoder.compress_vint_unsorted(remainder);
        self.output.extend_from_slice(vint_compressed);
        &self.output
    }
}
/// Decompresses arrays of `u32` that were produced by
/// `CompositeEncoder`.
///
/// The decoded values are accumulated in an internal buffer
/// that is reused from one call to the next.
pub struct CompositeDecoder {
    block_decoder: BlockDecoder,
    vals: Vec<u32>,
}

impl CompositeDecoder {
    /// Creates a decoder with a preallocated output buffer.
    pub fn new() -> CompositeDecoder {
        CompositeDecoder {
            block_decoder: BlockDecoder::new(),
            vals: Vec::with_capacity(500_000),
        }
    }

    /// Decompresses `uncompressed_len` values out of `compressed_data`,
    /// using the `*_sorted` variants of the block and vint decoders.
    ///
    /// The last value of each block is passed as the offset of the
    /// following block. The returned slice is valid until the next
    /// call to an `uncompress_*` method.
    pub fn uncompress_sorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] {
        self.vals.clear();
        // `Vec::reserve` counts from the current length, not from the
        // capacity. The buffer was just cleared, so this guarantees
        // room for `uncompressed_len` values.
        self.vals.reserve(uncompressed_len);
        let mut offset = 0u32;
        let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
        for _ in 0..num_blocks {
            compressed_data = self.block_decoder.uncompress_block_sorted(compressed_data, offset);
            offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
            self.vals.extend_from_slice(self.block_decoder.output_array());
        }
        // The values left over after the last full block are vint encoded.
        self.block_decoder.uncompress_vint_sorted(compressed_data, offset, uncompressed_len % NUM_DOCS_PER_BLOCK);
        self.vals.extend_from_slice(self.block_decoder.output_array());
        &self.vals
    }

    /// Decompresses `uncompressed_len` values out of `compressed_data`,
    /// using the `*_unsorted` variants of the block and vint decoders.
    ///
    /// The returned slice is valid until the next call to an
    /// `uncompress_*` method.
    pub fn uncompress_unsorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] {
        self.vals.clear();
        // Same pre-sizing as in `uncompress_sorted`.
        self.vals.reserve(uncompressed_len);
        let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
        for _ in 0..num_blocks {
            compressed_data = self.block_decoder.uncompress_block_unsorted(compressed_data);
            self.vals.extend_from_slice(self.block_decoder.output_array());
        }
        // The values left over after the last full block are vint encoded.
        self.block_decoder.uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK);
        self.vals.extend_from_slice(self.block_decoder.output_array());
        &self.vals
    }
}
impl Into<Vec<u32>> for CompositeDecoder {
fn into(self) -> Vec<u32> {
self.vals
}
}
#[cfg(test)]
pub mod tests {
use test::Bencher;
use super::*;
use compression::tests::generate_array;
// Round-trips 10_000 generated values through the unsorted codec,
// and checks an upper bound on the compressed size.
#[test]
fn test_composite_unsorted() {
let data = generate_array(10_000, 0.1);
let mut encoder = CompositeEncoder::new();
let compressed = encoder.compress_unsorted(&data);
assert!(compressed.len() <= 19_794);
let mut decoder = CompositeDecoder::new();
let result = decoder.uncompress_unsorted(&compressed, data.len());
for i in 0..data.len() {
assert_eq!(data[i], result[i]);
}
}
// Same round-trip as above, through the sorted codec.
#[test]
fn test_composite_sorted() {
let data = generate_array(10_000, 0.1);
let mut encoder = CompositeEncoder::new();
let compressed = encoder.compress_sorted(&data);
assert!(compressed.len() <= 7_826);
let mut decoder = CompositeDecoder::new();
let result = decoder.uncompress_sorted(&compressed, data.len());
for i in 0..data.len() {
assert_eq!(data[i], result[i]);
}
}
// NOTE(review): 99_968 = 781 * 128, so presumably the benchmarks only
// exercise full blocks (assuming NUM_DOCS_PER_BLOCK is 128) - confirm.
const BENCH_NUM_INTS: usize = 99_968;
// Benchmarks the compression of a sorted array.
#[bench]
fn bench_compress(b: &mut Bencher) {
let mut encoder = CompositeEncoder::new();
let data = generate_array(BENCH_NUM_INTS, 0.1);
b.iter(|| {
encoder.compress_sorted(&data);
});
}
// Benchmarks the decompression of a sorted array. The compression
// happens once, outside of the measured closure.
#[bench]
fn bench_uncompress(b: &mut Bencher) {
let mut encoder = CompositeEncoder::new();
let data = generate_array(BENCH_NUM_INTS, 0.1);
let compressed = encoder.compress_sorted(&data);
let mut decoder = CompositeDecoder::new();
b.iter(|| {
decoder.uncompress_sorted(compressed, BENCH_NUM_INTS);
});
}
}

View File

@@ -1,112 +1,135 @@
#![allow(dead_code)]
mod composite;
pub use self::composite::{CompositeEncoder, CompositeDecoder};
mod stream;
pub use self::stream::CompressedIntStream;
#[cfg(not(feature="simdcompression"))]
#[cfg(not(feature = "simdcompression"))]
mod pack {
mod compression_pack_nosimd;
pub use self::compression_pack_nosimd::*;
pub use self::compression_pack_nosimd::{BlockDecoder, BlockEncoder};
}
#[cfg(feature="simdcompression")]
#[cfg(feature = "simdcompression")]
mod pack {
mod compression_pack_simd;
pub use self::compression_pack_simd::*;
pub use self::compression_pack_simd::{BlockDecoder, BlockEncoder};
}
pub use self::pack::{BlockEncoder, BlockDecoder};
pub use self::pack::{BlockDecoder, BlockEncoder};
#[cfg( any(not(feature="simdcompression"), target_env="msvc") )]
#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))]
mod vint {
mod compression_vint_nosimd;
pub use self::compression_vint_nosimd::*;
pub(crate) use self::compression_vint_nosimd::*;
}
#[cfg( all(feature="simdcompression", not(target_env="msvc")) )]
#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))]
mod vint {
mod compression_vint_simd;
pub use self::compression_vint_simd::*;
pub(crate) use self::compression_vint_simd::*;
}
/// Returns the size in bytes of a compressed block, given `num_bits`.
pub fn compressed_block_size(num_bits: u8) -> usize {
1 + (num_bits as usize) * 16
}
pub trait VIntEncoder {
/// Compresses an array of `u32` integers,
/// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
/// and variable bytes encoding.
///
/// The method takes an array of ints to compress, and returns
/// a `&[u8]` representing the compressed data.
///
/// The method also takes an offset to give the value of the
/// hypothetical previous element in the delta-encoding.
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8];
/// Compresses an array of `u32` integers,
/// using variable bytes encoding.
///
/// The method takes an array of ints to compress, and returns
/// a `&[u8]` representing the compressed data.
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8];
}
pub trait VIntDecoder {
fn uncompress_vint_sorted<'a>(&mut self, compressed_data: &'a [u8], offset: u32, num_els: usize) -> &'a [u8];
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> &'a [u8];
/// Uncompress an array of `u32` integers,
/// that were compressed using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
/// and variable bytes encoding.
///
/// The method takes a number of int to decompress, and returns
/// the amount of bytes that were read to decompress them.
///
/// The method also takes an offset to give the value of the
/// hypothetical previous element in the delta-encoding.
///
/// For instance, if delta encoded are `1, 3, 9`, and the
/// `offset` is 5, then the output will be:
/// `5 + 1 = 6, 6 + 3= 9, 9 + 9 = 18`
fn uncompress_vint_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize,
) -> usize;
/// Uncompress an array of `u32s`, compressed using variable
/// byte encoding.
///
/// The method takes a number of int to decompress, and returns
/// the amount of bytes that were read to decompress them.
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
}
impl VIntEncoder for BlockEncoder {
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8] {
vint::compress_sorted(input, &mut self.output, offset)
}
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] {
vint::compress_unsorted(input, &mut self.output)
}
}
impl VIntDecoder for BlockDecoder {
fn uncompress_vint_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize) -> &'a [u8] {
num_els: usize,
) -> usize {
self.output_len = num_els;
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted<'a>(
&mut self,
compressed_data: &'a [u8],
num_els: usize) -> &'a [u8] {
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
self.output_len = num_els;
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
}
}
pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize.
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
#[cfg(test)]
pub mod tests {
use rand::Rng;
use rand::SeedableRng;
use rand::XorShiftRng;
use super::*;
use tests;
use test::Bencher;
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0..u32::max_value())
.filter(|_| rng.next_f32()< ratio)
.take(n)
.collect()
}
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
generate_array_with_seed(n, ratio, 4)
}
#[test]
fn test_encode_sorted_block() {
let vals: Vec<u32> = (0u32..128u32).map(|i| i*7).collect();
let vals: Vec<u32> = (0u32..128u32).map(|i| i * 7).collect();
let mut encoder = BlockEncoder::new();
let compressed_data = encoder.compress_block_sorted(&vals, 0);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0);
assert_eq!(remaining_data.len(), 0);
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
assert_eq!(vals[i], decoder.output(i));
@@ -115,33 +138,33 @@ pub mod tests {
#[test]
fn test_encode_sorted_block_with_offset() {
let vals: Vec<u32> = (0u32..128u32).map(|i| 11 + i*7).collect();
let vals: Vec<u32> = (0u32..128u32).map(|i| 11 + i * 7).collect();
let mut encoder = BlockEncoder::new();
let compressed_data = encoder.compress_block_sorted(&vals, 10);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_sorted(compressed_data, 10);
assert_eq!(remaining_data.len(), 0);
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
assert_eq!(vals[i], decoder.output(i));
}
}
#[test]
fn test_encode_sorted_block_with_junk() {
let mut compressed: Vec<u8> = Vec::new();
let n = 128;
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32)*7u32).collect();
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32).collect();
let mut encoder = BlockEncoder::new();
let compressed_data = encoder.compress_block_sorted(&vals, 10);
compressed.extend_from_slice(compressed_data);
compressed.push(173u8);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_sorted(&compressed, 10);
assert_eq!(remaining_data.len(), 1);
assert_eq!(remaining_data[0], 173u8);
let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10);
assert_eq!(consumed_num_bytes, compressed.len() - 1);
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
@@ -152,82 +175,91 @@ pub mod tests {
fn test_encode_unsorted_block_with_junk() {
let mut compressed: Vec<u8> = Vec::new();
let n = 128;
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32)*7u32 % 12).collect();
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
let mut encoder = BlockEncoder::new();
let compressed_data = encoder.compress_block_unsorted(&vals);
compressed.extend_from_slice(compressed_data);
compressed.push(173u8);
let mut decoder = BlockDecoder::new();
{
let remaining_data = decoder.uncompress_block_unsorted(&compressed);
assert_eq!(remaining_data.len(), 1);
assert_eq!(remaining_data[0], 173u8);
let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed);
assert_eq!(consumed_num_bytes + 1, compressed.len());
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
}
}
#[test]
fn test_encode_vint() {
{
let expected_length = 154;
let mut encoder = BlockEncoder::new();
let input: Vec<u32> = (0u32..123u32)
.map(|i| 4 + i * 7 / 2)
.into_iter()
.collect();
let input: Vec<u32> = (0u32..123u32).map(|i| 4 + i * 7 / 2).into_iter().collect();
for offset in &[0u32, 1u32, 2u32] {
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
assert!(encoded_data.len() <= expected_length);
let mut decoder = BlockDecoder::new();
let remaining_data = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
assert_eq!(0, remaining_data.len());
let consumed_num_bytes =
decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
assert_eq!(consumed_num_bytes, encoded_data.len());
assert_eq!(input, decoder.output_array());
}
}
}
#[bench]
fn bench_compress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_DOCS_PER_BLOCK, 0.1);
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
b.iter(|| {
encoder.compress_block_sorted(&data, 0u32);
});
}
#[bench]
fn bench_uncompress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_DOCS_PER_BLOCK, 0.1);
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let compressed = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
let mut decoder = BlockDecoder::new();
b.iter(|| {
decoder.uncompress_block_sorted(compressed, 0u32);
});
}
#[test]
fn test_all_docs_compression_numbits() {
for num_bits in 0..33 {
let mut data = [0u32; 128];
if num_bits > 0 {
data[0] = 1 << (num_bits - 1);
}
let mut encoder = BlockEncoder::new();
let compressed = encoder.compress_block_unsorted(&data);
assert_eq!(compressed[0] as usize, num_bits);
assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
}
}
const NUM_INTS_BENCH_VINT: usize = 10;
#[bench]
fn bench_compress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
b.iter(|| {
encoder.compress_vint_sorted(&data, 0u32);
});
}
#[bench]
fn bench_uncompress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
let mut decoder = BlockDecoder::new();
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
});

View File

@@ -1,16 +1,17 @@
use common::bitpacker::compute_num_bits;
use common::bitpacker::{BitPacker, BitUnpacker};
use common::CountingWriter;
use std::cmp;
use std::io::Write;
use super::super::NUM_DOCS_PER_BLOCK;
use super::super::COMPRESSION_BLOCK_SIZE;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> usize {
let mut max_delta = 0;
pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usize {
let mut max_delta = 0;
{
let mut local_offset = offset;
for i in 0..NUM_DOCS_PER_BLOCK {
for i in 0..COMPRESSION_BLOCK_SIZE {
let val = vals[i];
let delta = val - local_offset;
max_delta = cmp::max(max_delta, delta);
@@ -18,54 +19,67 @@ pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) ->
local_offset = val;
}
}
let num_bits = compute_num_bits(max_delta);
output.write_all(&[num_bits]).unwrap();
let mut counting_writer = CountingWriter::wrap(output);
let num_bits = compute_num_bits(max_delta as u64);
counting_writer.write_all(&[num_bits]).unwrap();
let mut bit_packer = BitPacker::new(num_bits as usize);
for val in vals {
bit_packer.write(*val, &mut output).unwrap();
bit_packer.write(*val as u64, &mut counting_writer).unwrap();
}
1 + bit_packer.close(&mut output).expect("packing in memory should never fail")
counting_writer.written_bytes()
}
pub struct BlockEncoder {
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
input_buffer: [u32; NUM_DOCS_PER_BLOCK],
input_buffer: [u32; COMPRESSION_BLOCK_SIZE],
}
impl BlockEncoder {
pub fn new() -> BlockEncoder {
BlockEncoder {
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
input_buffer: [0u32; NUM_DOCS_PER_BLOCK],
}
input_buffer: [0u32; COMPRESSION_BLOCK_SIZE],
}
}
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
self.input_buffer.clone_from_slice(vals);
let compressed_size = compress_sorted(&mut self.input_buffer, &mut self.output, offset);
&self.output[..compressed_size]
}
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
let compressed_size: usize = {
let mut output: &mut [u8] = &mut self.output;
let max = vals.iter().cloned().max().expect("compress unsorted called with an empty array");
let num_bits = compute_num_bits(max);
output.write_all(&[num_bits]).unwrap();
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
let compressed_size = {
let output: &mut [u8] = &mut self.output;
let max = vals.iter()
.cloned()
.max()
.expect("compress unsorted called with an empty array");
let num_bits = compute_num_bits(max as u64);
let mut counting_writer = CountingWriter::wrap(output);
counting_writer.write_all(&[num_bits]).unwrap();
let mut bit_packer = BitPacker::new(num_bits as usize);
for val in vals {
bit_packer.write(*val, &mut output).unwrap();
bit_packer.write(*val as u64, &mut counting_writer).unwrap();
}
1 + bit_packer.close(&mut output).expect("packing in memory should never fail")
for _ in vals.len()..COMPRESSION_BLOCK_SIZE {
bit_packer
.write(vals[0] as u64, &mut counting_writer)
.unwrap();
}
bit_packer.flush(&mut counting_writer).expect(
"Flushing the bitpacking \
in an in RAM buffer should never fail",
);
// we avoid writing "closing", because we
// do not want 7 bytes of padding here.
counting_writer.written_bytes()
};
&self.output[..compressed_size]
}
}
pub struct BlockDecoder {
@@ -73,55 +87,56 @@ pub struct BlockDecoder {
pub output_len: usize,
}
impl BlockDecoder {
pub fn new() -> BlockDecoder {
BlockDecoder::with_val(0u32)
}
pub fn with_val(val: u32) -> BlockDecoder {
BlockDecoder {
output: [val; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
}
}
pub fn uncompress_block_sorted<'a>(&mut self, compressed_data: &'a [u8], mut offset: u32) -> &'a[u8] {
pub fn uncompress_block_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
mut offset: u32,
) -> usize {
let consumed_size = {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
for i in 0..NUM_DOCS_PER_BLOCK {
for i in 0..COMPRESSION_BLOCK_SIZE {
let delta = bit_unpacker.get(i);
let val = offset + delta;
let val = offset + delta as u32;
self.output[i] = val;
offset = val;
}
1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8
1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8
};
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a[u8] {
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
for i in 0..NUM_DOCS_PER_BLOCK {
self.output[i] = bit_unpacker.get(i);
for i in 0..COMPRESSION_BLOCK_SIZE {
self.output[i] = bit_unpacker.get(i) as u32;
}
let consumed_size = 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8;
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
let consumed_size = 1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8;
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
#[inline]
pub fn output_array(&self,) -> &[u32] {
pub fn output_array(&self) -> &[u32] {
&self.output[..self.output_len]
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output[idx]
}
}

View File

@@ -1,28 +1,22 @@
use super::super::NUM_DOCS_PER_BLOCK;
use super::super::COMPRESSION_BLOCK_SIZE;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
mod simdcomp {
use libc::size_t;
extern {
pub fn compress_sorted(
data: *const u32,
output: *mut u8,
offset: u32) -> size_t;
extern "C" {
pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t;
pub fn uncompress_sorted(
compressed_data: *const u8,
output: *mut u32,
offset: u32) -> size_t;
pub fn compress_unsorted(
data: *const u32,
output: *mut u8) -> size_t;
offset: u32,
) -> size_t;
pub fn uncompress_unsorted(
compressed_data: *const u8,
output: *mut u32) -> size_t;
pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t;
pub fn uncompress_unsorted(compressed_data: *const u8, output: *mut u32) -> size_t;
}
}
@@ -31,7 +25,9 @@ fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize {
}
fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
unsafe { simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) }
unsafe {
simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset)
}
}
fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize {
@@ -42,31 +38,28 @@ fn uncompress_unsorted(compressed_data: &[u8], output: &mut [u32]) -> usize {
unsafe { simdcomp::uncompress_unsorted(compressed_data.as_ptr(), output.as_mut_ptr()) }
}
pub struct BlockEncoder {
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
}
impl BlockEncoder {
pub fn new() -> BlockEncoder {
BlockEncoder {
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
}
}
}
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
let compressed_size = compress_sorted(vals, &mut self.output, offset);
&self.output[..compressed_size]
}
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
let compressed_size = compress_unsorted(vals, &mut self.output);
&self.output[..compressed_size]
}
}
pub struct BlockDecoder {
@@ -74,40 +67,52 @@ pub struct BlockDecoder {
pub output_len: usize,
}
impl BlockDecoder {
pub fn new() -> BlockDecoder {
BlockDecoder::with_val(0u32)
}
pub fn with_val(val: u32) -> BlockDecoder {
BlockDecoder {
output: [val; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
}
}
pub fn uncompress_block_sorted<'a>(&mut self, compressed_data: &'a [u8], offset: u32) -> &'a[u8] {
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset);
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a[u8] {
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let consumed_size = uncompress_unsorted(compressed_data, &mut self.output);
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
self.output_len = COMPRESSION_BLOCK_SIZE;
consumed_size
}
#[inline]
pub fn output_array(&self,) -> &[u32] {
pub fn output_array(&self) -> &[u32] {
&self.output[..self.output_len]
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output[idx]
}
}
#[cfg(test)]
mod tests {
use super::BlockEncoder;
#[test]
fn test_all_docs_compression_len() {
let data: Vec<u32> = (0u32..128u32).collect();
let mut encoder = BlockEncoder::new();
let compressed = encoder.compress_block_sorted(&data, 0u32);
assert_eq!(compressed.len(), 17);
}
}

130
src/compression/stream.rs Normal file
View File

@@ -0,0 +1,130 @@
use compression::BlockDecoder;
use compression::COMPRESSION_BLOCK_SIZE;
use compression::compressed_block_size;
use directory::{ReadOnlySource, SourceRead};
/// Reads a stream of compressed ints.
///
/// The stream is a sequence of compressed blocks, each of which decodes
/// to `COMPRESSION_BLOCK_SIZE` integers.
///
/// Tantivy uses `CompressedIntStream` to read
/// the position file.
/// The `.skip(...)` makes it possible to avoid
/// decompressing blocks that are not required.
pub struct CompressedIntStream {
/// Compressed bytes that have not been consumed yet. It is advanced
/// past each block as soon as that block is decoded or skipped.
buffer: SourceRead,
/// Decoder holding the integers of the most recently decoded block.
block_decoder: BlockDecoder,
/// Index, within the decoded block, of the next integer to return.
/// `COMPRESSION_BLOCK_SIZE` means nothing is left to read in it
/// (this is also the initial state, before any block is decoded).
inner_offset: usize,
}
impl CompressedIntStream {
    /// Opens a compressed int stream.
    ///
    /// Nothing is decompressed here: `inner_offset` starts at
    /// `COMPRESSION_BLOCK_SIZE`, i.e. "current block exhausted", so the
    /// first block is only decoded once one of its integers is needed.
    pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
        CompressedIntStream {
            buffer: SourceRead::from(source),
            block_decoder: BlockDecoder::new(),
            inner_offset: COMPRESSION_BLOCK_SIZE,
        }
    }

    /// Decompresses the block at the head of `buffer`, moves `buffer`
    /// past it and rewinds `inner_offset` to the start of that block.
    fn load_next_block(&mut self) {
        let num_consumed_bytes = self.block_decoder
            .uncompress_block_unsorted(self.buffer.as_ref());
        self.buffer.advance(num_consumed_bytes);
        self.inner_offset = 0;
    }

    /// Fills a buffer with the next `output.len()` integers,
    /// and advance the stream by that many els.
    ///
    /// Blocks are decoded lazily: a block is only decompressed when at
    /// least one of its integers has to be copied. In particular, a read
    /// that ends exactly on a block boundary does not touch the following
    /// block, which may not exist when the stream ends there.
    ///
    /// Reading more integers than the stream holds is a caller error.
    // NOTE(review): past the end of the data the block decoder is handed an
    // empty slice; the non-SIMD decoder indexes its first byte and panics.
    pub fn read(&mut self, output: &mut [u32]) {
        let mut start: usize = 0;
        while start < output.len() {
            if self.inner_offset == COMPRESSION_BLOCK_SIZE {
                // The decoded block is exhausted (or none was decoded yet).
                self.load_next_block();
            }
            let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
            let remaining = output.len() - start;
            let num_copied = if remaining < available {
                remaining
            } else {
                available
            };
            let stop = self.inner_offset + num_copied;
            output[start..start + num_copied]
                .clone_from_slice(&self.block_decoder.output_array()[self.inner_offset..stop]);
            self.inner_offset = stop;
            start += num_copied;
        }
    }

    /// Skip the next `skip_len` integer.
    ///
    /// If a full block is skipped, calling
    /// `.skip(...)` will avoid decompressing it.
    ///
    /// When the skip lands exactly on a block boundary, the following
    /// block is not decompressed either: it is left to the next `read`
    /// or `skip` that actually needs it.
    pub fn skip(&mut self, mut skip_len: usize) {
        let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
        if available >= skip_len {
            // The target lies within the block that is already decoded.
            self.inner_offset += skip_len;
            return;
        }
        skip_len -= available;
        // entirely skip decompressing some blocks.
        while skip_len >= COMPRESSION_BLOCK_SIZE {
            skip_len -= COMPRESSION_BLOCK_SIZE;
            // The first byte of a block is its bit width, from which the
            // size of the compressed block is derived.
            let num_bits: u8 = self.buffer.as_ref()[0];
            let block_len = compressed_block_size(num_bits);
            self.buffer.advance(block_len);
        }
        if skip_len == 0 {
            // Landed on a block boundary: mark the decoded block as
            // exhausted instead of decoding a block we may never read
            // (and that may not exist at the end of the stream).
            self.inner_offset = COMPRESSION_BLOCK_SIZE;
        } else {
            self.load_next_block();
            self.inner_offset = skip_len;
        }
    }
}
#[cfg(test)]
pub mod tests {

    use super::CompressedIntStream;
    use compression::compressed_block_size;
    use compression::COMPRESSION_BLOCK_SIZE;
    use compression::BlockEncoder;
    use directory::ReadOnlySource;

    /// Builds a source holding the integers `0..1_025`, compressed block
    /// by block with the unsorted block encoder.
    fn create_stream_buffer() -> ReadOnlySource {
        let mut encoder = BlockEncoder::new();
        let mut bytes: Vec<u8> = Vec::new();
        let ints: Vec<u32> = (0u32..1_025u32).collect();
        for block_vals in ints.chunks(COMPRESSION_BLOCK_SIZE) {
            let block = encoder.compress_block_unsorted(block_vals);
            // The leading byte of a block is its bit width, which must be
            // enough to recover the length of the block.
            assert_eq!(compressed_block_size(block[0]), block.len());
            bytes.extend_from_slice(block);
        }
        // NOTE(review): `cfg!(simd)` tests a bare `simd` cfg flag, not a
        // cargo feature — confirm this padding is ever enabled.
        if cfg!(simd) {
            bytes.extend_from_slice(&[0u8; 7]);
        }
        ReadOnlySource::from(bytes)
    }

    #[test]
    fn test_compressed_int_stream() {
        let mut stream = CompressedIntStream::wrap(create_stream_buffer());
        let mut block = [0u32; COMPRESSION_BLOCK_SIZE];

        stream.read(&mut block[0..2]);
        assert_eq!(&block[0..2], &[0u32, 1u32][..]);

        stream.skip(5);
        stream.read(&mut block[0..3]);
        assert_eq!(&block[0..3], &[7u32, 8u32, 9u32][..]);

        // Jumps over whole blocks and then reads across a block boundary.
        stream.skip(500);
        stream.read(&mut block[0..3]);
        assert_eq!(&block[0..3], &[510u32, 511u32, 512u32][..]);

        stream.skip(511);
        stream.read(&mut block[..1]);
        assert_eq!(&block[..1], &[1024u32][..]);
    }
}

View File

@@ -1,6 +1,9 @@
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
pub(crate) fn compress_sorted<'a>(
input: &[u32],
output: &'a mut [u8],
mut offset: u32,
) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v - offset;
@@ -12,8 +15,7 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32)
output[byte_written] = next_byte | 128u8;
byte_written += 1;
break;
}
else {
} else {
output[byte_written] = next_byte;
byte_written += 1;
}
@@ -23,7 +25,7 @@ pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32)
}
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v;
@@ -34,8 +36,7 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
output[byte_written] = next_byte | 128u8;
byte_written += 1;
break;
}
else {
} else {
output[byte_written] = next_byte;
byte_written += 1;
}
@@ -45,10 +46,11 @@ pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
}
#[inline(always)]
pub fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32) -> &'a [u8] {
pub(crate) fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32,
) -> usize {
let mut read_byte = 0;
let mut result = offset;
let num_els = output.len();
@@ -65,13 +67,11 @@ pub fn uncompress_sorted<'a>(
}
output[i] = result;
}
&compressed_data[read_byte..]
read_byte
}
#[inline(always)]
pub fn uncompress_unsorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32]) -> &'a [u8] {
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
let mut read_byte = 0;
let num_els = output.len();
for i in 0..num_els {
@@ -88,5 +88,5 @@ pub fn uncompress_unsorted<'a>(
}
output[i] = result;
}
&compressed_data[read_byte..]
read_byte
}

View File

@@ -1,82 +1,72 @@
mod streamvbyte {
use libc::size_t;
extern {
extern "C" {
pub fn streamvbyte_delta_encode(
data: *const u32,
num_els: u32,
output: *mut u8,
offset: u32) -> size_t;
offset: u32,
) -> size_t;
pub fn streamvbyte_delta_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: u32,
offset: u32) -> size_t;
pub fn streamvbyte_encode(
data: *const u32,
num_els: u32,
output: *mut u8) -> size_t;
offset: u32,
) -> size_t;
pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t;
pub fn streamvbyte_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: usize) -> size_t;
num_els: usize,
) -> size_t;
}
}
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
let compress_length = unsafe {
streamvbyte::streamvbyte_delta_encode(
input.as_ptr(),
input.len() as u32,
output.as_mut_ptr(),
offset)
offset,
)
};
&output[..compress_length]
}
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
let compress_length = unsafe {
streamvbyte::streamvbyte_encode(
input.as_ptr(),
input.len() as u32,
output.as_mut_ptr())
};
streamvbyte::streamvbyte_encode(input.as_ptr(), input.len() as u32, output.as_mut_ptr())
};
&output[..compress_length]
}
#[inline(always)]
pub fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32) -> &'a [u8] {
let consumed_bytes = unsafe {
pub(crate) fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32,
) -> usize {
unsafe {
streamvbyte::streamvbyte_delta_decode(
compressed_data.as_ptr(),
output.as_mut_ptr(),
output.len() as u32,
offset)
};
&compressed_data[consumed_bytes..]
offset,
)
}
}
#[inline(always)]
pub fn uncompress_unsorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32]) -> &'a [u8] {
let consumed_bytes = unsafe {
streamvbyte::streamvbyte_decode(
compressed_data.as_ptr(),
output.as_mut_ptr(),
output.len())
};
&compressed_data[consumed_bytes..]
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
unsafe {
streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len())
}
}

View File

@@ -1,5 +1,5 @@
use Result;
use Error;
use error::{ErrorKind, ResultExt};
use serde_json;
use schema::Schema;
use std::sync::Arc;
@@ -23,24 +23,24 @@ use directory::ManagedDirectory;
use core::META_FILEPATH;
use super::segment::create_segment;
use indexer::segment_updater::save_new_metas;
use tokenizer::TokenizerManager;
const NUM_SEARCHERS: usize = 12;
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string)
.map_err(|e| Error::CorruptedFile(META_FILEPATH.clone(), Box::new(e)))
serde_json::from_str(&meta_string).chain_err(|| ErrorKind::CorruptedFile(META_FILEPATH.clone()))
}
/// Tantivy's Search Index
/// Search Index
pub struct Index {
directory: ManagedDirectory,
schema: Schema,
searcher_pool: Arc<Pool<Searcher>>,
tokenizers: TokenizerManager,
}
impl Index {
/// Creates a new index using the `RAMDirectory`.
///
@@ -48,20 +48,29 @@ impl Index {
/// This should only be used for unit tests.
pub fn create_in_ram(schema: Schema) -> Index {
let ram_directory = RAMDirectory::create();
let directory = ManagedDirectory::new(ram_directory).expect("Creating a managed directory from a brand new RAM directory should never fail.");
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") // unwrap is ok here
// unwrap is ok here
let directory = ManagedDirectory::new(ram_directory).expect(
"Creating a managed directory from a brand new RAM directory \
should never fail.",
);
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
}
/// Creates a new index in a given filepath.
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
pub fn create(directory_path: &Path, schema: Schema) -> Result<Index> {
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
Index::from_directory(directory, schema)
}
/// Accessor for the tokenizer manager.
pub fn tokenizers(&self) -> &TokenizerManager {
&self.tokenizers
}
/// Creates a new index in a temp directory.
///
/// The index will use the `MMapDirectory` in a newly created directory.
@@ -77,29 +86,31 @@ impl Index {
}
/// Creates a new index given a directory and an `IndexMeta`.
fn create_from_metas(directory: ManagedDirectory, metas: IndexMeta) -> Result<Index> {
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
let schema = metas.schema.clone();
let index = Index {
directory: directory,
schema: schema,
directory,
schema,
searcher_pool: Arc::new(Pool::new()),
tokenizers: TokenizerManager::default(),
};
try!(index.load_searchers());
index.load_searchers()?;
Ok(index)
}
/// Create a new index from a directory.
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
Index::create_from_metas(directory, IndexMeta::with_schema(schema))
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas)
}
/// Opens a new directory from an index path.
pub fn open(directory_path: &Path) -> Result<Index> {
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
let metas = try!(load_metas(&directory));
Index::create_from_metas(directory, metas)
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
/// Returns the index opstamp.
@@ -125,14 +136,14 @@ impl Index {
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads(&self,
num_threads: usize,
heap_size_in_bytes: usize)
-> Result<IndexWriter> {
pub fn writer_with_num_threads(
&self,
num_threads: usize,
heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
open_index_writer(self, num_threads, heap_size_in_bytes)
}
/// Creates a multithreaded writer
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
///
@@ -153,8 +164,7 @@ impl Index {
/// Returns the list of segments that are searchable
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
Ok(self
.searchable_segment_metas()?
Ok(self.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect())
@@ -186,26 +196,27 @@ impl Index {
pub fn searchable_segment_metas(&self) -> Result<Vec<SegmentMeta>> {
Ok(load_metas(self.directory())?.segments)
}
/// Returns the list of segment ids that are searchable.
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
Ok(self.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect())
.iter()
.map(|segment_meta| segment_meta.id())
.collect())
}
/// Creates a new generation of searchers after
/// a change of the set of searchable indexes.
///
/// This needs to be called when a new segment has been
/// published or after a merge.
pub fn load_searchers(&self) -> Result<()> {
let searchable_segments = self.searchable_segments()?;
let segment_readers: Vec<SegmentReader> = try!(searchable_segments
.into_iter()
.map(SegmentReader::open)
.collect());
let segment_readers: Vec<SegmentReader> = searchable_segments
.iter()
.map(SegmentReader::open)
.collect::<Result<_>>()?;
let searchers = (0..NUM_SEARCHERS)
.map(|_| Searcher::from(segment_readers.clone()))
.collect();
@@ -228,7 +239,6 @@ impl Index {
}
}
impl fmt::Debug for Index {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Index({:?})", self.directory)
@@ -240,7 +250,8 @@ impl Clone for Index {
Index {
directory: self.directory.clone(),
schema: self.schema.clone(),
searcher_pool: self.searcher_pool.clone(),
searcher_pool: Arc::clone(&self.searcher_pool),
tokenizers: self.tokenizers.clone(),
}
}
}

View File

@@ -2,14 +2,14 @@ use schema::Schema;
use core::SegmentMeta;
/// Meta information about the `Index`.
///
///
/// This object is serialized on disk in the `meta.json` file.
/// It keeps information about
/// It keeps information about
/// * the searchable segments,
/// * the index docstamp
/// * the index `docstamp`
/// * the schema
///
#[derive(Clone,Debug,Serialize, Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct IndexMeta {
pub segments: Vec<SegmentMeta>,
pub schema: Schema,
@@ -19,8 +19,8 @@ pub struct IndexMeta {
impl IndexMeta {
pub fn with_schema(schema: Schema) -> IndexMeta {
IndexMeta {
segments: vec!(),
schema: schema,
segments: vec![],
schema,
opstamp: 0u64,
}
}

View File

@@ -0,0 +1,152 @@
use directory::{ReadOnlySource, SourceRead};
use termdict::{TermDictionary, TermDictionaryImpl};
use postings::{BlockSegmentPostings, SegmentPostings};
use postings::TermInfo;
use schema::IndexRecordOption;
use schema::Term;
use std::cmp;
use fastfield::DeleteBitSet;
use schema::Schema;
use compression::CompressedIntStream;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.
///
/// # Note
///
/// It is safe to delete the segment associated to
/// an `InvertedIndexReader`. As long as it is open,
/// the `ReadOnlySource` it is relying on should
/// stay available.
///
///
/// `InvertedIndexReader`s are created by calling
/// the `SegmentReader`'s [`.inverted_index(...)`] method.
pub struct InvertedIndexReader {
/// Term dictionary, built from the term dictionary source given to `new`.
termdict: TermDictionaryImpl,
/// Postings data. `TermInfo::postings_offset` is an offset into this source.
postings_source: ReadOnlySource,
/// Positions data. `TermInfo::positions_offset` is an offset into this source.
positions_source: ReadOnlySource,
/// Delete bitset, cloned into every `SegmentPostings` this reader creates.
delete_bitset: DeleteBitSet,
/// Schema, used by `read_postings` to look up the entry of a term's field.
schema: Schema,
}
impl InvertedIndexReader {
/// Assembles an `InvertedIndexReader` from its underlying sources.
///
/// The term dictionary is opened right away from `termdict_source`;
/// the postings and positions sources, the delete bitset and the schema
/// are stored as given.
pub(crate) fn new(
    termdict_source: ReadOnlySource,
    postings_source: ReadOnlySource,
    positions_source: ReadOnlySource,
    delete_bitset: DeleteBitSet,
    schema: Schema,
) -> InvertedIndexReader {
    let termdict = TermDictionaryImpl::from_source(termdict_source);
    InvertedIndexReader {
        termdict,
        postings_source,
        positions_source,
        delete_bitset,
        schema,
    }
}
/// Returns the term info associated with the term.
///
/// The lookup is delegated to the term dictionary, using the
/// term's byte representation (`term.as_slice()`) as the key.
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
self.termdict.get(term.as_slice())
}
/// Returns a reference to the term dictionary data structure
/// held by this reader.
pub fn terms(&self) -> &TermDictionaryImpl {
&self.termdict
}
/// Points an existing `BlockSegmentPostings` at the posting list
/// described by `term_info`.
///
/// This makes it possible to walk through many terms and consume
/// their posting lists one after the other while reusing a single
/// `BlockSegmentPostings` instead of allocating a new one per term.
///
/// # Warning
///
/// This does not reset the positions list.
pub fn reset_block_postings_from_terminfo(
    &self,
    term_info: &TermInfo,
    block_postings: &mut BlockSegmentPostings,
) {
    // The posting list starts at `postings_offset`; the slice handed to
    // the reader runs up to the end of the postings source.
    let start = term_info.postings_offset as usize;
    let stop = self.postings_source.len();
    let reader = SourceRead::from(self.postings_source.slice(start, stop));
    block_postings.reset(term_info.doc_freq as usize, reader);
}
/// Builds the `BlockSegmentPostings` of the term described by `term_info`.
///
/// This is a low-level entry point meant for advanced usage;
/// most users should prefer `read_postings`.
pub fn read_block_postings_from_terminfo(
    &self,
    term_info: &TermInfo,
    option: IndexRecordOption,
) -> BlockSegmentPostings {
    let postings_start = term_info.postings_offset as usize;
    // Everything from the start of the posting list to the end of the source.
    let data = self.postings_source.slice_from(postings_start);
    let with_freq = option.has_freq();
    let num_docs = term_info.doc_freq as usize;
    BlockSegmentPostings::from_data(num_docs, SourceRead::from(data), with_freq)
}
/// Creates a `SegmentPostings` for the posting list described by `term_info`.
///
/// The block postings are read with `read_block_postings_from_terminfo`.
/// When `option.has_positions()` is true, a position stream is also opened
/// at `term_info.positions_offset` and advanced by
/// `term_info.positions_inner_offset`; otherwise no position stream is attached.
///
/// This method is for an advanced usage only.
/// Most users should prefer using `read_postings` instead.
pub fn read_postings_from_terminfo(
    &self,
    term_info: &TermInfo,
    option: IndexRecordOption,
) -> SegmentPostings {
    let block_postings = self.read_block_postings_from_terminfo(term_info, option);
    let delete_bitset = self.delete_bitset.clone();
    let position_stream = if !option.has_positions() {
        None
    } else {
        let start = term_info.positions_offset as usize;
        let positions_tail = self.positions_source.slice_from(start);
        let mut stream = CompressedIntStream::wrap(positions_tail);
        stream.skip(term_info.positions_inner_offset as usize);
        Some(stream)
    };
    SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream)
}
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with indexing options that cover the
/// requested `option`, the method does not fail: the request is capped
/// (with `cmp::min`) by the field's own index record option, and the
/// returned `SegmentPostings` carries as much information as possible.
///
/// For instance, requesting positions for a field that was indexed with
/// frequencies but without positions returns a `SegmentPostings`
/// with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
    let field_entry = self.schema.get_field_entry(term.field());
    let term_info = get!(self.get_term_info(term));
    // NOTE(review): `get!` presumably returns `None` early when the field type
    // reports no index record option -- confirm against the macro definition.
    let supported_option = get!(field_entry.field_type().get_index_record_option());
    let effective_option = cmp::min(supported_option, option);
    let postings = self.read_postings_from_terminfo(&term_info, effective_option);
    Some(postings)
}
/// Returns the number of documents containing the term.
///
/// This is the `doc_freq` recorded in the term's `TermInfo`,
/// or `0` when the term has no `TermInfo`.
pub fn doc_freq(&self, term: &Term) -> u32 {
    self.get_term_info(term)
        .map(|term_info| term_info.doc_freq)
        .unwrap_or(0)
}
}

View File

@@ -7,8 +7,9 @@ mod segment;
mod index_meta;
mod pool;
mod segment_meta;
mod term_iterator;
mod inverted_index_reader;
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
@@ -18,8 +19,6 @@ pub use self::segment::SerializableSegment;
pub use self::index::Index;
pub use self::segment_meta::SegmentMeta;
pub use self::index_meta::IndexMeta;
pub use self::term_iterator::TermIterator;
use std::path::PathBuf;
@@ -27,7 +26,7 @@ lazy_static! {
/// The meta file contains all the information about the list of segments and the schema
/// of the index.
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");
/// The managed file contains a list of files that were created by the tantivy
/// and will therefore be garbage collected when they are deemed useless by tantivy.
///
@@ -40,4 +39,4 @@ lazy_static! {
///
/// If the process is killed and this file remains, it is safe to remove it manually.
pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
}
}

View File

@@ -17,10 +17,10 @@ pub struct Pool<T> {
}
impl<T> Pool<T> {
pub fn new() -> Pool<T> {
let queue = Arc::new(MsQueue::new());
Pool {
queue: Arc::new(MsQueue::new()),
queue,
freshest_generation: AtomicUsize::default(),
next_generation: AtomicUsize::default(),
}
@@ -30,52 +30,52 @@ impl<T> Pool<T> {
let next_generation = self.next_generation.fetch_add(1, Ordering::SeqCst) + 1;
for item in items {
let gen_item = GenerationItem {
item: item,
item,
generation: next_generation,
};
self.queue.push(gen_item);
}
self.advertise_generation(next_generation);
}
/// At the exit of this method,
/// At the exit of this method,
/// - freshest_generation has a value greater or equal than generation
/// - freshest_generation has a value that has been advertised
/// - freshest_generation has
/// - freshest_generation has)
fn advertise_generation(&self, generation: usize) {
// not optimal at all but the easiest to read proof.
// not optimal at all but the easiest to read proof.
loop {
let former_generation = self.freshest_generation.load(Ordering::Acquire);
if former_generation >= generation {
break;
}
self.freshest_generation.compare_and_swap(former_generation, generation, Ordering::SeqCst);
}
self.freshest_generation.compare_and_swap(
former_generation,
generation,
Ordering::SeqCst,
);
}
}
fn generation(&self,) -> usize {
fn generation(&self) -> usize {
self.freshest_generation.load(Ordering::Acquire)
}
pub fn acquire(&self,) -> LeasedItem<T> {
pub fn acquire(&self) -> LeasedItem<T> {
let generation = self.generation();
loop {
let gen_item = self.queue.pop();
if gen_item.generation >= generation {
return LeasedItem {
gen_item: Some(gen_item),
recycle_queue: self.queue.clone(),
}
}
else {
recycle_queue: Arc::clone(&self.queue),
};
} else {
// this searcher is obsolete,
// removing it from the pool.
}
}
}
}
pub struct LeasedItem<T> {
@@ -84,29 +84,33 @@ pub struct LeasedItem<T> {
}
impl<T> Deref for LeasedItem<T> {
type Target = T;
fn deref(&self) -> &T {
&self.gen_item.as_ref().expect("Unwrapping a leased item should never fail").item // unwrap is safe here
&self.gen_item
.as_ref()
.expect("Unwrapping a leased item should never fail")
.item // unwrap is safe here
}
}
impl<T> DerefMut for LeasedItem<T> {
fn deref_mut(&mut self) -> &mut T {
&mut self.gen_item.as_mut().expect("Unwrapping a mut leased item should never fail").item // unwrap is safe here
&mut self.gen_item
.as_mut()
.expect("Unwrapping a mut leased item should never fail")
.item // unwrap is safe here
}
}
impl<T> Drop for LeasedItem<T> {
fn drop(&mut self) {
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None).expect("Unwrapping a leased item should never fail");
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
.expect("Unwrapping a leased item should never fail");
self.recycle_queue.push(gen_item);
}
}
#[cfg(test)]
mod tests {
@@ -127,4 +131,4 @@ mod tests {
assert_eq!(*pool.acquire(), 11);
}
}
}
}

View File

@@ -6,84 +6,97 @@ use common::TimerTree;
use query::Query;
use DocId;
use DocAddress;
use schema::Term;
use core::TermIterator;
use schema::{Field, Term};
use termdict::{TermDictionary, TermMerger};
use std::sync::Arc;
use std::fmt;
use core::InvertedIndexReader;
/// Holds a list of `SegmentReader`s ready for search.
///
/// It guarantees that the `Segment` will not be removed before
/// It guarantees that the `Segment` will not be removed before
/// the destruction of the `Searcher`.
///
///
pub struct Searcher {
segment_readers: Vec<SegmentReader>,
}
impl Searcher {
/// Fetches a document from tantivy's store given a `DocAddress`.
///
/// The searcher uses the segment ordinal to route the
/// the request to the right `Segment`.
/// the request to the right `Segment`.
pub fn doc(&self, doc_address: &DocAddress) -> Result<Document> {
let DocAddress(segment_local_id, doc_id) = *doc_address;
let segment_reader = &self.segment_readers[segment_local_id as usize];
segment_reader.doc(doc_id)
}
/// Returns the overall number of documents in the index.
pub fn num_docs(&self,) -> DocId {
pub fn num_docs(&self) -> DocId {
self.segment_readers
.iter()
.map(|segment_reader| segment_reader.num_docs())
.fold(0u32, |acc, val| acc + val)
}
/// Return the overall number of documents containing
/// the given term.
/// the given term.
pub fn doc_freq(&self, term: &Term) -> u32 {
self.segment_readers
.iter()
.map(|segment_reader| segment_reader.doc_freq(term))
.map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term))
.fold(0u32, |acc, val| acc + val)
}
/// Returns a Stream over all of the sorted unique terms of
/// the searcher.
///
/// This includes all of the fields from all of the segment_readers.
/// See [TermIterator](struct.TermIterator.html).
///
/// # Warning
/// This API is very likely to change in the future.
pub fn terms<'a>(&'a self) -> TermIterator<'a> {
TermIterator::from(self.segment_readers())
}
/// Return the list of segment readers
pub fn segment_readers(&self,) -> &[SegmentReader] {
pub fn segment_readers(&self) -> &[SegmentReader] {
&self.segment_readers
}
/// Returns the segment_reader associated with the given segment_ordinal
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
&self.segment_readers[segment_ord as usize]
}
/// Runs a query on the segment readers wrapped by the searcher
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<TimerTree> {
query.search(self, collector)
}
/// Returns the `FieldSearcher` associated to a `Field`.
///
/// It is built from the inverted index reader that each segment reader
/// of this searcher returns for `field`.
pub fn field(&self, field: Field) -> FieldSearcher {
    let mut inv_index_readers = Vec::with_capacity(self.segment_readers.len());
    for segment_reader in &self.segment_readers {
        inv_index_readers.push(segment_reader.inverted_index(field));
    }
    FieldSearcher::new(inv_index_readers)
}
}
/// Gives access to the inverted index of one field through
/// a list of `InvertedIndexReader`s.
///
/// `Searcher::field` builds it with one reader per segment reader.
pub struct FieldSearcher {
// Inverted index readers whose term dictionaries are merged by `terms()`.
inv_index_readers: Vec<Arc<InvertedIndexReader>>,
}
impl FieldSearcher {
    /// Wraps the given inverted index readers into a `FieldSearcher`.
    fn new(inv_index_readers: Vec<Arc<InvertedIndexReader>>) -> FieldSearcher {
        FieldSearcher {
            inv_index_readers: inv_index_readers,
        }
    }

    /// Returns a stream over all of the sorted unique terms
    /// of the field.
    ///
    /// One term stream is opened per inverted index reader, and the
    /// streams are handed to a `TermMerger`.
    pub fn terms(&self) -> TermMerger {
        let mut term_streamers = Vec::with_capacity(self.inv_index_readers.len());
        for inverted_index in &self.inv_index_readers {
            term_streamers.push(inverted_index.terms().stream());
        }
        TermMerger::new(term_streamers)
    }
}
impl From<Vec<SegmentReader>> for Searcher {
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
Searcher {
segment_readers: segment_readers,
}
Searcher { segment_readers }
}
}
@@ -95,4 +108,4 @@ impl fmt::Debug for Searcher {
.collect::<Vec<_>>();
write!(f, "Searcher({:?})", segment_ids)
}
}
}

View File

@@ -3,7 +3,7 @@ use std::path::PathBuf;
use schema::Schema;
use std::fmt;
use core::SegmentId;
use directory::{ReadOnlySource, WritePtr, FileProtection};
use directory::{FileProtection, ReadOnlySource, WritePtr};
use indexer::segment_serializer::SegmentSerializer;
use super::SegmentComponent;
use core::Index;
@@ -26,19 +26,20 @@ impl fmt::Debug for Segment {
}
/// Creates a new segment given an `Index` and a `SegmentId`
///
/// The function is here to make it private outside `tantivy`.
///
/// The function is here to make it private outside `tantivy`.
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
Segment {
index: index,
meta: meta,
}
Segment { index, meta }
}
impl Segment {
/// Returns the index the segment belongs to.
pub fn index(&self) -> &Index {
&self.index
}
/// Returns our index's schema.
pub fn schema(&self,) -> Schema {
pub fn schema(&self) -> Schema {
self.index.schema()
}
@@ -53,19 +54,18 @@ impl Segment {
}
/// Returns the segment's id.
pub fn id(&self,) -> SegmentId {
pub fn id(&self) -> SegmentId {
self.meta.id()
}
/// Returns the relative path of a component of our segment.
///
/// It just joins the segment id with the extension
///
/// It just joins the segment id with the extension
/// associated to a segment component.
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
self.meta.relative_path(component)
}
/// Protects a specific component file from being deleted.
///
/// Returns a FileProtection object. The file is guaranteed
@@ -77,16 +77,22 @@ impl Segment {
}
/// Open one of the component file for a *regular* read.
pub fn open_read(&self, component: SegmentComponent) -> result::Result<ReadOnlySource, OpenReadError> {
pub fn open_read(
&self,
component: SegmentComponent,
) -> result::Result<ReadOnlySource, OpenReadError> {
let path = self.relative_path(component);
let source = try!(self.index.directory().open_read(&path));
let source = self.index.directory().open_read(&path)?;
Ok(source)
}
/// Open one of the component file for *regular* write.
pub fn open_write(&mut self, component: SegmentComponent) -> result::Result<WritePtr, OpenWriteError> {
pub fn open_write(
&mut self,
component: SegmentComponent,
) -> result::Result<WritePtr, OpenWriteError> {
let path = self.relative_path(component);
let write = try!(self.index.directory_mut().open_write(&path));
let write = self.index.directory_mut().open_write(&path)?;
Ok(write)
}
}
@@ -114,20 +120,20 @@ mod tests {
let mut index = Index::create_in_ram(SchemaBuilder::new().build());
let segment = index.new_segment();
let path = segment.relative_path(SegmentComponent::POSTINGS);
let directory = index.directory_mut();
directory.atomic_write(&*path, &vec!(0u8)).unwrap();
directory.atomic_write(&*path, &vec![0u8]).unwrap();
let living_files = HashSet::new();
{
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
assert!(directory.exists(&*path));
directory.garbage_collect(living_files.clone());
directory.garbage_collect(|| living_files.clone());
assert!(directory.exists(&*path));
}
directory.garbage_collect(living_files);
directory.garbage_collect(|| living_files);
assert!(!directory.exists(&*path));
}
}
}

View File

@@ -1,27 +1,41 @@
/// Enum describing each component of a tantivy segment.
/// Each component is stored in its own file,
/// using the pattern `segment_uuid`.`component_extension`,
/// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension`
#[derive(Copy, Clone)]
pub enum SegmentComponent {
/// Postings (or inverted list). Sorted lists of document ids, associated to terms
POSTINGS,
/// Positions of terms in each document.
POSITIONS,
/// Column-oriented random-access storage of fields.
FASTFIELDS,
/// Stores the sum of the length (in terms) of each field for each document.
/// Field norms are stored as a special u64 fast field.
FIELDNORMS,
/// Dictionary associating `Term`s to `TermInfo`s which is
/// simply an address into the `postings` file and the `positions` file.
TERMS,
/// Row-oriented, LZ4-compressed storage of the documents.
/// Accessing a document from the store is relatively slow, as it
/// requires to decompress the entire block it belongs to.
STORE,
DELETE
/// Bitset describing which document of the segment is deleted.
DELETE,
}
impl SegmentComponent {
pub fn iterator() -> impl Iterator<Item=&'static SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
/// Iterates through the components.
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
SegmentComponent::DELETE
SegmentComponent::DELETE,
];
SEGMENT_COMPONENTS.into_iter()
}
}
}

View File

@@ -1,29 +1,27 @@
use uuid::Uuid;
use std::fmt;
use std::cmp::{Ordering, Ord};
use std::cmp::{Ord, Ordering};
#[cfg(test)]
use std::sync::atomic;
/// Tantivy SegmentId.
/// Uuid identifying a segment.
///
/// Tantivy's segment are identified
/// Tantivy's segment are identified
/// by a UUID which is used to prefix the filenames
/// of all of the file associated with the segment.
///
/// In unit test, for reproducability, the SegmentId are
/// In unit test, for reproducability, the `SegmentId` are
/// simply generated in an autoincrement fashion.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentId(Uuid);
#[cfg(test)]
lazy_static! {
static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default();
static ref EMPTY_ARR: [u8; 8] = [0u8; 8];
}
// During tests, we generate the segment id in a autoincrement manner
// for consistency of segment id between run.
//
@@ -46,20 +44,19 @@ impl SegmentId {
SegmentId(create_uuid())
}
/// Returns a shorter identifier of the segment.
///
/// We are using UUID4, so only 6 bits are fixed,
/// and the rest is random.
///
/// Picking the first 8 chars is ok to identify
/// Picking the first 8 chars is ok to identify
/// segments in a display message.
pub fn short_uuid_string(&self,) -> String {
pub fn short_uuid_string(&self) -> String {
(&self.0.simple().to_string()[..8]).to_string()
}
/// Returns a segment uuid string.
pub fn uuid_string(&self,) -> String {
pub fn uuid_string(&self) -> String {
self.0.simple().to_string()
}
}
@@ -70,7 +67,6 @@ impl fmt::Debug for SegmentId {
}
}
impl PartialOrd for SegmentId {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))

View File

@@ -9,7 +9,7 @@ struct DeleteMeta {
opstamp: u64,
}
/// SegmentMeta contains simple meta information about a segment.
/// `SegmentMeta` contains simple meta information about a segment.
///
/// For instance the number of docs it contains,
/// how many are deleted, etc.
@@ -17,16 +17,15 @@ struct DeleteMeta {
pub struct SegmentMeta {
segment_id: SegmentId,
max_doc: u32,
deletes: Option<DeleteMeta>,
deletes: Option<DeleteMeta>,
}
impl SegmentMeta {
/// Creates a new segment meta for
/// Creates a new segment meta for
/// a segment with no deletes and no documents.
pub fn new(segment_id: SegmentId) -> SegmentMeta {
SegmentMeta {
segment_id: segment_id,
segment_id,
max_doc: 0,
deletes: None,
}
@@ -53,16 +52,13 @@ impl SegmentMeta {
/// and are not used by any segment anymore.
pub fn list_files(&self) -> HashSet<PathBuf> {
SegmentComponent::iterator()
.map(|component| {
self.relative_path(*component)
})
.map(|component| self.relative_path(*component))
.collect::<HashSet<PathBuf>>()
}
/// Returns the relative path of a component of our segment.
///
/// It just joins the segment id with the extension
///
/// It just joins the segment id with the extension
/// associated to a segment component.
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let mut path = self.id().uuid_string();
@@ -73,7 +69,7 @@ impl SegmentMeta {
SegmentComponent::STORE => ".store".to_string(),
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => {format!(".{}.del", self.delete_opstamp().unwrap_or(0))},
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
});
PathBuf::from(path)
}
@@ -95,9 +91,7 @@ impl SegmentMeta {
/// Returns the opstamp of the last delete operation
/// taken in account in this segment.
pub fn delete_opstamp(&self) -> Option<u64> {
self.deletes
.as_ref()
.map(|delete_meta| delete_meta.opstamp)
self.deletes.as_ref().map(|delete_meta| delete_meta.opstamp)
}
/// Returns true iff the segment meta contains
@@ -114,8 +108,8 @@ impl SegmentMeta {
#[doc(hidden)]
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
self.deletes = Some(DeleteMeta {
num_deleted_docs: num_deleted_docs,
opstamp: opstamp,
num_deleted_docs,
opstamp,
});
}
}

View File

@@ -2,30 +2,23 @@ use Result;
use core::Segment;
use core::SegmentId;
use core::SegmentComponent;
use schema::Term;
use std::sync::RwLock;
use common::HasLen;
use core::SegmentMeta;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::DeleteBitSet;
use store::StoreReader;
use schema::Document;
use directory::ReadOnlySource;
use schema::Document;
use DocId;
use std::str;
use postings::TermInfo;
use datastruct::FstMap;
use std::sync::Arc;
use std::collections::HashMap;
use common::CompositeFile;
use std::fmt;
use core::InvertedIndexReader;
use schema::Field;
use postings::SegmentPostingsOption;
use postings::SegmentPostings;
use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
use fastfield::{FastFieldReader, U64FastFieldReader};
use schema::Schema;
use schema::FieldType;
use postings::FreqHandler;
use schema::TextIndexingOptions;
/// Entry point to access all of the datastructures of the `Segment`
///
@@ -40,15 +33,19 @@ use schema::TextIndexingOptions;
///
#[derive(Clone)]
pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
segment_id: SegmentId,
segment_meta: SegmentMeta,
term_infos: Arc<FstMap<TermInfo>>,
postings_data: ReadOnlySource,
termdict_composite: CompositeFile,
postings_composite: CompositeFile,
positions_composite: CompositeFile,
fast_fields_composite: CompositeFile,
fieldnorms_composite: CompositeFile,
store_reader: StoreReader,
fast_fields_reader: Arc<FastFieldsReader>,
fieldnorms_reader: Arc<FastFieldsReader>,
delete_bitset: DeleteBitSet,
positions_data: ReadOnlySource,
schema: Schema,
}
@@ -60,7 +57,7 @@ impl SegmentReader {
pub fn max_doc(&self) -> DocId {
self.segment_meta.max_doc()
}
/// Returns the number of documents.
/// Deleted documents are not counted.
///
@@ -69,18 +66,13 @@ impl SegmentReader {
pub fn num_docs(&self) -> DocId {
self.segment_meta.num_docs()
}
/// Return the number of documents that have been
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.delete_bitset.len() as DocId
}
#[doc(hidden)]
pub fn fast_fields_reader(&self) -> &FastFieldsReader {
&*self.fast_fields_reader
}
/// Accessor to a segment's fast field reader given a field.
///
/// Returns the u64 fast value reader if the field
@@ -91,91 +83,132 @@ impl SegmentReader {
///
/// # Panics
/// May panic if the index is corrupted.
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(&self, field: Field) -> fastfield::Result<TFastFieldReader> {
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(
&self,
field: Field,
) -> fastfield::Result<TFastFieldReader> {
let field_entry = self.schema.get_field_entry(field);
if !TFastFieldReader::is_enabled(field_entry.field_type()) {
Err(FastFieldNotAvailableError::new(field_entry))
}
else {
Ok(
self.fast_fields_reader
.open_reader(field)
.expect("Fast field file corrupted.")
)
} else {
self.fast_fields_composite
.open_read(field)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(TFastFieldReader::open)
}
}
/// Accessor to the segment's `Field norms`'s reader.
///
/// Field norms are the length (in tokens) of the fields.
/// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
/// It is used in the computation of the [TfIdf]
/// (https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
///
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
self.fieldnorms_reader.open_reader(field)
self.fieldnorms_composite
.open_read(field)
.map(U64FastFieldReader::open)
}
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> u32 {
match self.get_term_info(term) {
Some(term_info) => term_info.doc_freq,
None => 0,
}
}
/// Accessor to the segment's `StoreReader`.
pub fn get_store_reader(&self) -> &StoreReader {
&self.store_reader
}
/// Open a new segment for reading.
pub fn open(segment: Segment) -> Result<SegmentReader> {
pub fn open(segment: &Segment) -> Result<SegmentReader> {
let termdict_source = segment.open_read(SegmentComponent::TERMS)?;
let termdict_composite = CompositeFile::open(&termdict_source)?;
let source = try!(segment.open_read(SegmentComponent::TERMS));
let term_infos = try!(FstMap::from_source(source));
let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE)));
let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS));
let fast_field_data = try!(segment.open_read(SegmentComponent::FASTFIELDS));
let fast_fields_reader = try!(FastFieldsReader::open(fast_field_data));
let fieldnorms_data = try!(segment.open_read(SegmentComponent::FIELDNORMS));
let fieldnorms_reader = try!(FastFieldsReader::open(fieldnorms_data));
let positions_data = segment
.open_read(SegmentComponent::POSITIONS)
.unwrap_or_else(|_| ReadOnlySource::empty());
let delete_bitset =
if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
DeleteBitSet::open(delete_data)
let store_source = segment.open_read(SegmentComponent::STORE)?;
let store_reader = StoreReader::from_source(store_source);
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_composite = CompositeFile::open(&postings_source)?;
let positions_composite = {
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
CompositeFile::open(&source)?
} else {
CompositeFile::empty()
}
else {
DeleteBitSet::empty()
};
};
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
let delete_bitset = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
DeleteBitSet::open(delete_data)
} else {
DeleteBitSet::empty()
};
let schema = segment.schema();
Ok(SegmentReader {
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
segment_meta: segment.meta().clone(),
postings_data: postings_shared_mmap,
term_infos: Arc::new(term_infos),
termdict_composite,
postings_composite,
fast_fields_composite,
fieldnorms_composite,
segment_id: segment.id(),
store_reader: store_reader,
fast_fields_reader: Arc::new(fast_fields_reader),
fieldnorms_reader: Arc::new(fieldnorms_reader),
delete_bitset: delete_bitset,
positions_data: positions_data,
schema: schema,
store_reader,
delete_bitset,
positions_composite,
schema,
})
}
/// Return the term dictionary datastructure.
pub fn term_infos(&self) -> &FstMap<TermInfo> {
&self.term_infos
/// Returns a field reader associated to the field given in argument.
///
/// The field reader is in charge of iterating through the
/// term dictionary associated to a specific field,
/// and opening the posting list associated to any term.
///
/// Readers are cached per field: a reader already present in the cache
/// is returned directly, otherwise a new one is built, stored in the
/// cache and returned.
///
/// # Panics
///
/// Panics if the cache lock is poisoned, or if the term dictionary,
/// the postings or the positions of `field` cannot be opened from
/// their composite files.
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
    if let Some(inv_idx_reader) = self.inv_idx_reader_cache
        .read()
        .expect("Lock poisoned. This should never happen")
        .get(&field)
    {
        // Cache hit: hand out the cached reader. Without this `return`
        // the clone is discarded and the reader is rebuilt on every call.
        return Arc::clone(inv_idx_reader);
    }

    let termdict_source: ReadOnlySource = self.termdict_composite
        .open_read(field)
        .expect("Index corrupted. Failed to open field term dictionary in composite file.");

    let postings_source = self.postings_composite
        .open_read(field)
        .expect("Index corrupted. Failed to open field postings in composite file.");

    let positions_source = self.positions_composite
        .open_read(field)
        .expect("Index corrupted. Failed to open field positions in composite file.");

    let inv_idx_reader = Arc::new(InvertedIndexReader::new(
        termdict_source,
        postings_source,
        positions_source,
        self.delete_bitset.clone(),
        self.schema.clone(),
    ));

    // by releasing the lock in between, we may end up opening the inverted index
    // twice, but this is fine.
    self.inv_idx_reader_cache
        .write()
        .expect("Field reader cache lock poisoned. This should never happen.")
        .insert(field, Arc::clone(&inv_idx_reader));

    inv_idx_reader
}
/// Returns the document (or to be accurate, its stored field)
/// bearing the given doc id.
/// This method is slow and should seldom be called from
@@ -184,85 +217,6 @@ impl SegmentReader {
self.store_reader.get(doc_id)
}
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encounterred and indexed.
///
/// If the field was not indexed with the indexing options that cover
/// the requested options, the returned `SegmentPostings` the method does not fail
/// and returns a `SegmentPostings` with as much information as possible.
///
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
/// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(&term));
let offset = term_info.postings_offset as usize;
let postings_data = &self.postings_data[offset..];
let freq_handler = match *field_entry.field_type() {
FieldType::Str(ref options) => {
let indexing_options = options.get_indexing_options();
match option {
SegmentPostingsOption::NoFreq => {
FreqHandler::new_without_freq()
}
SegmentPostingsOption::Freq => {
if indexing_options.is_termfreq_enabled() {
FreqHandler::new_with_freq()
}
else {
FreqHandler::new_without_freq()
}
}
SegmentPostingsOption::FreqAndPositions => {
if indexing_options == TextIndexingOptions::TokenizedWithFreqAndPosition {
let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..];
FreqHandler::new_with_freq_and_position(offseted_position_data)
}
else if indexing_options.is_termfreq_enabled()
{
FreqHandler::new_with_freq()
}
else {
FreqHandler::new_without_freq()
}
}
}
}
_ => {
FreqHandler::new_without_freq()
}
};
Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, &self.delete_bitset, freq_handler))
}
/// Returns the posting list associated with a term.
///
/// If the term is not found, return None.
/// Even when non-null, because of deletes, the posting object
/// returned by this method may contain no documents.
pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
let field_entry = self.schema.get_field_entry(term.field());
let segment_posting_option = match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
match text_options.get_indexing_options() {
TextIndexingOptions::TokenizedWithFreq => SegmentPostingsOption::Freq,
TextIndexingOptions::TokenizedWithFreqAndPosition => SegmentPostingsOption::FreqAndPositions,
_ => SegmentPostingsOption::NoFreq,
}
}
FieldType::U64(_) | FieldType::I64(_) => SegmentPostingsOption::NoFreq
};
self.read_postings(term, segment_posting_option)
}
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
self.term_infos.get(term.as_slice())
}
/// Returns the segment id
pub fn segment_id(&self) -> SegmentId {
self.segment_id
@@ -274,7 +228,6 @@ impl SegmentReader {
&self.delete_bitset
}
/// Returns true iff the `doc` is marked
/// as deleted.
pub fn is_deleted(&self, doc: DocId) -> bool {
@@ -282,7 +235,6 @@ impl SegmentReader {
}
}
impl fmt::Debug for SegmentReader {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "SegmentReader({:?})", self.segment_id)

View File

@@ -1,183 +0,0 @@
use fst::Streamer;
use std::mem;
use std::collections::BinaryHeap;
use fst::map::Keys;
use schema::Field;
use schema::Term;
use core::SegmentReader;
use std::cmp::Ordering;
/// Entry of the merge heap: a term together with the ordinal of the
/// segment stream it was read from.
#[derive(PartialEq, Eq, Debug)]
struct HeapItem {
    term: Term,
    segment_ord: usize,
}

impl PartialOrd for HeapItem {
    fn partial_cmp(&self, other: &HeapItem) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl Ord for HeapItem {
    /// Compares by `(term, segment_ord)` in *reverse* order, so that the
    /// max-heap `BinaryHeap` pops the smallest term (and, for equal terms,
    /// the smallest segment ordinal) first.
    fn cmp(&self, other: &HeapItem) -> Ordering {
        match other.term.cmp(&self.term) {
            Ordering::Equal => other.segment_ord.cmp(&self.segment_ord),
            unequal => unequal,
        }
    }
}
/// Given a list of sorted term streams,
/// returns an iterator over sorted unique terms.
///
/// The item yielded is actually a pair with
/// - the term
/// - a slice with the ordinals of the segments containing
///   the term.
pub struct TermIterator<'a> {
// One key stream per segment, indexed by segment ordinal.
key_streams: Vec<Keys<'a>>,
// Pending head term of the streams. `HeapItem`'s ordering is reversed,
// so the smallest term is popped first.
heap: BinaryHeap<HeapItem>,
// Term the iterator is currently positioned on.
current_term: Term,
// Buffer hosting the list of segment ordinals containing
// the current term.
current_segment_ords: Vec<usize>,
}
impl<'a> TermIterator<'a> {
/// Creates a term iterator over the given key streams.
///
/// Every segment ordinal starts out listed in `current_segment_ords`, so
/// that the first call to `advance()` reads from every stream. The
/// current term starts as an empty text term on `Field(0)`.
fn new(key_streams: Vec<Keys<'a>>) -> TermIterator<'a> {
    let all_segment_ords: Vec<usize> = (0..key_streams.len()).collect();
    TermIterator {
        key_streams,
        heap: BinaryHeap::new(),
        current_term: Term::from_field_text(Field(0), ""),
        current_segment_ords: all_segment_ords,
    }
}
/// Advance the term iterator to the next term.
/// Returns true if there is indeed another term
/// False if there is none.
pub fn advance(&mut self) -> bool {
self.advance_segments();
if let Some(mut head) = self.heap.pop() {
mem::swap(&mut self.current_term, &mut head.term);
self.current_segment_ords.push(head.segment_ord);
loop {
match self.heap.peek() {
Some(&ref next_heap_it) if next_heap_it.term == self.current_term => {}
_ => { break; }
}
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
self.current_segment_ords.push(next_heap_it.segment_ord);
}
true
}
else {
false
}
}
/// Returns the current term.
///
/// This method may be called
/// iff advance() has been called before
/// and "true" was returned.
pub fn term(&self) -> &Term {
&self.current_term
}
/// Returns the sorted list of segment ordinals
/// that include the current term.
///
/// This method may be called
/// iff advance() has been called before
/// and "true" was returned.
pub fn segment_ords(&self) -> &[usize]{
&self.current_segment_ords[..]
}
fn advance_segments(&mut self) {
for segment_ord in self.current_segment_ords.drain(..) {
if let Some(term) = self.key_streams[segment_ord].next() {
self.heap.push(HeapItem {
term: Term::from_bytes(term),
segment_ord: segment_ord,
});
}
}
}
}
impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
    type Item = &'a Term;

    /// Advances to the next unique term and returns a reference to it,
    /// or `None` once all of the streams are exhausted.
    fn next(&'a mut self) -> Option<Self::Item> {
        if !self.advance() {
            return None;
        }
        Some(&self.current_term)
    }
}
impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> {
fn from(segment_readers: &'a [SegmentReader]) -> TermIterator<'a> {
TermIterator::new(
segment_readers
.iter()
.map(|reader| reader.term_infos().keys())
.collect()
)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use schema::{SchemaBuilder, Document, TEXT};
    use core::Index;

    /// Indexes three documents, committing after each one, and checks that
    /// the merged term stream yields every distinct term once, in order.
    #[test]
    fn test_term_iterator() {
        let mut schema_builder = SchemaBuilder::default();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        {
            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
            // One commit per document (presumably so that each document
            // lands in a segment of its own).
            for text in &["a b d f", "a b c d f", "e f"] {
                let mut doc = Document::default();
                doc.add_text(text_field, text);
                index_writer.add_document(doc);
                index_writer.commit().unwrap();
            }
        }
        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let mut term_it = searcher.terms();
        let mut concatenated_terms = String::new();
        while let Some(term) = term_it.next() {
            concatenated_terms.push_str(term.text());
        }
        assert_eq!(concatenated_terms, "abcdef");
    }
}

View File

@@ -1,152 +0,0 @@
#![allow(should_implement_trait)]
use std::io;
use std::io::Write;
use fst;
use fst::raw::Fst;
use directory::ReadOnlySource;
use common::BinarySerializable;
use std::marker::PhantomData;
/// Wraps an `fst::Error` into an `io::Error` of kind `Other`, so that
/// failures of the `fst` crate can be propagated through `io::Result`.
fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
/// Builder for an `FstMap`: a map from byte keys to values of type `V`.
///
/// Keys are inserted into an FST written to `W`; each key is associated
/// with the offset of its serialized value within `data`.
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
// FST mapping each key to the offset of its value in `data`.
fst_builder: fst::MapBuilder<W>,
// Serialized values, appended one after the other.
data: Vec<u8>,
// Ties the builder to the value type `V` without storing any `V`.
_phantom_: PhantomData<V>,
}
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error));
Ok(FstMapBuilder {
fst_builder: fst_builder,
data: Vec::new(),
_phantom_: PhantomData,
})
}
/// Horribly unsafe, nobody should ever do that... except me :)
///
/// If used, it must be used by systematically alternating calls
/// to insert_key and insert_value.
///
/// TODO see if I can bend Rust typesystem to enforce that
/// in a nice way.
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
try!(self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
Ok(())
}
/// Horribly unsafe, nobody should ever do that... except me :)
pub fn insert_value(&mut self, value: &V) -> io::Result<()> {
try!(value.serialize(&mut self.data));
Ok(())
}
#[cfg(test)]
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> {
try!(self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
try!(value.serialize(&mut self.data));
Ok(())
}
pub fn finish(self,) -> io::Result<W> {
let mut file = try!(
self.fst_builder
.into_inner()
.map_err(convert_fst_error));
let footer_size = self.data.len() as u32;
try!(file.write_all(&self.data));
try!((footer_size as u32).serialize(&mut file));
try!(file.flush());
Ok(file)
}
}
/// Read-only map from byte keys to values of type `V`, as written
/// by `FstMapBuilder`.
pub struct FstMap<V: BinarySerializable> {
// FST mapping each key to the offset of its value in `values_mmap`.
fst_index: fst::Map,
// Section of the source holding the serialized values.
values_mmap: ReadOnlySource,
// Ties the map to the value type `V` without storing any `V`.
_phantom_: PhantomData<V>,
}
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
Ok(fst::Map::from(match source {
ReadOnlySource::Anonymous(data) => try!(Fst::from_shared_bytes(data.data, data.start, data.len).map_err(convert_fst_error)),
ReadOnlySource::Mmap(mmap_readonly) => try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)),
}))
}
impl<V: BinarySerializable> FstMap<V> {
pub fn keys(&self,) -> fst::map::Keys {
self.fst_index.keys()
}
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
let total_len = source.len();
let length_offset = total_len - 4;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
let footer_size = try!(u32::deserialize(&mut split_len_buffer)) as usize;
let split_len = length_offset - footer_size;
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
let fst_index = try!(open_fst_index(fst_source));
Ok(FstMap {
fst_index: fst_index,
values_mmap: values_source,
_phantom_: PhantomData,
})
}
fn read_value(&self, offset: u64) -> V {
let buffer = self.values_mmap.as_slice();
let mut cursor = &buffer[(offset as usize)..];
V::deserialize(&mut cursor).expect("Data in FST is corrupted")
}
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<V> {
self.fst_index
.get(key)
.map(|offset| self.read_value(offset))
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use directory::{RAMDirectory, Directory};
    use std::path::PathBuf;
    use fst::Streamer;

    /// Round-trips a small map through a `RAMDirectory` and checks both
    /// the point lookups and the sorted key stream.
    #[test]
    fn test_fstmap() {
        let entries: [(&str, u32); 2] = [("abc", 34u32), ("abcd", 346u32)];
        let mut directory = RAMDirectory::create();
        let path = PathBuf::from("fstmap");
        {
            let write = directory.open_write(&path).unwrap();
            let mut fstmap_builder = FstMapBuilder::new(write).unwrap();
            for &(key, value) in entries.iter() {
                fstmap_builder.insert(key.as_bytes(), &value).unwrap();
            }
            fstmap_builder.finish().unwrap();
        }
        let source = directory.open_read(&path).unwrap();
        let fstmap: FstMap<u32> = FstMap::from_source(source).unwrap();
        for &(key, value) in entries.iter() {
            assert_eq!(fstmap.get(key), Some(value));
        }
        let mut keys = fstmap.keys();
        for &(key, _) in entries.iter() {
            assert_eq!(keys.next().unwrap(), key.as_bytes());
        }
        assert_eq!(keys.next(), None);
    }
}

View File

@@ -1,7 +1,4 @@
mod fstmap;
mod skip;
pub mod stacker;
pub use self::fstmap::FstMapBuilder;
pub use self::fstmap::FstMap;
pub use self::skip::{SkipListBuilder, SkipList};
pub use self::skip::{SkipList, SkipListBuilder};

View File

@@ -6,8 +6,6 @@ mod skiplist;
pub use self::skiplist_builder::SkipListBuilder;
pub use self::skiplist::SkipList;
#[cfg(test)]
mod tests {
@@ -114,9 +112,9 @@ mod tests {
let mut skip_list: SkipList<()> = SkipList::from(output.as_slice());
assert_eq!(skip_list.next().unwrap(), (0, ()));
skip_list.seek(431);
assert_eq!(skip_list.next().unwrap(), (431,()) );
assert_eq!(skip_list.next().unwrap(), (431, ()));
skip_list.seek(1003);
assert_eq!(skip_list.next().unwrap(), (1004,()) );
assert_eq!(skip_list.next().unwrap(), (1004, ()));
assert_eq!(skip_list.next(), None);
}

View File

@@ -13,14 +13,12 @@ struct Layer<'a, T> {
}
impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
type Item = (DocId, T);
fn next(&mut self,)-> Option<(DocId, T)> {
fn next(&mut self) -> Option<(DocId, T)> {
if self.next_id == u32::max_value() {
None
}
else {
} else {
let cur_val = T::deserialize(&mut self.cursor).unwrap();
let cur_id = self.next_id;
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
@@ -31,19 +29,18 @@ impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> {
fn from(data: &'a [u8]) -> Layer<'a, T> {
let mut cursor = data;
let mut cursor = data;
let next_id = u32::deserialize(&mut cursor).unwrap_or(u32::max_value());
Layer {
data: data,
cursor: cursor,
next_id: next_id,
data,
cursor,
next_id,
_phantom_: PhantomData,
}
}
}
impl<'a, T: BinarySerializable> Layer<'a, T> {
fn empty() -> Layer<'a, T> {
Layer {
data: &EMPTY,
@@ -53,11 +50,11 @@ impl<'a, T: BinarySerializable> Layer<'a, T> {
}
}
fn seek_offset(&mut self, offset: usize) {
fn seek_offset(&mut self, offset: usize) {
self.cursor = &self.data[offset..];
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
}
// Returns the last element (key, val)
// such that (key < doc_id)
//
@@ -67,31 +64,32 @@ impl<'a, T: BinarySerializable> Layer<'a, T> {
let mut val = None;
while self.next_id < doc_id {
match self.next() {
None => { break; },
v => { val = v; }
None => {
break;
}
v => {
val = v;
}
}
}
val
}
}
pub struct SkipList<'a, T: BinarySerializable> {
data_layer: Layer<'a, T>,
skip_layers: Vec<Layer<'a, u32>>,
}
impl<'a, T: BinarySerializable> Iterator for SkipList<'a, T> {
type Item = (DocId, T);
fn next(&mut self,)-> Option<(DocId, T)> {
fn next(&mut self) -> Option<(DocId, T)> {
self.data_layer.next()
}
}
impl<'a, T: BinarySerializable> SkipList<'a, T> {
pub fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
let mut next_layer_skip: Option<(DocId, u32)> = None;
for skip_layer in &mut self.skip_layers {
@@ -99,39 +97,32 @@ impl<'a, T: BinarySerializable> SkipList<'a, T> {
skip_layer.seek_offset(offset as usize);
}
next_layer_skip = skip_layer.seek(doc_id);
}
if let Some((_, offset)) = next_layer_skip {
self.data_layer.seek_offset(offset as usize);
}
self.data_layer.seek(doc_id)
}
if let Some((_, offset)) = next_layer_skip {
self.data_layer.seek_offset(offset as usize);
}
self.data_layer.seek(doc_id)
}
}
impl<'a, T: BinarySerializable> From<&'a [u8]> for SkipList<'a, T> {
fn from(mut data: &'a [u8]) -> SkipList<'a, T> {
let offsets: Vec<u32> = Vec::deserialize(&mut data).unwrap();
let num_layers = offsets.len();
let layers_data: &[u8] = data;
let data_layer: Layer<'a, T> =
if num_layers == 0 { Layer::empty() }
else {
let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize];
Layer::from(first_layer_data)
};
let data_layer: Layer<'a, T> = if num_layers == 0 {
Layer::empty()
} else {
let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize];
Layer::from(first_layer_data)
};
let skip_layers = (0..max(1, num_layers) - 1)
.map(|i| (offsets[i] as usize, offsets[i + 1] as usize))
.map(|(start, stop)| {
Layer::from(&layers_data[start..stop])
})
.map(|(start, stop)| Layer::from(&layers_data[start..stop]))
.collect();
SkipList {
skip_layers: skip_layers,
data_layer: data_layer,
skip_layers,
data_layer,
}
}
}

View File

@@ -13,19 +13,18 @@ struct LayerBuilder<T: BinarySerializable> {
}
impl<T: BinarySerializable> LayerBuilder<T> {
fn written_size(&self,) -> usize {
fn written_size(&self) -> usize {
self.buffer.len()
}
fn write(&self, output: &mut Write) -> Result<(), io::Error> {
try!(output.write_all(&self.buffer));
output.write_all(&self.buffer)?;
Ok(())
}
fn with_period(period: usize) -> LayerBuilder<T> {
LayerBuilder {
period: period,
period,
buffer: Vec::new(),
remaining: period,
len: 0,
@@ -37,29 +36,27 @@ impl<T: BinarySerializable> LayerBuilder<T> {
self.remaining -= 1;
self.len += 1;
let offset = self.written_size() as u32;
try!(doc_id.serialize(&mut self.buffer));
try!(value.serialize(&mut self.buffer));
doc_id.serialize(&mut self.buffer)?;
value.serialize(&mut self.buffer)?;
Ok(if self.remaining == 0 {
self.remaining = self.period;
Some((doc_id, offset))
}
else { None })
self.remaining = self.period;
Some((doc_id, offset))
} else {
None
})
}
}
pub struct SkipListBuilder<T: BinarySerializable> {
period: usize,
data_layer: LayerBuilder<T>,
skip_layers: Vec<LayerBuilder<u32>>,
}
impl<T: BinarySerializable> SkipListBuilder<T> {
pub fn new(period: usize) -> SkipListBuilder<T> {
SkipListBuilder {
period: period,
period,
data_layer: LayerBuilder::with_period(period),
skip_layers: Vec::new(),
}
@@ -75,20 +72,20 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
pub fn insert(&mut self, doc_id: DocId, dest: &T) -> io::Result<()> {
let mut layer_id = 0;
let mut skip_pointer = try!(self.data_layer.insert(doc_id, dest));
let mut skip_pointer = self.data_layer.insert(doc_id, dest)?;
loop {
skip_pointer = match skip_pointer {
Some((skip_doc_id, skip_offset)) =>
try!(self
.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset)),
None => { return Ok(()); }
Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset)?,
None => {
return Ok(());
}
};
layer_id += 1;
}
}
pub fn write<W: Write>(self, output: &mut Write) -> io::Result<()> {
pub fn write<W: Write>(self, output: &mut W) -> io::Result<()> {
let mut size: u32 = 0;
let mut layer_sizes: Vec<u32> = Vec::new();
size += self.data_layer.buffer.len() as u32;
@@ -97,10 +94,10 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
size += layer.buffer.len() as u32;
layer_sizes.push(size);
}
try!(layer_sizes.serialize(output));
try!(self.data_layer.write(output));
layer_sizes.serialize(output)?;
self.data_layer.write(output)?;
for layer in self.skip_layers.iter().rev() {
try!(layer.write(output));
layer.write(output)?;
}
Ok(())
}

View File

@@ -1,7 +1,6 @@
use std::mem;
use super::heap::{Heap, HeapAllocable};
#[inline]
pub fn is_power_of_2(val: u32) -> bool {
val & (val - 1) == 0
@@ -9,11 +8,10 @@ pub fn is_power_of_2(val: u32) -> bool {
#[inline]
pub fn jump_needed(val: u32) -> bool {
val > 3 && is_power_of_2(val)
val > 3 && is_power_of_2(val)
}
#[derive(Debug)]
#[derive(Debug, Clone)]
pub struct ExpUnrolledLinkedList {
len: u32,
end: u32,
@@ -24,10 +22,9 @@ pub struct ExpUnrolledLinkedList {
}
impl ExpUnrolledLinkedList {
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
ExpUnrolledLinkedListIterator {
heap: heap,
heap,
addr: addr + 2u32 * (mem::size_of::<u32>() as u32),
len: self.len,
consumed: 0,
@@ -42,16 +39,21 @@ impl ExpUnrolledLinkedList {
// the next block has a size of (length so far),
// and we need to add 1u32 to store the pointer
// to the next element.
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
let new_block_addr: u32 = heap.allocate_space(new_block_size);
heap.set(self.end, &new_block_addr);
self.end = new_block_addr;
self.end = new_block_addr;
}
heap.set(self.end, &val);
self.end += mem::size_of::<u32>() as u32;
}
}
impl HeapAllocable for u32 {
fn with_addr(_addr: u32) -> u32 {
0u32
}
}
impl HeapAllocable for ExpUnrolledLinkedList {
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
@@ -77,33 +79,26 @@ pub struct ExpUnrolledLinkedListIterator<'a> {
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
type Item = u32;
fn next(&mut self,) -> Option<u32> {
fn next(&mut self) -> Option<u32> {
if self.consumed == self.len {
None
}
else {
} else {
let addr: u32;
self.consumed += 1;
if jump_needed(self.consumed) {
addr = *self.heap.get_mut_ref(self.addr);
}
else {
} else {
addr = self.addr;
}
self.addr = addr + mem::size_of::<u32>() as u32;
Some(*self.heap.get_mut_ref(addr))
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use super::super::heap::Heap;
use test::Bencher;
@@ -147,7 +142,7 @@ mod tests {
#[bench]
fn bench_push_stack(bench: &mut Bencher) {
let heap = Heap::with_capacity(64_000_000);
let heap = Heap::with_capacity(64_000_000);
bench.iter(|| {
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
@@ -163,4 +158,4 @@ mod tests {
heap.clear();
});
}
}
}

View File

@@ -1,22 +1,78 @@
use std::iter;
use super::heap::{Heap, HeapAllocable, BytesRef};
use std::mem;
use super::heap::{BytesRef, Heap, HeapAllocable};
/// djb2 hash function
fn djb2(key: &[u8]) -> u64 {
let mut state: u64 = 5381;
for &b in key {
state = (state << 5).wrapping_add(state).wrapping_add(b as u64);
mod murmurhash2 {
const SEED: u32 = 3_242_157_231u32;
/// Computes the 32-bit MurmurHash2 of `key`, seeded with `SEED`.
///
/// The key is consumed as little-endian 4-byte blocks followed by a tail
/// of 0 to 3 bytes. Only the bytes of `key` are read: the blocks are
/// assembled byte by byte, so the function places no alignment
/// requirement on the slice (the previous version dereferenced a
/// `*const u32` obtained from a byte pointer, which is undefined
/// behaviour on unaligned input).
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
    const M: u32 = 0x5bd1_e995;
    const R: u32 = 24;

    let len = key.len() as u32;
    let mut h: u32 = SEED ^ len;

    // Split the key into its full 4-byte blocks and the remaining tail.
    let (blocks, tail) = key.split_at(key.len() & !3);
    for block in blocks.chunks(4) {
        let mut k: u32 = u32::from(block[0]) | (u32::from(block[1]) << 8)
            | (u32::from(block[2]) << 16) | (u32::from(block[3]) << 24);
        k = k.wrapping_mul(M);
        k ^= k >> R;
        k = k.wrapping_mul(M);
        // The running hash (not `k` a second time) must be multiplied
        // before the block is mixed in, as in the reference algorithm.
        h = h.wrapping_mul(M);
        h ^= k;
    }

    // Handle the last few bytes of the input array
    if !tail.is_empty() {
        for (i, &byte) in tail.iter().enumerate() {
            h ^= u32::from(byte) << (8 * i as u32);
        }
        h = h.wrapping_mul(M);
    }

    // Final avalanche.
    h ^= h >> 13;
    h = h.wrapping_mul(M);
    h ^ (h >> 15)
}
state
}
impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef {
start: 0u32,
stop: 0u32,
}
}
/// Split the thread memory budget into
/// - the heap size
/// - the hash table "table" itself.
///
/// The table receives the largest power-of-two number of buckets whose
/// total size stays strictly below one fifth of the budget; the heap
/// receives whatever is left.
///
/// Returns (the heap size in bytes, the hash table size in number of bits)
///
/// # Panics
///
/// Panics if the budget is too small to host even a two-bucket table
/// within a fifth of the budget.
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
    let table_size_limit: usize = per_thread_memory_budget / 5;
    let compute_table_size =
        |num_bits: usize| (1usize << num_bits) * mem::size_of::<KeyValue>();
    let table_num_bits: usize = (1usize..)
        .take_while(|&num_bits| compute_table_size(num_bits) < table_size_limit)
        .last()
        // The message is only formatted when we actually panic.
        .unwrap_or_else(|| {
            panic!(
                "Per thread memory is too small: {}",
                per_thread_memory_budget
            )
        });
    let table_size = compute_table_size(table_num_bits);
    let heap_size = per_thread_memory_budget - table_size;
    (heap_size, table_num_bits)
}
/// `KeyValue` is the item stored in the hash table.
@@ -25,27 +81,21 @@ impl Default for BytesRef {
///
/// The key and the value are actually stored contiguously.
/// For this reason, the (start, stop) information is actually redundant
/// and can be simplified in the future
/// and can be simplified in the future
#[derive(Copy, Clone, Default)]
struct KeyValue {
key: BytesRef,
value_addr: u32,
key_value_addr: BytesRef,
hash: u32,
}
impl KeyValue {
fn is_empty(&self,) -> bool {
self.key.stop == 0u32
fn is_empty(&self) -> bool {
self.key_value_addr.is_null()
}
}
pub enum Entry {
Vacant(usize),
Occupied(u32),
}
/// Customized `HashMap` with string keys
///
///
/// This `HashMap` takes String as keys. Keys are
/// stored in a user defined heap.
///
@@ -67,112 +117,91 @@ struct QuadraticProbing {
}
impl QuadraticProbing {
fn compute(key: &[u8], mask: usize) -> QuadraticProbing {
let hash = djb2(key) as usize;
QuadraticProbing {
hash: hash,
i: 0,
mask: mask,
}
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
QuadraticProbing { hash, i: 0, mask }
}
#[inline]
fn next(&mut self) -> usize {
fn next_probe(&mut self) -> usize {
self.i += 1;
(self.hash + self.i * self.i) & self.mask
}
}
impl<'a> HashMap<'a> {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a> {
let table_size = 1 << num_bucket_power_of_2;
let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
.take(table_size)
.collect();
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
HashMap {
table: table.into_boxed_slice(),
heap: heap,
heap,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
}
}
fn probe(&self, key: &[u8]) -> QuadraticProbing {
QuadraticProbing::compute(key, self.mask)
fn probe(&self, hash: u32) -> QuadraticProbing {
QuadraticProbing::compute(hash as usize, self.mask)
}
pub fn is_saturated(&self) -> bool {
self.table.len() < self.occupied.len() * 5
self.table.len() < self.occupied.len() * 3
}
fn get_key(&self, bytes_ref: BytesRef) -> &[u8] {
self.heap.get_slice(bytes_ref)
#[inline(never)]
fn get_key_value(&self, bytes_ref: BytesRef) -> (&[u8], u32) {
let key_bytes: &[u8] = self.heap.get_slice(bytes_ref);
let expull_addr: u32 = bytes_ref.addr() + 2 + key_bytes.len() as u32;
(key_bytes, expull_addr)
}
pub fn set_bucket(&mut self, key_bytes: &[u8], bucket: usize, addr: u32) -> u32 {
pub fn set_bucket(&mut self, hash: u32, key_bytes_ref: BytesRef, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key: self.heap.allocate_and_set(key_bytes),
value_addr: addr,
key_value_addr: key_bytes_ref,
hash,
};
addr
}
pub fn iter<'b: 'a>(&'b self,) -> impl Iterator<Item=(&'a [u8], u32)> + 'b {
let heap: &'a Heap = self.heap;
let table: &'b [KeyValue] = &self.table;
self.occupied
.iter()
.cloned()
.map(move |bucket: usize| {
let kv = table[bucket];
let addr = kv.value_addr;
(heap.get_slice(kv.key), addr)
})
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32)> + 'b {
self.occupied.iter().cloned().map(move |bucket: usize| {
let kv = self.table[bucket];
self.get_key_value(kv.key_value_addr)
})
}
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> &mut V {
let entry = self.lookup(key.as_ref());
match entry {
Entry::Occupied(addr) => {
self.heap.get_mut_ref(addr)
}
Entry::Vacant(bucket) => {
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
self.set_bucket(key.as_ref(), bucket, addr);
val
}
}
}
pub fn lookup<S: AsRef<[u8]>>(&self, key: S) -> Entry {
let key_bytes: &[u8] = key.as_ref();
let mut probe = self.probe(key_bytes);
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next();
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
return Entry::Vacant(bucket);
}
if self.get_key(kv.key) == key_bytes {
return Entry::Occupied(kv.value_addr);
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
self.set_bucket(hash, key_bytes_ref, bucket);
return val;
} else if kv.hash == hash {
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
if stored_key == key_bytes {
return self.heap.get_mut_ref(expull_addr);
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use super::super::heap::{Heap, HeapAllocable};
use super::djb2;
use super::murmurhash2::murmurhash2;
use test::Bencher;
use std::collections::hash_map::DefaultHasher;
use std::hash::Hasher;
use std::collections::HashSet;
use super::split_memory;
struct TestValue {
val: u32,
@@ -188,6 +217,13 @@ mod tests {
}
}
#[test]
fn test_hashmap_size() {
assert_eq!(split_memory(100_000), (67232, 9));
assert_eq!(split_memory(1_000_000), (737856, 12));
assert_eq!(split_memory(10_000_000), (7902848, 15));
}
#[test]
fn test_hash_map() {
let heap = Heap::with_capacity(2_000_000);
@@ -224,23 +260,41 @@ mod tests {
assert!(iter_values.next().is_none());
}
#[bench]
fn bench_djb2(bench: &mut Bencher) {
let v = String::from("abwer");
bench.iter(|| {
djb2(v.as_bytes())
});
// Hashes sub-slices of two strings that only differ in their last byte.
// The slices `[i..5]` stop right before that byte, so they hold identical
// content: equal hashes show that the result does not depend on bytes
// located past the end of the slice.
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
// Sanity check on the hash quality: the 10,000 keys "hash0".."hash9999"
// must all receive distinct 32-bit hashes.
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
// A smaller set would mean that at least two keys collided.
assert_eq!(set.len(), 10_000);
}
#[bench]
fn bench_siphasher(bench: &mut Bencher) {
let v = String::from("abwer");
bench.iter(|| {
let mut h = DefaultHasher::new();
h.write(v.as_bytes());
h.finish()
fn bench_murmurhash_2(b: &mut Bencher) {
let keys: Vec<&'static str> =
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
}
}

View File

@@ -1,12 +1,29 @@
use std::cell::UnsafeCell;
use std::mem;
use std::ptr;
use byteorder::{ByteOrder, NativeEndian};
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
///
/// The slice will encode the length of the `&[u8]` slice
/// on 16-bits, and then the data is encoded.
#[derive(Copy, Clone)]
pub struct BytesRef {
pub start: u32,
pub stop: u32,
pub struct BytesRef(u32);
impl BytesRef {
pub fn is_null(&self) -> bool {
self.0 == u32::max_value()
}
pub fn addr(&self) -> u32 {
self.0
}
}
impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef(u32::max_value())
}
}
/// Object that can be allocated in tantivy's custom `Heap`.
@@ -19,20 +36,19 @@ pub struct Heap {
inner: UnsafeCell<InnerHeap>,
}
#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))]
impl Heap {
/// Creates a new heap with a given capacity
pub fn with_capacity(num_bytes: usize) -> Heap {
Heap {
inner: UnsafeCell::new(
InnerHeap::with_capacity(num_bytes)
),
inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)),
}
}
fn inner(&self,) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
fn inner(&self) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
}
/// Clears the heap. All the underlying data is lost.
///
/// This heap does not support deallocation.
@@ -40,14 +56,9 @@ impl Heap {
pub fn clear(&self) {
self.inner().clear();
}
/// Return the heap capacity.
pub fn capacity(&self,) -> u32 {
self.inner().capacity()
}
/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self,) -> u32 {
pub fn num_free_bytes(&self) -> u32 {
self.inner().num_free_bytes()
}
@@ -56,128 +67,167 @@ impl Heap {
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
self.inner().allocate_space(num_bytes)
}
/// Allocate an object in the heap
pub fn allocate_object<V: HeapAllocable>(&self,) -> (u32, &mut V) {
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
let addr = self.inner().allocate_space(mem::size_of::<V>());
let v: V = V::with_addr(addr);
self.inner().set(addr, &v);
(addr, self.inner().get_mut_ref(addr))
}
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
self.inner().allocate_and_set(data)
}
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
/// given as argument.
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
self.inner().get_slice(bytes_ref.start, bytes_ref.stop)
self.inner().get_slice(bytes_ref)
}
/// Stores an item's data in the heap, at the given `address`.
pub fn set<Item>(&self, addr: u32, val: &Item) {
self.inner().set(addr, val);
}
/// Returns a reference to an `Item` at a given `addr`.
#[cfg(test)]
pub fn get_ref<Item>(&self, addr: u32) -> &Item {
/// Returns a mutable reference to the `Item` stored at the given `addr`.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}
/// Returns a mutable reference to an `Item` at a given `addr`.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
#[cfg(test)]
pub fn get_ref<Item>(&self, addr: u32) -> &mut Item {
self.get_mut_ref(addr)
}
}
struct InnerHeap {
buffer: Vec<u8>,
buffer_len: u32,
used: u32,
has_been_resized: bool,
next_heap: Option<Box<InnerHeap>>,
}
impl InnerHeap {
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
let buffer: Vec<u8> = vec![0u8; num_bytes];
InnerHeap {
buffer: buffer,
buffer,
buffer_len: num_bytes as u32,
next_heap: None,
used: 0u32,
has_been_resized: false,
}
}
pub fn clear(&mut self) {
self.used = 0u32;
self.next_heap = None;
}
pub fn capacity(&self,) -> u32 {
self.buffer.len() as u32
}
// Returns the number of free bytes. If the buffer
// has reached its capacity and overflowed to another buffer, return 0.
pub fn num_free_bytes(&self,) -> u32 {
if self.has_been_resized {
pub fn num_free_bytes(&self) -> u32 {
if self.next_heap.is_some() {
0u32
} else {
self.buffer_len - self.used
}
else {
(self.buffer.len() as u32) - self.used
}
}
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
let addr = self.used;
self.used += num_bytes as u32;
let buffer_len = self.buffer.len();
if self.used > buffer_len as u32 {
self.buffer.resize(buffer_len * 2, 0u8);
self.has_been_resized = true
if self.used <= self.buffer_len {
addr
} else {
if self.next_heap.is_none() {
info!(
r#"Exceeded heap size. The segment will be committed right
after indexing this document."#,
);
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
}
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
}
addr
}
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
&self.buffer[start as usize..stop as usize]
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
let start = bytes_ref.0;
if start >= self.buffer_len {
self.next_heap
.as_ref()
.unwrap()
.get_slice(BytesRef(start - self.buffer_len))
} else {
let start = start as usize;
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
&self.buffer[start + 2..start + 2 + len]
}
}
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
&mut self.buffer[start as usize..stop as usize]
if start >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
} else {
&mut self.buffer[start as usize..stop as usize]
}
}
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
let start = self.allocate_space(data.len());
let stop = start + data.len() as u32;
self.get_mut_slice(start, stop).clone_from_slice(data);
BytesRef {
start: start as u32,
stop: stop as u32,
}
assert!(data.len() < u16::max_value() as usize);
let total_len = 2 + data.len();
let start = self.allocate_space(total_len);
let total_buff = self.get_mut_slice(start, start + total_len as u32);
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
total_buff[2..].clone_from_slice(data);
BytesRef(start)
}
fn get_mut(&mut self, addr: u32) -> *mut u8 {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut(addr - self.buffer_len)
} else {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
}
}
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
unsafe { &mut *v_ptr }
}
fn set<Item>(&mut self, addr: u32, val: &Item) {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
unsafe {
let dest_ptr: *mut u8 = self.get_mut(addr);
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.get_mut_ref(addr - self.buffer_len)
} else {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
unsafe { &mut *v_ptr }
}
}
}
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap
.as_mut()
.unwrap()
.set(addr - self.buffer_len, val);
} else {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
unsafe {
let dest_ptr: *mut u8 = self.get_mut(addr);
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
}
}
}
}

View File

@@ -1,46 +1,43 @@
mod hashmap;
pub(crate) mod hashmap;
mod heap;
mod expull;
pub use self::heap::{Heap, HeapAllocable};
pub use self::expull::ExpUnrolledLinkedList;
pub use self::hashmap::{HashMap, Entry};
pub use self::hashmap::HashMap;
#[test]
fn test_unrolled_linked_list() {
use std::collections;
let heap = Heap::with_capacity(30_000_000);
{
heap.clear();
let mut ks: Vec<usize> = (1..5).map(|k| k * 100).collect();
ks.push(2);
ks.push(3);
for k in (1..5).map(|k| k * 100) {
for k in (1..5).map(|k| k * 100) {
let mut hashmap: HashMap = HashMap::new(10, &heap);
for j in 0..k {
for i in 0..500 {
let mut list: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string());
list.push(i*j, &heap);
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string());
v.push(i * j, &heap);
}
}
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
for (key, addr) in hashmap.iter() {
map_addr.insert(Vec::from(key), addr);
}
for i in 0..500 {
match hashmap.lookup(i.to_string()) {
Entry::Occupied(addr) => {
let v: &mut ExpUnrolledLinkedList = heap.get_mut_ref(addr);
let mut it = v.iter(addr, &heap);
for j in 0..k {
assert_eq!(it.next().unwrap(), i*j);
}
assert!(!it.next().is_some());
}
_ => {
panic!("should never happen");
}
let key: String = i.to_string();
let addr: u32 = *map_addr.get(key.as_bytes()).unwrap();
let exp_pull: &ExpUnrolledLinkedList = heap.get_ref(addr);
let mut it = exp_pull.iter(addr, &heap);
for j in 0..k {
assert_eq!(it.next().unwrap(), i * j);
}
assert!(!it.next().is_some());
}
}
}
}
}

View File

@@ -1,38 +1,37 @@
use std::marker::Send;
use std::fmt;
use std::path::Path;
use directory::error::{OpenReadError, DeleteError, OpenWriteError};
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use std::result;
use std::io;
use std::marker::Sync;
/// Write-once read many (WORM) abstraction for where
/// tantivy's data should be stored.
/// tantivy's data should be stored.
///
/// There are currently two implementations of `Directory`
///
///
/// - The [`MMapDirectory`](struct.MmapDirectory.html), this
/// should be your default choice.
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
/// should be your default choice.
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
/// should be used mostly for tests.
///
///
pub trait Directory: fmt::Debug + Send + Sync + 'static {
/// Opens a virtual file for read.
///
///
/// Once a virtual file is open, its data may not
/// change.
///
/// Specifically, subsequent writes or flushes should
/// have no effect on the returned `ReadOnlySource` object.
/// have no effect on the returned `ReadOnlySource` object.
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError>;
/// Removes a file
///
/// Removing a file will not affect an eventual
/// existing ReadOnlySource pointing to it.
///
///
/// Removing a nonexistent file, yields a
/// `DeleteError::DoesNotExist`.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError>;
@@ -40,18 +39,18 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
/// Returns true iff the file exists
fn exists(&self, path: &Path) -> bool;
/// Opens a writer for the *virtual file* associated with
/// Opens a writer for the *virtual file* associated with
/// a Path.
///
/// Right after this call, the file should be created
/// and any subsequent call to `open_read` for the
/// and any subsequent call to `open_read` for the
/// same path should return a `ReadOnlySource`.
///
///
/// Write operations may be aggressively buffered.
/// The client of this trait is responsible for calling flush
/// to ensure that subsequent `read` operations
/// to ensure that subsequent `read` operations
/// will take into account preceding `write` operations.
///
///
/// Flush operation should also be persistent.
///
/// The user shall not rely on `Drop` triggering `flush`.
@@ -60,7 +59,7 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
///
/// The file may not previously exist.
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError>;
/// Reads the full content file that has been written using
/// atomic_write.
///
@@ -68,17 +67,13 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError>;
/// Atomically replace the content of a file with data.
///
///
/// This call ensures that reads can never *observe*
/// a partially written file.
///
///
/// The file may or may not previously exist.
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
/// Clones the directory and boxes the clone
/// Clones the directory and boxes the clone
fn box_clone(&self) -> Box<Directory>;
}

View File

@@ -1,52 +1,214 @@
use std::error::Error as StdError;
use std::path::PathBuf;
use std::io;
use std::fmt;
/// General IO error with an optional path to the offending file.
#[derive(Debug)]
pub struct IOError {
path: Option<PathBuf>,
err: io::Error,
}
impl fmt::Display for IOError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self.path {
Some(ref path) => write!(f, "io error occurred on path '{:?}': '{}'", path, self.err),
None => write!(f, "io error occurred: '{}'", self.err),
}
}
}
impl StdError for IOError {
fn description(&self) -> &str {
"io error occurred"
}
fn cause(&self) -> Option<&StdError> {
Some(&self.err)
}
}
impl IOError {
pub(crate) fn with_path(path: PathBuf, err: io::Error) -> Self {
IOError {
path: Some(path),
err,
}
}
}
impl From<io::Error> for IOError {
fn from(err: io::Error) -> IOError {
IOError { path: None, err }
}
}
/// Error that may occur when opening a directory.
#[derive(Debug)]
pub enum OpenDirectoryError {
    /// The underlying directory does not exist.
    DoesNotExist(PathBuf),
    /// The path exists but is not a directory.
    NotADirectory(PathBuf),
}

impl fmt::Display for OpenDirectoryError {
    /// Formats the error together with the path it refers to.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            OpenDirectoryError::NotADirectory(ref not_a_dir) => {
                write!(f, "the path '{:?}' exists but is not a directory", not_a_dir)
            }
            OpenDirectoryError::DoesNotExist(ref missing_dir) => {
                write!(f, "the underlying directory '{:?}' does not exist", missing_dir)
            }
        }
    }
}

impl StdError for OpenDirectoryError {
    /// Fixed summary shared by both variants.
    fn description(&self) -> &str {
        "error occurred while opening a directory"
    }

    /// Neither variant wraps another error, so there is no cause.
    fn cause(&self) -> Option<&StdError> {
        None
    }
}
/// Error that may occur when starting to write in a file
#[derive(Debug)]
pub enum OpenWriteError {
/// Our directory is WORM, writing an existing file is forbidden.
/// Checkout the `Directory` documentation.
/// Checkout the `Directory` documentation.
FileAlreadyExists(PathBuf),
/// Any kind of IO error that happens when
/// Any kind of IO error that happens when
/// writing in the underlying IO device.
IOError(io::Error),
IOError(IOError),
}
impl From<io::Error> for OpenWriteError {
fn from(err: io::Error) -> OpenWriteError {
impl From<IOError> for OpenWriteError {
fn from(err: IOError) -> OpenWriteError {
OpenWriteError::IOError(err)
}
}
impl fmt::Display for OpenWriteError {
    /// Formats the error; the IO variant embeds the wrapped error's message.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            OpenWriteError::IOError(ref io_error) => write!(
                f,
                "an io error occurred while opening a file for writing: '{}'",
                io_error
            ),
            OpenWriteError::FileAlreadyExists(ref existing_path) => {
                write!(f, "the file '{:?}' already exists", existing_path)
            }
        }
    }
}
impl StdError for OpenWriteError {
    /// Fixed summary shared by all variants.
    fn description(&self) -> &str {
        "error occurred while opening a file for writing"
    }

    /// Only the IO variant wraps another error.
    fn cause(&self) -> Option<&StdError> {
        let underlying: Option<&StdError> = match *self {
            OpenWriteError::IOError(ref io_error) => Some(io_error),
            OpenWriteError::FileAlreadyExists(_) => None,
        };
        underlying
    }
}
/// Error that may occur when accessing a file read
#[derive(Debug)]
pub enum OpenReadError {
/// The file does not exist.
FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
IOError(io::Error),
IOError(IOError),
}
impl From<IOError> for OpenReadError {
    /// Wraps an `IOError` in the `OpenReadError::IOError` variant.
    fn from(io_error: IOError) -> OpenReadError {
        let wrapped = OpenReadError::IOError(io_error);
        wrapped
    }
}
impl fmt::Display for OpenReadError {
    /// Formats the error; the IO variant embeds the wrapped error's message.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            OpenReadError::IOError(ref io_error) => write!(
                f,
                "an io error occurred while opening a file for reading: '{}'",
                io_error
            ),
            OpenReadError::FileDoesNotExist(ref missing_path) => {
                write!(f, "the file '{:?}' does not exist", missing_path)
            }
        }
    }
}
impl StdError for OpenReadError {
    /// Fixed summary shared by all variants.
    fn description(&self) -> &str {
        "error occurred while opening a file for reading"
    }

    /// Only the IO variant wraps another error.
    fn cause(&self) -> Option<&StdError> {
        let underlying: Option<&StdError> = match *self {
            OpenReadError::IOError(ref io_error) => Some(io_error),
            OpenReadError::FileDoesNotExist(_) => None,
        };
        underlying
    }
}
/// Error that may occur when trying to delete a file
#[derive(Debug)]
pub enum DeleteError {
/// The file does not exist.
FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
IOError(io::Error),
/// The file may not be deleted because it is
IOError(IOError),
/// The file may not be deleted because it is
/// protected.
FileProtected(PathBuf),
}
impl From<IOError> for DeleteError {
    /// Wraps an `IOError` in the `DeleteError::IOError` variant.
    fn from(io_error: IOError) -> DeleteError {
        let wrapped = DeleteError::IOError(io_error);
        wrapped
    }
}
impl fmt::Display for DeleteError {
    /// Formats the error; the IO variant embeds the wrapped error's message.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            DeleteError::IOError(ref io_error) => {
                write!(f, "an io error occurred while deleting a file: '{}'", io_error)
            }
            DeleteError::FileProtected(ref protected_path) => write!(
                f,
                "the file '{:?}' is protected and can't be deleted",
                protected_path
            ),
            DeleteError::FileDoesNotExist(ref missing_path) => {
                write!(f, "the file '{:?}' does not exist", missing_path)
            }
        }
    }
}
impl StdError for DeleteError {
    /// Fixed summary shared by all variants.
    fn description(&self) -> &str {
        "error occurred while deleting a file"
    }

    /// Only the IO variant wraps another error.
    fn cause(&self) -> Option<&StdError> {
        let underlying: Option<&StdError> = match *self {
            DeleteError::IOError(ref io_error) => Some(io_error),
            DeleteError::FileDoesNotExist(_) => None,
            DeleteError::FileProtected(_) => None,
        };
        underlying
    }
}

View File

@@ -1,23 +1,23 @@
use std::path::{Path, PathBuf};
use serde_json;
use directory::error::{OpenReadError, DeleteError, OpenWriteError};
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use std::result;
use std::io;
use Directory;
use std::sync::{Arc, RwLock};
use std::collections::HashSet;
use std::sync::RwLockWriteGuard;
use std::io::Write;
use core::MANAGED_FILEPATH;
use std::collections::HashMap;
use std::fmt;
use Result;
use Error;
use error::{ErrorKind, Result, ResultExt};
/// Wrapper of directories that keeps track of files created by Tantivy.
///
/// A managed directory is just a wrapper of a directory
/// that keeps a (persisted) list of the files that
/// that keeps a (persisted) list of the files that
/// have been created (and not deleted) by tantivy so far.
///
/// Thanks to this list, it implements a `garbage_collect` method
@@ -35,7 +35,6 @@ struct MetaInformation {
protected_files: HashMap<PathBuf, usize>,
}
/// A `FileProtection` prevents the garbage collection of a file.
///
/// See `ManagedDirectory.protect_file_from_delete`.
@@ -45,19 +44,18 @@ pub struct FileProtection {
}
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
let mut meta_informations_wlock = directory.meta_informations
let mut meta_informations_wlock = directory
.meta_informations
.write()
.expect("Managed file lock poisoned");
if let Some(counter_ref_mut) = meta_informations_wlock
.protected_files
.get_mut(path) {
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
(*counter_ref_mut) -= 1;
}
}
impl fmt::Debug for FileProtection {
fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
write!(formatter, "FileProtectionFor({:?})", self.path)
write!(formatter, "FileProtectionFor({:?})", self.path)
}
}
@@ -67,33 +65,39 @@ impl Drop for FileProtection {
}
}
impl ManagedDirectory {
/// Persists the list of files created by tantivy.
///
/// Serializes `wlock.managed_paths` to JSON, appends a trailing newline
/// and hands the bytes to `directory.atomic_write` under `MANAGED_FILEPATH`.
/// Serialization and write errors are returned as `io::Error`.
fn save_managed_paths(
    directory: &mut Directory,
    wlock: &RwLockWriteGuard<MetaInformation>,
) -> io::Result<()> {
    let mut payload = serde_json::to_vec(&wlock.managed_paths)?;
    payload.write_all(b"\n")?;
    directory.atomic_write(&MANAGED_FILEPATH, &payload[..])
}
impl ManagedDirectory {
/// Wraps a directory as managed directory.
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?;
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::new(RwLock::new(
MetaInformation {
managed_paths: managed_files,
protected_files: HashMap::default()
})),
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files: HashMap::default(),
})),
})
}
Err(OpenReadError::FileDoesNotExist(_)) => {
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::default(),
})
}
Err(OpenReadError::IOError(e)) => {
Err(From::from(e))
}
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::default(),
}),
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
}
}
@@ -101,27 +105,40 @@ impl ManagedDirectory {
///
/// Removes the files that were created by `tantivy` and are not
/// used by any segment anymore.
///
///
/// * `living_files` - List of files that are still used by the index.
///
/// This method neither panics nor returns errors.
/// If a file cannot be deleted (for permission reasons for instance)
/// an error is simply logged, and the file remains in the list of managed
/// files.
pub fn garbage_collect(&mut self, living_files: HashSet<PathBuf>) {
let mut files_to_delete = vec!();
{ // releasing the lock as .delete() will use it too.
pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
info!("Garbage collect");
let mut files_to_delete = vec![];
{
// releasing the lock as .delete() will use it too.
let meta_informations_rlock = self.meta_informations
.read()
.expect("Managed directory rlock poisoned in garbage collect.");
// It is crucial to get the living files after acquiring the
// read lock of meta informations. That way, we
// avoid the following scenario.
//
// 1) we get the list of living files.
// 2) someone creates a new file.
// 3) we start garbage collection and remove this file
// even though it is a living file.
let living_files = get_living_files();
for managed_path in &meta_informations_rlock.managed_paths {
if !living_files.contains(managed_path) {
files_to_delete.push(managed_path.clone());
}
}
}
let mut deleted_files = vec!();
let mut deleted_files = vec![];
{
for file_to_delete in files_to_delete {
match self.delete(&file_to_delete) {
@@ -130,13 +147,14 @@ impl ManagedDirectory {
deleted_files.push(file_to_delete);
}
Err(file_error) => {
error!("Failed to delete {:?}", file_to_delete);
match file_error {
DeleteError::FileDoesNotExist(_) => {
deleted_files.push(file_to_delete);
}
DeleteError::IOError(_) => {
if !cfg!(target_os = "windows") {
// On windows, delete is expected to fail if the file
// is mmapped.
error!("Failed to delete {:?}", file_to_delete);
}
}
@@ -144,38 +162,34 @@ impl ManagedDirectory {
// this is expected.
}
}
}
}
}
}
if !deleted_files.is_empty() {
// update the list of managed files by removing
// update the list of managed files by removing
// the file that were removed.
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed directory wlock poisoned (2).");
{
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed directory wlock poisoned (2).");
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
for delete_file in &deleted_files {
managed_paths_write.remove(delete_file);
}
}
if let Err(_) = self.save_managed_paths() {
if save_managed_paths(self.directory.as_mut(), &meta_informations_wlock).is_err() {
error!("Failed to save the list of managed files.");
}
}
}
/// Protects a file from being garbage collected.
///
/// The method returns a `FileProtection` object.
/// The file will not be garbage collected as long as the
/// `FileProtection` object is kept alive.
/// `FileProtection` object is kept alive.
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
let pathbuf = path.to_owned();
{
@@ -193,52 +207,33 @@ impl ManagedDirectory {
}
}
/// Saves the file containing the list of existing files
/// that were created by tantivy.
fn save_managed_paths(&mut self,) -> io::Result<()> {
let managed_paths;
{
let meta_informations_rlock = self.meta_informations
.read()
.expect("Managed file lock poisoned");
managed_paths = meta_informations_rlock.managed_paths.clone();
}
let mut w = try!(serde_json::to_vec(&managed_paths));
try!(write!(&mut w, "\n"));
self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
Ok(())
}
/// Registers a file as managed
///
/// This method must be called before the file is
///
/// This method must be called before the file is
/// actually created to ensure that a failure between
/// registering the filepath and creating the file
/// will not lead to garbage files that will
/// will not lead to garbage files that will
/// never get removed.
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
let has_changed = {
let mut meta_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned");
meta_wlock.managed_paths.insert(filepath.to_owned())
};
let mut meta_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned");
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
if has_changed {
self.save_managed_paths()?;
save_managed_paths(self.directory.as_mut(), &meta_wlock)?;
}
Ok(())
}
}
impl Directory for ManagedDirectory {
/// Opens `path` for reading by delegating to the wrapped directory.
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
self.directory.open_read(path)
}
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
self.register_file_as_managed(path)?;
self.register_file_as_managed(path)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
self.directory.open_write(path)
}
@@ -258,7 +253,7 @@ impl Directory for ManagedDirectory {
.expect("poisoned lock in managed directory meta");
if let Some(counter) = metas_rlock.protected_files.get(path) {
if *counter > 0 {
return Err(DeleteError::FileProtected(path.to_owned()))
return Err(DeleteError::FileProtected(path.to_owned()));
}
}
}
@@ -268,34 +263,30 @@ impl Directory for ManagedDirectory {
/// Returns whether `path` exists, as reported by the wrapped directory.
fn exists(&self, path: &Path) -> bool {
self.directory.exists(path)
}
/// Clones this managed directory and returns the clone boxed as a `Directory`.
fn box_clone(&self) -> Box<Directory> {
box self.clone()
}
}
impl Clone for ManagedDirectory {
fn clone(&self) -> ManagedDirectory {
ManagedDirectory {
directory: self.directory.box_clone(),
meta_informations: self.meta_informations.clone(),
meta_informations: Arc::clone(&self.meta_informations),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use directory::MmapDirectory;
use std::path::Path;
use std::path::Path;
use std::io::Write;
use tempdir::TempDir;
lazy_static! {
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
@@ -313,18 +304,18 @@ mod tests {
write_file.flush().unwrap();
}
{
managed_directory.atomic_write(*TEST_PATH2, &vec!(0u8,1u8)).unwrap();
managed_directory
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
.unwrap();
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> = [TEST_PATH1.to_owned()]
.into_iter()
.cloned()
.collect();
managed_directory.garbage_collect(living_files);
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
@@ -340,13 +331,13 @@ mod tests {
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(living_files);
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
}
}
#[test]
@@ -357,11 +348,13 @@ mod tests {
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(living_files.clone());
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail to delete the file as it is mmapped.
assert!(managed_directory.exists(*TEST_PATH1));
@@ -369,16 +362,13 @@ mod tests {
drop(_mmap_read);
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
managed_directory.garbage_collect(living_files);
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
}
else {
assert!(!managed_directory.exists(*TEST_PATH1));
}
}
#[test]
fn test_managed_directory_protect() {
let tempdir = TempDir::new("index").unwrap();
@@ -387,19 +377,19 @@ mod tests {
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
{
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
managed_directory.garbage_collect(living_files.clone());
managed_directory.garbage_collect(|| living_files.clone());
assert!(managed_directory.exists(*TEST_PATH1));
}
managed_directory.garbage_collect(living_files.clone());
managed_directory.garbage_collect(|| living_files.clone());
assert!(!managed_directory.exists(*TEST_PATH1));
}
}

View File

@@ -1,7 +1,7 @@
use atomicwrites;
use common::make_io_err;
use directory::Directory;
use directory::error::{OpenWriteError, OpenReadError, DeleteError, OpenDirectoryError};
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use directory::ReadOnlySource;
use directory::shared_vec_slice::SharedVecSlice;
use directory::WritePtr;
@@ -23,37 +23,30 @@ use std::sync::RwLock;
use std::sync::Weak;
use tempdir::TempDir;
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
let convert_file_error = |err: io::Error| {
if err.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.clone())
fn open_mmap(full_path: &Path) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
let file = File::open(&full_path).map_err(|e| {
if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_owned())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
}
else {
OpenReadError::IOError(err)
}
};
let file = File::open(&full_path).map_err(convert_file_error)?;
let meta_data = file
.metadata()
.map_err(|e| OpenReadError::IOError(e))?;
})?;
let meta_data = file.metadata()
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
// if the file size is 0, it will not be possible
// to mmap the file, so we return an anonymous mmap_cache
// instead.
return Ok(None)
return Ok(None);
}
match Mmap::open(&file, Protection::Read) {
Ok(mmap) => {
Ok(Some(Arc::new(mmap)))
}
Err(e) => {
Err(OpenReadError::IOError(e))
}
Ok(mmap) => Ok(Some(Arc::new(mmap))),
Err(e) => Err(IOError::with_path(full_path.to_owned(), e))?,
}
}
#[derive(Default,Clone,Debug,Serialize,Deserialize)]
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
pub struct CacheCounters {
// Number of times the cache avoided a call to `mmap`
pub hit: usize,
@@ -65,7 +58,7 @@ pub struct CacheCounters {
pub miss_weak: usize,
}
#[derive(Clone,Debug,Serialize,Deserialize)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CacheInfo {
pub counters: CacheCounters,
pub mmapped: Vec<PathBuf>,
@@ -89,10 +82,8 @@ impl Default for MmapCache {
}
}
impl MmapCache {
fn cleanup(&mut self) {
fn cleanup(&mut self) {
let previous_cache_size = self.cache.len();
let mut new_cache = HashMap::new();
mem::swap(&mut new_cache, &mut self.cache);
@@ -107,16 +98,14 @@ impl MmapCache {
fn get_info(&mut self) -> CacheInfo {
self.cleanup();
let paths: Vec<PathBuf> = self.cache.keys()
.cloned()
.collect();
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
CacheInfo {
counters: self.counters.clone(),
mmapped: paths,
}
}
fn get_mmap(&mut self, full_path: PathBuf) -> Result<Option<Arc<Mmap>>, OpenReadError> {
fn get_mmap(&mut self, full_path: &PathBuf) -> Result<Option<Arc<Mmap>>, OpenReadError> {
// if we exceed this limit, then we go through the weak
// and remove those that are obsolete.
if self.cache.len() > self.purge_weak_limit {
@@ -126,27 +115,24 @@ impl MmapCache {
HashMapEntry::Occupied(mut occupied_entry) => {
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
self.counters.hit += 1;
Some(mmap_arc.clone())
}
else {
Some(Arc::clone(&mmap_arc))
} else {
// The entry exists but the weak ref has been destroyed.
self.counters.miss_weak += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
if let Some(mmap_arc) = open_mmap(full_path)? {
occupied_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
}
else {
} else {
None
}
}
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss_empty += 1;
if let Some(mmap_arc) = open_mmap(&full_path)? {
if let Some(mmap_arc) = open_mmap(full_path)? {
vacant_entry.insert(Arc::downgrade(&mmap_arc));
Some(mmap_arc)
}
else {
} else {
None
}
}
@@ -156,35 +142,33 @@ impl MmapCache {
/// Directory storing data in files, read via mmap.
///
/// The Mmap object are cached to limit the
/// system calls.
/// The Mmap object are cached to limit the
/// system calls.
#[derive(Clone)]
pub struct MmapDirectory {
root_path: PathBuf,
mmap_cache: Arc<RwLock<MmapCache>>,
_temp_directory: Arc<Option<TempDir>>,
}
impl fmt::Debug for MmapDirectory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "MmapDirectory({:?})", self.root_path)
}
write!(f, "MmapDirectory({:?})", self.root_path)
}
}
impl MmapDirectory {
/// Creates a new MmapDirectory in a temporary directory.
///
/// This is mostly useful to test the MmapDirectory itself.
/// For your unit tests, prefer the RAMDirectory.
/// For your unit tests, prefer the RAMDirectory.
pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
let tempdir = try!(TempDir::new("index"));
let tempdir = TempDir::new("index")?;
let tempdir_path = PathBuf::from(tempdir.path());
let directory = MmapDirectory {
root_path: PathBuf::from(tempdir_path),
root_path: tempdir_path,
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(Some(tempdir))
_temp_directory: Arc::new(Some(tempdir)),
};
Ok(directory)
}
@@ -193,18 +177,21 @@ impl MmapDirectory {
///
/// Returns an error if the `directory_path` does not
/// exist or if it is not a directory.
pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
let directory_path: &Path = directory_path.as_ref();
if !directory_path.exists() {
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path)))
}
else if !directory_path.is_dir() {
Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path)))
}
else {
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
directory_path,
)))
} else if !directory_path.is_dir() {
Err(OpenDirectoryError::NotADirectory(PathBuf::from(
directory_path,
)))
} else {
Ok(MmapDirectory {
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None)
_temp_directory: Arc::new(None),
})
}
}
@@ -232,18 +219,19 @@ impl MmapDirectory {
use std::os::windows::fs::OpenOptionsExt;
use winapi::winbase;
open_opts.write(true)
open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}
let fd = try!(open_opts.open(&self.root_path));
try!(fd.sync_all());
let fd = open_opts.open(&self.root_path)?;
fd.sync_all()?;
Ok(())
}
/// Returns some statistical information
/// about the Mmap cache.
///
/// The `MmapDirectory` embeds a `MmapDirectory`
///
/// The `MmapDirectory` embeds a `MmapDirectory`
/// to avoid multiplying the `mmap` system calls.
pub fn get_cache_info(&mut self) -> CacheInfo {
self.mmap_cache
@@ -251,12 +239,10 @@ impl MmapDirectory {
.expect("Mmap cache lock is poisoned.")
.get_info()
}
}
/// This Write wraps a File, but has the specificity of
/// call `sync_all` on flush.
/// This Write wraps a File, but has the specificity of
/// call `sync_all` on flush.
struct SafeFileWriter(File);
impl SafeFileWriter {
@@ -266,13 +252,12 @@ impl SafeFileWriter {
}
impl Write for SafeFileWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.0.write(buf)
}
fn flush(&mut self) -> io::Result<()> {
try!(self.0.flush());
self.0.flush()?;
self.0.sync_all()
}
}
@@ -283,53 +268,53 @@ impl Seek for SafeFileWriter {
}
}
impl Directory for MmapDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache
.write()
.map_err(|_| OpenReadError::IOError(
make_io_err(format!("Failed to acquired write lock on mmap cache while reading {:?}", path))
))?;
Ok(mmap_cache.get_mmap(full_path)?
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while reading {:?}",
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
Ok(mmap_cache
.get_mmap(&full_path)?
.map(MmapReadOnly::from)
.map(ReadOnlySource::Mmap)
.unwrap_or(ReadOnlySource::Anonymous(SharedVecSlice::empty()))
)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
debug!("Open Write {:?}", path);
let full_path = self.resolve_path(path);
let open_res = OpenOptions::new()
.write(true)
.create_new(true)
.open(full_path);
let mut file = try!(
open_res.map_err(|err| {
if err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(PathBuf::from(path))
}
else {
OpenWriteError::IOError(err)
}
})
);
let mut file = open_res.map_err(|err| {
if err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(path.to_owned())
} else {
IOError::with_path(path.to_owned(), err).into()
}
})?;
// making sure the file is created.
try!(file.flush());
file.flush()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
// Apparently, on some filesystems, syncing the parent
// directory is required.
try!(self.sync_directory());
self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer)))
}
@@ -337,26 +322,26 @@ impl Directory for MmapDirectory {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = try!(self.mmap_cache
.write()
.map_err(|_|
DeleteError::IOError(make_io_err(format!("Failed to acquired write lock on mmap cache while deleting {:?}", path))))
);
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while deleting {:?}",
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
// Removing the entry in the MMap cache.
// The munmap will appear on Drop,
// when the last reference is gone.
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => {
self.sync_directory()
.map_err(|e| DeleteError::IOError(e))
}
Ok(_) => self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
}
else {
Err(DeleteError::IOError(e))
} else {
Err(IOError::with_path(path.to_owned(), e).into())
}
}
}
@@ -373,39 +358,32 @@ impl Directory for MmapDirectory {
match File::open(&full_path) {
Ok(mut file) => {
file.read_to_end(&mut buffer)
.map_err(|e| OpenReadError::IOError(e))?;
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
Ok(buffer)
}
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
}
else {
Err(OpenReadError::IOError(e))
} else {
Err(IOError::with_path(path.to_owned(), e).into())
}
}
}
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
debug!("Atomic Write {:?}", path);
let full_path = self.resolve_path(path);
let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
try!(meta_file.write(|f| {
f.write_all(data)
}));
meta_file.write(|f| f.write_all(data))?;
Ok(())
}
fn box_clone(&self,) -> Box<Directory> {
fn box_clone(&self) -> Box<Directory> {
Box::new(self.clone())
}
}
#[cfg(test)]
mod tests {
@@ -457,9 +435,8 @@ mod tests {
}
}
assert_eq!(mmap_directory.get_cache_info().counters.miss_empty, 10);
{
{
// test weak miss
// the first pass creates the weak refs.
for path in &paths {
@@ -475,7 +452,7 @@ mod tests {
}
{
let mut saved_readmmaps = vec!();
let mut saved_readmmaps = vec![];
// Keeps reference alive
for (i, path) in paths.iter().enumerate() {
let r = mmap_directory.open_read(path).unwrap();
@@ -483,7 +460,6 @@ mod tests {
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
}
let cache_info = mmap_directory.get_cache_info();
println!("{:?}", cache_info);
assert_eq!(cache_info.counters.miss_empty, 30);
assert_eq!(cache_info.counters.miss_weak, 10);
assert_eq!(cache_info.mmapped.len(), 10);
@@ -494,7 +470,6 @@ mod tests {
}
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
}
}

View File

@@ -1,3 +1,8 @@
/*!
WORM directory abstraction.
*/
mod mmap_directory;
mod ram_directory;
mod directory;
@@ -8,14 +13,15 @@ mod managed_directory;
/// Errors specific to the directory module.
pub mod error;
use std::io::{Write, Seek};
use std::io::{BufWriter, Seek, Write};
use std::io::BufWriter;
pub use self::read_only_source::ReadOnlySource;
pub use self::directory::Directory;
pub use self::ram_directory::RAMDirectory;
pub use self::mmap_directory::MmapDirectory;
pub use self::managed_directory::{ManagedDirectory, FileProtection};
pub(crate) use self::read_only_source::SourceRead;
pub(crate) use self::managed_directory::{FileProtection, ManagedDirectory};
/// Synonym of Seek + Write
pub trait SeekableWrite: Seek + Write {}
@@ -31,8 +37,8 @@ pub type WritePtr = BufWriter<Box<SeekableWrite>>;
mod tests {
use super::*;
use std::path::Path;
use std::io::{Write, Seek, SeekFrom};
use std::path::Path;
use std::io::{Seek, SeekFrom, Write};
lazy_static! {
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
@@ -65,7 +71,7 @@ mod tests {
assert!(directory.exists(*TEST_PATH));
write_file.write_all(&[4]).unwrap();
write_file.write_all(&[3]).unwrap();
write_file.write_all(&[7,3,5]).unwrap();
write_file.write_all(&[7, 3, 5]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
@@ -81,9 +87,9 @@ mod tests {
{
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[4, 3, 7,3,5]).unwrap();
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
write_file.seek(SeekFrom::Start(0)).unwrap();
write_file.write_all(&[3,1]).unwrap();
write_file.write_all(&[3, 1]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
@@ -98,7 +104,6 @@ mod tests {
{
directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
}
{
assert!(directory.open_write(*TEST_PATH).is_err());

View File

@@ -1,24 +1,24 @@
use std::collections::HashMap;
use std::fmt;
use std::io::{self, BufWriter, Cursor, Write, Seek, SeekFrom};
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::result;
use std::sync::{Arc, RwLock};
use common::make_io_err;
use directory::{Directory, ReadOnlySource};
use directory::error::{OpenWriteError, OpenReadError, DeleteError};
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::WritePtr;
use super::shared_vec_slice::SharedVecSlice;
/// Writer associated with the `RAMDirectory`
///
///
/// The Writer just writes a buffer.
///
/// # Panics
///
/// On drop, if the writer was left in a *dirty* state.
/// That is, if flush was not called after the last call
/// to write.
/// to write.
///
struct VecWriter {
path: PathBuf,
@@ -32,7 +32,7 @@ impl VecWriter {
VecWriter {
path: path_buf,
data: Cursor::new(Vec::new()),
shared_directory: shared_directory,
shared_directory,
is_flushed: true,
}
}
@@ -40,8 +40,11 @@ impl VecWriter {
impl Drop for VecWriter {
fn drop(&mut self) {
if !self.is_flushed {
panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.", self.path)
if !self.is_flushed {
panic!(
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
self.path
)
}
}
}
@@ -55,13 +58,14 @@ impl Seek for VecWriter {
impl Write for VecWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.is_flushed = false;
try!(self.data.write_all(buf));
self.data.write_all(buf)?;
Ok(buf.len())
}
fn flush(&mut self) -> io::Result<()> {
self.is_flushed = true;
try!(self.shared_directory.write(self.path.clone(), self.data.get_ref()));
self.shared_directory
.write(self.path.clone(), self.data.get_ref())?;
Ok(())
}
}
@@ -69,38 +73,40 @@ impl Write for VecWriter {
#[derive(Clone)]
struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, Arc<Vec<u8>>>>>);
impl InnerDirectory {
fn new() -> InnerDirectory {
InnerDirectory(Arc::new(RwLock::new(HashMap::new())))
}
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
let mut map = try!(
self.0
.write()
.map_err(|_| make_io_err(format!("Failed to lock the directory, when trying to write {:?}", path)))
);
let mut map = self.0.write().map_err(|_| {
make_io_err(format!(
"Failed to lock the directory, when trying to write {:?}",
path
))
})?;
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
Ok(prev_value.is_some())
}
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
self.0
.read()
.map_err(|_| {
let io_err = make_io_err(format!("Failed to acquire read lock for the directory, when trying to read {:?}", path));
OpenReadError::IOError(io_err)
let msg = format!(
"Failed to acquire read lock for the \
directory when trying to read {:?}",
path
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(|data| {
ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))
})
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(Arc::clone)
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data)))
})
}
@@ -108,18 +114,17 @@ impl InnerDirectory {
self.0
.write()
.map_err(|_| {
let io_err = make_io_err(format!("Failed to acquire write lock for the directory, when trying to delete {:?}", path));
DeleteError::IOError(io_err)
let msg = format!(
"Failed to acquire write lock for the \
directory when trying to delete {:?}",
path
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|mut writable_map| {
match writable_map.remove(path) {
Some(_) => {
Ok(())
},
None => {
Err(DeleteError::FileDoesNotExist(PathBuf::from(path)))
}
}
.and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
}
@@ -129,16 +134,14 @@ impl InnerDirectory {
.expect("Failed to get read lock directory.")
.contains_key(path)
}
}
impl fmt::Debug for RAMDirectory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "RAMDirectory")
}
write!(f, "RAMDirectory")
}
}
/// A Directory storing everything in anonymous memory.
///
/// It is mainly meant for unit testing.
@@ -150,11 +153,10 @@ pub struct RAMDirectory {
}
impl RAMDirectory {
/// Constructor
pub fn create() -> RAMDirectory {
RAMDirectory {
fs: InnerDirectory::new()
fs: InnerDirectory::new(),
}
}
}
@@ -163,15 +165,19 @@ impl Directory for RAMDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
self.fs.open_read(path)
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
let exists = self.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
// force the creation of the file to mimic the MMap directory.
if try!(self.fs.write(path_buf.clone(), &Vec::new())) {
if exists {
Err(OpenWriteError::FileAlreadyExists(path_buf))
}
else {
} else {
Ok(BufWriter::new(Box::new(vec_writer)))
}
}
@@ -180,28 +186,25 @@ impl Directory for RAMDirectory {
self.fs.delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.fs.exists(path)
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
let read = self.open_read(path)?;
Ok(read.as_slice()
.to_owned())
Ok(read.as_slice().to_owned())
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
let path_buf = PathBuf::from(path);
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
try!(self.fs.write(path_buf, &Vec::new()));
try!(vec_writer.write_all(data));
try!(vec_writer.flush());
self.fs.write(path_buf, &Vec::new())?;
vec_writer.write_all(data)?;
vec_writer.flush()?;
Ok(())
}
fn box_clone(&self,) -> Box<Directory> {
fn box_clone(&self) -> Box<Directory> {
Box::new(self.clone())
}
}

View File

@@ -2,10 +2,12 @@ use fst::raw::MmapReadOnly;
use std::ops::Deref;
use super::shared_vec_slice::SharedVecSlice;
use common::HasLen;
use std::slice;
use std::io::{self, Read};
use stable_deref_trait::StableDeref;
/// Read object that represents files in tantivy.
///
///
/// These read objects are only in charge to deliver
/// the data in the form of a constant read-only `&[u8]`.
/// Whatever happens to the directory file, the data
@@ -13,12 +15,13 @@ use common::HasLen;
pub enum ReadOnlySource {
/// Mmap source of data
Mmap(MmapReadOnly),
/// Wrapping a `Vec<u8>`
/// Wrapping a `Vec<u8>`
Anonymous(SharedVecSlice),
}
unsafe impl StableDeref for ReadOnlySource {}
impl Deref for ReadOnlySource {
type Target = [u8];
fn deref(&self) -> &[u8] {
@@ -27,35 +30,38 @@ impl Deref for ReadOnlySource {
}
impl ReadOnlySource {
/// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource {
ReadOnlySource::Anonymous(SharedVecSlice::empty())
}
/// Returns the data underlying the ReadOnlySource object.
pub fn as_slice(&self,) -> &[u8] {
pub fn as_slice(&self) -> &[u8] {
match *self {
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe {
mmap_read_only.as_slice()
},
ReadOnlySource::Anonymous(ref shared_vec) => {
shared_vec.as_slice()
},
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
}
}
/// Creates a ReadOnlySource that is just a
/// Consumes this source and cuts it in two at byte offset `addr`.
///
/// The returned pair is `(left, right)`: `left` is `self.slice(0, addr)`
/// and `right` is `self.slice_from(addr)`.
pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) {
    (self.slice(0, addr), self.slice_from(addr))
}
/// Creates a ReadOnlySource that is just a
/// view over a slice of the data.
///
///
/// Keep in mind that any living slice extends
/// the lifetime of the original ReadOnlySource,
///
///
/// For instance, if `ReadOnlySource` wraps 500MB
/// worth of data in anonymous memory, and only a
/// 1KB slice is remaining, the whole `500MBs`
/// 1KB slice is remaining, the whole `500MBs`
/// are retained in memory.
pub fn slice(&self, from_offset:usize, to_offset:usize) -> ReadOnlySource {
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
match *self {
ReadOnlySource::Mmap(ref mmap_read_only) => {
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
@@ -63,13 +69,30 @@ impl ReadOnlySource {
}
ReadOnlySource::Anonymous(ref shared_vec) => {
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
},
}
}
}
/// Returns the view starting at `from_offset` and running to the end
/// of this source.
///
/// Same as calling `.slice(from_offset, self.len())`.
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
    self.slice(from_offset, self.len())
}
/// Returns the view starting at the beginning of this source and
/// stopping at `to_offset`.
///
/// Same as calling `.slice(0, to_offset)`.
pub fn slice_to(&self, to_offset: usize) -> ReadOnlySource {
    let start_offset: usize = 0;
    self.slice(start_offset, to_offset)
}
}
impl HasLen for ReadOnlySource {
fn len(&self,) -> usize {
fn len(&self) -> usize {
self.as_slice().len()
}
}
@@ -79,3 +102,48 @@ impl Clone for ReadOnlySource {
self.slice(0, self.len())
}
}
impl From<Vec<u8>> for ReadOnlySource {
    /// Wraps an owned byte vector as an `Anonymous` source by first
    /// converting it into a `SharedVecSlice`.
    fn from(data: Vec<u8>) -> ReadOnlySource {
        ReadOnlySource::Anonymous(SharedVecSlice::from(data))
    }
}
/// Acts as an owning cursor over the data backed by a `ReadOnlySource`.
///
/// The struct stores the source together with a byte slice pointing into
/// it; the slice is what `advance`, `as_ref` and `read` operate on.
pub(crate) struct SourceRead {
// The source the cursor was built from. It is not read by any method
// visible here; presumably it is held only so the bytes behind `cursor`
// stay alive for as long as this struct does.
_data_owner: ReadOnlySource,
// The bytes that have not been consumed yet. The `'static` lifetime is
// manufactured with `slice::from_raw_parts` in `From<ReadOnlySource>`;
// it is not a genuinely static borrow.
cursor: &'static [u8],
}
impl SourceRead {
    /// Skips `len` bytes by moving the start of the cursor forward.
    ///
    /// # Panics
    ///
    /// Panics if `len` is greater than the number of bytes left in the
    /// cursor, because the cursor is re-sliced with `[len..]`.
    pub fn advance(&mut self, len: usize) {
        let unread: &'static [u8] = self.cursor;
        self.cursor = &unread[len..];
    }
}
impl AsRef<[u8]> for SourceRead {
    /// Exposes the bytes the cursor has not consumed yet.
    fn as_ref(&self) -> &[u8] {
        &self.cursor[..]
    }
}
impl From<ReadOnlySource> for SourceRead {
// Creates a new `SourceRead` from a given `ReadOnlySource`.
//
// The cursor initially spans the whole of `source`.
fn from(source: ReadOnlySource) -> SourceRead {
let len = source.len();
let slice_ptr = source.as_slice().as_ptr();
// SAFETY (NOTE(review) — confirm): the pointer/length pair comes straight
// from `source.as_slice()`, and `source` is moved into `_data_owner`
// below, so the owner is kept for as long as `cursor` exists. This relies
// on the bytes not moving when `source` itself is moved; this file
// declares `unsafe impl StableDeref for ReadOnlySource`, which is the
// property being leaned on here. The resulting `'static` lifetime is a
// fiction and must never escape the struct with that lifetime.
let static_slice = unsafe { slice::from_raw_parts(slice_ptr, len) };
SourceRead {
_data_owner: source,
cursor: static_slice,
}
}
}
impl Read for SourceRead {
    /// Fills `buf` from the cursor by delegating to the standard library's
    /// `Read` implementation for byte slices, which also moves the slice
    /// past the bytes it handed out.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        Read::read(&mut self.cursor, buf)
    }
}

View File

@@ -1,15 +1,13 @@
use std::sync::Arc;
#[derive(Clone)]
pub struct SharedVecSlice {
pub data: Arc<Vec<u8>>,
pub start: usize,
pub len: usize
pub start: usize,
pub len: usize,
}
impl SharedVecSlice {
pub fn empty() -> SharedVecSlice {
SharedVecSlice::new(Arc::new(Vec::new()))
}
@@ -17,21 +15,27 @@ impl SharedVecSlice {
pub fn new(data: Arc<Vec<u8>>) -> SharedVecSlice {
let data_len = data.len();
SharedVecSlice {
data: data,
data,
start: 0,
len: data_len,
}
}
pub fn as_slice(&self,) -> &[u8] {
pub fn as_slice(&self) -> &[u8] {
&self.data[self.start..self.start + self.len]
}
pub fn slice(&self, from_offset: usize, to_offset:usize) -> SharedVecSlice {
pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice {
SharedVecSlice {
data: self.data.clone(),
data: Arc::clone(&self.data),
start: self.start + from_offset,
len: to_offset - from_offset,
}
}
}
impl From<Vec<u8>> for SharedVecSlice {
    /// Moves `data` behind an `Arc` and builds the slice with
    /// `SharedVecSlice::new`.
    fn from(data: Vec<u8>) -> SharedVecSlice {
        let shared_data = Arc::new(data);
        SharedVecSlice::new(shared_data)
    }
}

View File

@@ -1,109 +1,138 @@
#![allow(enum_variant_names)]
/// Definition of Tantivy's error and result.
//! Definition of Tantivy's error and result.
use std::io;
use std::path::PathBuf;
use std::error;
use std::sync::PoisonError;
use directory::error::{OpenReadError, OpenWriteError, OpenDirectoryError};
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use query;
use schema;
use fastfield::FastFieldNotAvailableError;
use serde_json;
/// Generic tantivy error.
///
/// Any specialized error return in tantivy can be converted in `tantivy::Error`.
#[derive(Debug)]
pub enum Error {
/// Path does not exist.
PathDoesNotExist(PathBuf),
/// File already exists, this is a problem when we try to write into a new file.
FileAlreadyExists(PathBuf),
/// IO Error
IOError(io::Error),
/// A thread holding the locked panicked and poisoned the lock.
Poisoned,
/// The data within is corrupted.
///
/// For instance, it contains invalid JSON.
CorruptedFile(PathBuf, Box<error::Error + Send + Sync>),
/// Invalid argument was passed by the user.
InvalidArgument(String),
/// An Error happened in one of the thread
ErrorInThread(String),
/// An Error appeared related to the lack of a field.
SchemaError(String),
/// Tried to access a fastfield reader for a field not configured accordingly.
FastFieldError(FastFieldNotAvailableError)
}
// Declares tantivy's error kinds through the `error_chain!` macro.
// Each entry names a variant, its payload (if any), a short static
// `description`, and optionally a `display` format that includes the payload.
// The `From` impls in this file build errors via `ErrorKind::<Variant>(..).into()`.
error_chain!(
errors {
/// Path does not exist.
PathDoesNotExist(buf: PathBuf) {
description("path does not exist")
display("path does not exist: '{:?}'", buf)
}
/// File already exists. This is a problem when we try to write into a new file.
FileAlreadyExists(buf: PathBuf) {
description("file already exists")
display("file already exists: '{:?}'", buf)
}
/// IO error.
IOError(err: IOError) {
description("an IO error occurred")
display("an IO error occurred: '{}'", err)
}
/// The data within is corrupted.
///
/// For instance, it contains invalid JSON.
CorruptedFile(buf: PathBuf) {
description("file contains corrupted data")
display("file contains corrupted data: '{:?}'", buf)
}
/// A thread holding the lock panicked and poisoned the lock.
Poisoned {
description("a thread holding the locked panicked and poisoned the lock")
}
/// An invalid argument was passed by the user.
InvalidArgument(arg: String) {
description("an invalid argument was passed")
display("an invalid argument was passed: '{}'", arg)
}
/// An error happened in one of the threads.
ErrorInThread(err: String) {
description("an error occurred in a thread")
display("an error occurred in a thread: '{}'", err)
}
/// An error related to the lack of a field in the schema.
SchemaError(field: String) {
description("a schema field is missing")
display("a schema field is missing: '{}'", field)
}
/// Tried to access a fast field reader for a field not configured accordingly.
FastFieldError(err: FastFieldNotAvailableError) {
description("fast field not available")
display("fast field not available: '{:?}'", err)
}
}
);
impl From<FastFieldNotAvailableError> for Error {
fn from(fastfield_error: FastFieldNotAvailableError) -> Error {
Error::FastFieldError(fastfield_error)
ErrorKind::FastFieldError(fastfield_error).into()
}
}
impl From<IOError> for Error {
fn from(io_error: IOError) -> Error {
ErrorKind::IOError(io_error).into()
}
}
impl From<io::Error> for Error {
fn from(io_error: io::Error) -> Error {
Error::IOError(io_error)
ErrorKind::IOError(io_error.into()).into()
}
}
impl From<query::QueryParserError> for Error {
fn from(parsing_error: query::QueryParserError) -> Error {
Error::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
ErrorKind::InvalidArgument(format!("Query is invalid. {:?}", parsing_error)).into()
}
}
impl<Guard> From<PoisonError<Guard>> for Error {
fn from(_: PoisonError<Guard>) -> Error {
Error::Poisoned
ErrorKind::Poisoned.into()
}
}
impl From<OpenReadError> for Error {
fn from(error: OpenReadError) -> Error {
match error {
OpenReadError::FileDoesNotExist(filepath) => Error::PathDoesNotExist(filepath),
OpenReadError::IOError(io_error) => Error::IOError(io_error),
OpenReadError::FileDoesNotExist(filepath) => {
ErrorKind::PathDoesNotExist(filepath).into()
}
OpenReadError::IOError(io_error) => ErrorKind::IOError(io_error).into(),
}
}
}
impl From<schema::DocParsingError> for Error {
fn from(error: schema::DocParsingError) -> Error {
Error::InvalidArgument(format!("Failed to parse document {:?}", error))
ErrorKind::InvalidArgument(format!("Failed to parse document {:?}", error)).into()
}
}
impl From<OpenWriteError> for Error {
fn from(error: OpenWriteError) -> Error {
match error {
OpenWriteError::FileAlreadyExists(filepath) =>
Error::FileAlreadyExists(filepath),
OpenWriteError::IOError(io_error) =>
Error::IOError(io_error),
}
OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath),
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
}.into()
}
}
impl From<OpenDirectoryError> for Error {
fn from(error: OpenDirectoryError) -> Error {
match error {
OpenDirectoryError::DoesNotExist(directory_path) =>
Error::PathDoesNotExist(directory_path),
OpenDirectoryError::NotADirectory(directory_path) =>
Error::InvalidArgument(format!("{:?} is not a directory", directory_path)),
OpenDirectoryError::DoesNotExist(directory_path) => {
ErrorKind::PathDoesNotExist(directory_path).into()
}
OpenDirectoryError::NotADirectory(directory_path) => ErrorKind::InvalidArgument(
format!("{:?} is not a directory", directory_path),
).into(),
}
}
}
impl From<serde_json::Error> for Error {
fn from(error: serde_json::Error) -> Error {
Error::IOError(error.into())
let io_err = io::Error::from(error);
ErrorKind::IOError(io_err.into()).into()
}
}
}

View File

@@ -6,7 +6,7 @@ use directory::ReadOnlySource;
use DocId;
use common::HasLen;
/// Write a delete BitSet
/// Write a delete `BitSet`
///
/// where `delete_bitset` is the set of deleted `DocId`.
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
@@ -21,8 +21,7 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io:
writer.write_all(&[byte])?;
shift = 0;
byte = 0;
}
else {
} else {
shift += 1;
}
}
@@ -36,20 +35,18 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io:
#[derive(Clone)]
pub struct DeleteBitSet {
data: ReadOnlySource,
len: usize,
len: usize,
}
impl DeleteBitSet {
/// Opens a delete bitset given its data source.
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
let num_deleted: usize = data
.as_slice()
let num_deleted: usize = data.as_slice()
.iter()
.map(|b| b.count_ones() as usize)
.sum();
DeleteBitSet {
data: data,
data,
len: num_deleted,
}
}
@@ -68,21 +65,19 @@ impl DeleteBitSet {
}
/// Returns true iff the document is deleted.
#[inline]
pub fn is_deleted(&self, doc: DocId) -> bool {
if self.len == 0 {
false
}
else {
} else {
let byte_offset = doc / 8u32;
let b: u8 = (*self.data)[byte_offset as usize];
let shift = (doc & 7u32) as u8;
b & (1u8 << shift) != 0
b & (1u8 << shift) != 0
}
}
}
impl HasLen for DeleteBitSet {
fn len(&self) -> usize {
self.len
@@ -132,4 +127,4 @@ mod tests {
test_delete_bitset_helper(&bitset);
}
}
}
}

View File

@@ -1,7 +1,7 @@
use std::result;
use schema::FieldEntry;
/// FastFieldNotAvailableError is returned when the
/// `FastFieldNotAvailableError` is returned when the
/// user requested for a fast field reader, and the field was not
/// defined in the schema as a fast field.
#[derive(Debug)]
@@ -10,9 +10,8 @@ pub struct FastFieldNotAvailableError {
}
impl FastFieldNotAvailableError {
/// Creates a `FastFieldNotAvailable` error.
/// `field_entry` is the configuration of the field
/// `field_entry` is the configuration of the field
/// for which fast fields are not available.
pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError {
FastFieldNotAvailableError {
@@ -21,6 +20,5 @@ impl FastFieldNotAvailableError {
}
}
/// Result when trying to access a fast field reader.
pub type Result<R> = result::Result<R, FastFieldNotAvailableError>;
pub type Result<R> = result::Result<R, FastFieldNotAvailableError>;

View File

@@ -1,25 +1,27 @@
//! # Fast fields
//!
//! Fast fields are the equivalent of `DocValues` in `Lucene`.
//! Fast fields is a non-compressed column-oriented fashion storage
//! of `tantivy`.
//!
//! It is designed for the fast random access of some document
//! fields given a document id.
//!
//! `FastField` are useful when a field is required for all or most of
//! the `DocSet` : for instance for scoring, grouping, filtering, or facetting.
//!
//!
//! Fields have to be declared as `FAST` in the schema.
//! Currently only 64-bits integers (signed or unsigned) are
//! supported.
//!
//! They are stored in a bitpacked fashion so that their
//! memory usage is directly linear with the amplitude of the
//! values stored.
//!
//! Read access performance is comparable to that of an array lookup.
/*!
Column oriented field storage for tantivy.
It is the equivalent of `Lucene`'s `DocValues`.
Fast fields are the column-oriented storage of `tantivy`.
It is designed for the fast random access of some document
fields given a document id.
`FastField` are useful when a field is required for all or most of
the `DocSet` : for instance for scoring, grouping, filtering, or faceting.
Fields have to be declared as `FAST` in the schema.
Currently only 64-bit integers (signed or unsigned) are
supported.
They are stored in a bit-packed fashion so that their
memory usage is directly linear with the amplitude of the
values stored.
Read access performance is comparable to that of an array lookup.
*/
mod reader;
mod writer;
@@ -30,17 +32,17 @@ mod delete;
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
pub use self::reader::{FastFieldsReader, U64FastFieldReader, I64FastFieldReader};
pub use self::reader::{I64FastFieldReader, U64FastFieldReader};
pub use self::reader::FastFieldReader;
pub use self::serializer::FastFieldSerializer;
pub use self::error::{Result, FastFieldNotAvailableError};
pub use self::error::{FastFieldNotAvailableError, Result};
#[cfg(test)]
mod tests {
use super::*;
use schema::Field;
use std::path::Path;
use directory::{Directory, WritePtr, RAMDirectory};
use directory::{Directory, RAMDirectory, WritePtr};
use schema::Document;
use schema::{Schema, SchemaBuilder};
use schema::FAST;
@@ -49,6 +51,7 @@ mod tests {
use fastfield::FastFieldReader;
use rand::Rng;
use rand::SeedableRng;
use common::CompositeFile;
use rand::XorShiftRng;
lazy_static! {
@@ -57,7 +60,7 @@ mod tests {
schema_builder.add_u64_field("field", FAST);
schema_builder.build()
};
static ref FIELD: Field = {
static ref FIELD: Field = {
SCHEMA.get_field("field").unwrap()
};
}
@@ -67,13 +70,13 @@ mod tests {
doc.add_u64(field, value);
fast_field_writers.add_document(&doc);
}
#[test]
pub fn test_fastfield() {
let test_fastfield = U64FastFieldReader::from(vec!(100,200,300));
let test_fastfield = U64FastFieldReader::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);
assert_eq!(test_fastfield.get(2), 300);
}
#[test]
@@ -82,7 +85,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64);
@@ -92,11 +95,12 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 31 as usize);
assert_eq!(source.len(), 35 as usize);
}
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
let composite_file = CompositeFile::open(&source).unwrap();
let field_source = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader: U64FastFieldReader = U64FastFieldReader::open(field_source);
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
@@ -109,7 +113,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64);
@@ -125,11 +129,12 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 56 as usize);
assert_eq!(source.len(), 60 as usize);
}
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_field_reader: U64FastFieldReader =
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
@@ -141,16 +146,15 @@ mod tests {
assert_eq!(fast_field_reader.get(8), 215u64);
}
}
#[test]
fn test_intfastfield_null_amplitude() {
#[test]
fn test_intfastfield_null_amplitude() {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for _ in 0..10_000 {
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64);
@@ -160,49 +164,58 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 29 as usize);
assert_eq!(source.len(), 33 as usize);
}
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_field_reader: U64FastFieldReader =
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u64);
}
}
}
#[test]
fn test_intfastfield_large_numbers() {
#[test]
fn test_intfastfield_large_numbers() {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
// forcing the amplitude to be high
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
for i in 0u64..10_000u64 {
add_single_field_doc(&mut fast_field_writers, *FIELD, 5_000_000_000_000_000_000u64 + i);
add_single_field_doc(
&mut fast_field_writers,
*FIELD,
5_000_000_000_000_000_000u64 + i,
);
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 80037 as usize);
assert_eq!(source.len(), 80041 as usize);
}
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_field_reader: U64FastFieldReader =
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(fast_field_reader.get(doc), 5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
assert_eq!(
fast_field_reader.get(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64
);
}
}
}
#[test]
fn test_signed_intfastfield() {
let path = Path::new("test");
@@ -213,7 +226,7 @@ mod tests {
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for i in -100i64..10_000i64 {
let mut doc = Document::default();
@@ -225,16 +238,23 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 17704 as usize);
assert_eq!(source.len(), 17708 as usize);
}
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: I64FastFieldReader = fast_field_readers.open_reader(i64_field).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_field_reader: I64FastFieldReader =
I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() {
assert_eq!(fast_field_reader.get(doc as u32), i);
}
let mut buffer = vec![0i64; 100];
fast_field_reader.get_range(53, &mut buffer[..]);
for i in 0..100 {
assert_eq!(buffer[i], -100i64 + 53i64 + i as i64);
}
}
}
@@ -248,18 +268,19 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let doc = Document::default();
fast_field_writers.add_document(&doc);
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: I64FastFieldReader = fast_field_readers.open_reader(i64_field).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_field_reader: I64FastFieldReader =
I64FastFieldReader::open(fast_fields_composite.open_read(i64_field).unwrap());
assert_eq!(fast_field_reader.get(0u32), 0i64);
}
}
@@ -280,7 +301,7 @@ mod tests {
let mut directory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
@@ -290,11 +311,12 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_field_reader: U64FastFieldReader =
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
let mut a = 0u64;
for _ in 0..n {
println!("i {}=> {} {}", a, fast_field_reader.get(a as u32), permutation[a as usize]);
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
a = fast_field_reader.get(a as u32);
}
@@ -307,7 +329,7 @@ mod tests {
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n).step_by(7) {
for i in Iterator::step_by((0u32..n), 7) {
a ^= permutation[i as usize];
}
a
@@ -334,7 +356,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
@@ -344,12 +366,14 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_field_reader: U64FastFieldReader =
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in (0u32..n).step_by(7) {
for i in Iterator::step_by((0u32..n), 7) {
a ^= fast_field_reader.get(i);
}
a
@@ -364,7 +388,7 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
@@ -374,8 +398,10 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let fast_field_reader: U64FastFieldReader =
U64FastFieldReader::open(fast_fields_composite.open_read(*FIELD).unwrap());
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;

View File

@@ -1,141 +1,166 @@
use std::io;
use std::collections::HashMap;
use directory::ReadOnlySource;
use common::BinarySerializable;
use common::{self, BinarySerializable};
use common::bitpacker::{compute_num_bits, BitUnpacker};
use DocId;
use schema::{Field, SchemaBuilder};
use schema::SchemaBuilder;
use std::path::Path;
use schema::FAST;
use directory::{WritePtr, RAMDirectory, Directory};
use fastfield::FastFieldSerializer;
use fastfield::FastFieldsWriter;
use common::bitpacker::compute_num_bits;
use common::bitpacker::BitUnpacker;
use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::{FastFieldSerializer, FastFieldsWriter};
use schema::FieldType;
use common;
use std::mem;
use common::CompositeFile;
use owning_ref::OwningRef;
/// Trait for accessing a fastfield.
///
/// Depending on the field type, a different
/// fast field is required.
pub trait FastFieldReader: Sized {
/// Type of the value stored in the fastfield.
type ValueType;
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
// `maxdoc`.
fn get(&self, doc: DocId) -> Self::ValueType;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: u32, output: &mut [Self::ValueType]);
/// Opens a fast field given a source.
fn open(source: ReadOnlySource) -> Self;
/// Returns true iff the given field_type makes
/// it possible to access the field values via a
/// it possible to access the field values via a
/// fastfield.
fn is_enabled(field_type: &FieldType) -> bool;
}
/// FastFieldReader for unsigned 64-bits integers.
/// `FastFieldReader` for unsigned 64-bits integers.
pub struct U64FastFieldReader {
_data: ReadOnlySource,
bit_unpacker: BitUnpacker,
bit_unpacker: BitUnpacker<OwningRef<ReadOnlySource, [u8]>>,
min_value: u64,
max_value: u64,
}
impl U64FastFieldReader {
/// Returns the minimum value for this fast field.
///
/// The min value does not take in account of possible
/// deleted document, and should be considered as a lower bound
/// deleted document, and should be considered as a lower bound
/// of the actual minimum value.
pub fn min_value(&self,) -> u64 {
pub fn min_value(&self) -> u64 {
self.min_value
}
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
pub fn max_value(&self,) -> u64 {
pub fn max_value(&self) -> u64 {
self.max_value
}
}
impl FastFieldReader for U64FastFieldReader {
type ValueType = u64;
fn get(&self, doc: DocId) -> u64 {
self.min_value + self.bit_unpacker.get(doc as usize)
}
fn is_enabled(field_type: &FieldType) -> bool {
match field_type {
&FieldType::U64(ref integer_options) =>
integer_options.is_fast(),
match *field_type {
FieldType::U64(ref integer_options) => integer_options.is_fast(),
_ => false,
}
}
fn get_range(&self, start: u32, output: &mut [Self::ValueType]) {
self.bit_unpacker.get_range(start, output);
for out in output.iter_mut() {
*out += self.min_value;
}
}
/// Opens a new fast field reader given a read only source.
///
/// # Panics
/// Panics if the data is corrupted.
fn open(data: ReadOnlySource) -> U64FastFieldReader {
let min_value: u64;
let max_value: u64;
let bit_unpacker: BitUnpacker;
let amplitude: u64;
{
let mut cursor: &[u8] = data.as_slice();
min_value = u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
let amplitude = u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
bit_unpacker = BitUnpacker::new(cursor, num_bits as usize)
let mut cursor = data.as_slice();
min_value =
u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
amplitude =
u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
}
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let owning_ref = OwningRef::new(data).map(|data| &data[16..]);
let bit_unpacker = BitUnpacker::new(owning_ref, num_bits as usize);
U64FastFieldReader {
_data: data,
bit_unpacker: bit_unpacker,
min_value: min_value,
max_value: max_value,
bit_unpacker: bit_unpacker,
}
}
}
impl From<Vec<u64>> for U64FastFieldReader {
fn from(vals: Vec<u64>) -> U64FastFieldReader {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("test");
let path = Path::new("__dummy__");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let write: WritePtr = directory
.open_write(path)
.expect("With a RAMDirectory, this should never fail.");
let mut serializer = FastFieldSerializer::from_write(write)
.expect("With a RAMDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for val in vals {
let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
fast_field_writer.add_val(val);
{
let fast_field_writer = fast_field_writers
.get_field_writer(field)
.expect("With a RAMDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val);
}
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
let fast_field_readers = FastFieldsReader::open(source).unwrap();
fast_field_readers.open_reader(field).unwrap()
}
let source = directory.open_read(path).expect("Failed to open the file");
let composite_file =
CompositeFile::open(&source).expect("Failed to read the composite file");
let field_source = composite_file
.open_read(field)
.expect("File component not found");
U64FastFieldReader::open(field_source)
}
}
/// FastFieldReader for signed 64-bits integers.
/// `FastFieldReader` for signed 64-bits integers.
pub struct I64FastFieldReader {
underlying: U64FastFieldReader,
}
@@ -144,118 +169,62 @@ impl I64FastFieldReader {
/// Returns the minimum value for this fast field.
///
/// The min value does not take in account of possible
/// deleted document, and should be considered as a lower bound
/// deleted document, and should be considered as a lower bound
/// of the actual minimum value.
pub fn min_value(&self,) -> i64 {
pub fn min_value(&self) -> i64 {
common::u64_to_i64(self.underlying.min_value())
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
pub fn max_value(&self,) -> i64 {
pub fn max_value(&self) -> i64 {
common::u64_to_i64(self.underlying.max_value())
}
}
impl FastFieldReader for I64FastFieldReader {
type ValueType = i64;
///
///
/// # Panics
///
/// May panic or return wrong random result if `doc`
/// is greater or equal to the segment's `maxdoc`.
fn get(&self, doc: DocId) -> i64 {
common::u64_to_i64(self.underlying.get(doc))
}
///
/// # Panics
///
/// May panic or return wrong random result if `doc`
/// is greater or equal to the segment's `maxdoc`.
fn get_range(&self, start: u32, output: &mut [Self::ValueType]) {
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
self.underlying.get_range(start, output_u64);
for mut_val in output_u64.iter_mut() {
*mut_val = common::u64_to_i64(*mut_val as u64) as u64;
}
}
/// Opens a new fast field reader given a read only source.
///
/// # Panics
/// Panics if the data is corrupted.
fn open(data: ReadOnlySource) -> I64FastFieldReader {
I64FastFieldReader {
underlying: U64FastFieldReader::open(data)
underlying: U64FastFieldReader::open(data),
}
}
fn is_enabled(field_type: &FieldType) -> bool {
match field_type {
&FieldType::I64(ref integer_options) => {
if integer_options.is_fast() {
true
}
else {
false
}
},
match *field_type {
FieldType::I64(ref integer_options) => integer_options.is_fast(),
_ => false,
}
}
}
/// The FastFieldsReader` is the datastructure containing
/// all of the fast fields' data.
///
/// It contains a mapping that associated these fields to
/// the proper slice in the fastfield reader file.
pub struct FastFieldsReader {
source: ReadOnlySource,
field_offsets: HashMap<Field, (u32, u32)>,
}
impl FastFieldsReader {
/// Opens the `FastFieldsReader` file
///
/// When opening the fast field reader, the
/// the list of the offset is read (as a footer of the
/// data file).
pub fn open(source: ReadOnlySource) -> io::Result<FastFieldsReader> {
let header_offset;
let field_offsets: Vec<(Field, u32)>;
{
let buffer = source.as_slice();
{
let mut cursor = buffer;
header_offset = u32::deserialize(&mut cursor)?;
}
{
let mut cursor = &buffer[header_offset as usize..];
field_offsets = Vec::deserialize(&mut cursor)?;
}
}
let mut end_offsets: Vec<u32> = field_offsets
.iter()
.map(|&(_, offset)| offset)
.collect();
end_offsets.push(header_offset);
let mut field_offsets_map: HashMap<Field, (u32, u32)> = HashMap::new();
for (field_start_offsets, stop_offset) in field_offsets.iter().zip(end_offsets.iter().skip(1)) {
let (field, start_offset) = *field_start_offsets;
field_offsets_map.insert(field, (start_offset, *stop_offset));
}
Ok(FastFieldsReader {
field_offsets: field_offsets_map,
source: source,
})
}
/// Returns the u64 fast value reader if the field
/// is a u64 field indexed as "fast".
///
/// Return None if the field is not a u64 field
/// indexed with the fast option.
///
/// # Panics
/// May panic if the index is corrupted.
pub fn open_reader<FFReader: FastFieldReader>(&self, field: Field) -> Option<FFReader> {
self.field_offsets
.get(&field)
.map(|&(start, stop)| {
let field_source = self.source.slice(start as usize, stop as usize);
FFReader::open(field_source)
})
}
}

View File

@@ -2,14 +2,15 @@ use common::BinarySerializable;
use directory::WritePtr;
use schema::Field;
use common::bitpacker::{compute_num_bits, BitPacker};
use std::io::{self, Write, Seek, SeekFrom};
use common::CountingWriter;
use common::CompositeWrite;
use std::io::{self, Write};
/// `FastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
///
/// Fast fields are encoded using bit-packing.
///
///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
/// The serializer expects to receive the following calls.
@@ -26,81 +27,68 @@ use std::io::{self, Write, Seek, SeekFrom};
/// * `close_field()`
/// * `close()`
pub struct FastFieldSerializer {
write: WritePtr,
written_size: usize,
fields: Vec<(Field, u32)>,
min_value: u64,
field_open: bool,
bit_packer: BitPacker,
composite_write: CompositeWrite<WritePtr>,
}
impl FastFieldSerializer {
/// Constructor
pub fn new(mut write: WritePtr) -> io::Result<FastFieldSerializer> {
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
// just making room for the pointer to header.
let written_size: usize = try!(0u32.serialize(&mut write));
Ok(FastFieldSerializer {
write: write,
written_size: written_size,
fields: Vec::new(),
min_value: 0,
field_open: false,
bit_packer: BitPacker::new(0),
let composite_write = CompositeWrite::wrap(write);
Ok(FastFieldSerializer { composite_write })
}
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field(field);
FastSingleFieldSerializer::open(field_write, min_value, max_value)
}
/// Closes the serializer
///
/// After this call the data must be persistently save on disk.
pub fn close(self) -> io::Result<()> {
self.composite_write.close()
}
}
pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
}
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
let num_bits = compute_num_bits(amplitude);
let bit_packer = BitPacker::new(num_bits as usize);
Ok(FastSingleFieldSerializer {
write,
bit_packer,
min_value,
})
}
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(&mut self, field: Field, min_value: u64, max_value: u64) -> io::Result<()> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
}
self.min_value = min_value;
self.field_open = true;
self.fields.push((field, self.written_size as u32));
let write: &mut Write = &mut self.write;
self.written_size += try!(min_value.serialize(write));
let amplitude = max_value - min_value;
self.written_size += try!(amplitude.serialize(write));
let num_bits = compute_num_bits(amplitude);
self.bit_packer = BitPacker::new(num_bits as usize);
Ok(())
}
/// Pushes a new value to the currently open u64 fast field.
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer.write(val_to_write, &mut self.write)?;
Ok(())
}
/// Close the u64 fast field.
pub fn close_field(&mut self,) -> io::Result<()> {
if !self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
}
self.field_open = false;
// adding some padding to make sure we
// can read the last elements with our u64
// cursor
self.written_size += self.bit_packer.close(&mut self.write)?;
Ok(())
}
/// Closes the serializer
///
/// After this call the data must be persistently save on disk.
pub fn close(mut self,) -> io::Result<usize> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
}
let header_offset: usize = self.written_size;
self.written_size += try!(self.fields.serialize(&mut self.write));
try!(self.write.seek(SeekFrom::Start(0)));
try!((header_offset as u32).serialize(&mut self.write));
try!(self.write.flush());
Ok(self.written_size)
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)
}
}

View File

@@ -1,10 +1,12 @@
use schema::{Schema, Field, Document};
use schema::{Document, Field, Schema};
use fastfield::FastFieldSerializer;
use std::io;
use schema::Value;
use DocId;
use common;
use schema::FieldType;
use common;
use common::VInt;
use common::BinarySerializable;
/// The fastfieldswriter regroup all of the fast field writers.
pub struct FastFieldsWriter {
@@ -12,7 +14,6 @@ pub struct FastFieldsWriter {
}
impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let field_writers: Vec<IntFastFieldWriter> = schema
@@ -21,46 +22,39 @@ impl FastFieldsWriter {
.enumerate()
.flat_map(|(field_id, field_entry)| {
let field = Field(field_id as u32);
match field_entry.field_type() {
&FieldType::I64(ref int_options) => {
match *field_entry.field_type() {
FieldType::I64(ref int_options) => {
if int_options.is_fast() {
let mut fast_field_writer = IntFastFieldWriter::new(field);
fast_field_writer.set_val_if_missing(common::i64_to_u64(0i64));
Some(fast_field_writer)
}
else {
} else {
None
}
}
&FieldType::U64(ref int_options) => {
FieldType::U64(ref int_options) => {
if int_options.is_fast() {
Some(IntFastFieldWriter::new(field))
}
else {
} else {
None
}
}
_ => None
_ => None,
}
})
})
.collect();
FastFieldsWriter {
field_writers: field_writers,
}
FastFieldsWriter { field_writers }
}
/// Returns a `FastFieldsWriter`
/// with a `IntFastFieldWriter` for each
/// Returns a `FastFieldsWriter`
/// with a `IntFastFieldWriter` for each
/// of the field given in argument.
pub fn new(fields: Vec<Field>) -> FastFieldsWriter {
FastFieldsWriter {
field_writers: fields
.into_iter()
.map(IntFastFieldWriter::new)
.collect(),
field_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(),
}
}
/// Get the `FastFieldWriter` associated to a field.
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
@@ -68,7 +62,6 @@ impl FastFieldsWriter {
.iter_mut()
.find(|field_writer| field_writer.field == field)
}
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) {
@@ -77,7 +70,7 @@ impl FastFieldsWriter {
}
}
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
for field_writer in &self.field_writers {
@@ -85,10 +78,10 @@ impl FastFieldsWriter {
}
Ok(())
}
/// Ensures all of the fast field writers have
/// reached `doc`. (included)
///
///
/// The missing values will be filled with 0.
pub fn fill_val_up_to(&mut self, doc: DocId) {
for field_writer in &mut self.field_writers {
@@ -99,39 +92,44 @@ impl FastFieldsWriter {
/// Fast field writer for ints.
/// The fast field writer just keeps the values in memory.
///
///
/// Only when the segment writer can be closed and
/// persisted on disc, the fast field writer is
/// persisted on disc, the fast field writer is
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// bitpacked and the number of bits required for bitpacking
/// We cannot serialize earlier as the values are
/// bitpacked and the number of bits required for bitpacking
/// can only been known once we have seen all of the values.
///
///
/// Both u64, and i64 use the same writer.
/// i64 are just remapped to the `0..2^64 - 1`
/// using `common::i64_to_u64`.
pub struct IntFastFieldWriter {
field: Field,
vals: Vec<u64>,
vals: Vec<u8>,
val_count: usize,
val_if_missing: u64,
val_min: u64,
val_max: u64,
}
impl IntFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub fn new(field: Field) -> IntFastFieldWriter {
IntFastFieldWriter {
field: field,
field,
vals: Vec::new(),
val_count: 0,
val_if_missing: 0u64,
val_min: u64::max_value(),
val_max: 0,
}
}
/// Sets the default value.
///
/// This default value is recorded for documents if
/// This default value is recorded for documents if
/// a document does not have any value.
fn set_val_if_missing(&mut self, val_if_missing: u64) {
self.val_if_missing = val_if_missing;
@@ -139,13 +137,13 @@ impl IntFastFieldWriter {
/// Ensures all of the fast field writer have
/// reached `doc`. (included)
///
///
/// The missing values will be filled with 0.
fn fill_val_up_to(&mut self, doc: DocId) {
let target = doc as usize + 1;
debug_assert!(self.vals.len() <= target);
debug_assert!(self.val_count <= target);
let val_if_missing = self.val_if_missing;
while self.vals.len() < target {
while self.val_count < target {
self.add_val(val_if_missing);
}
}
@@ -156,11 +154,21 @@ impl IntFastFieldWriter {
/// associated to the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u64) {
self.vals.push(val);
}
VInt(val)
.serialize(&mut self.vals)
.expect("unable to serialize VInt to Vec");
/// Extract the value associated to the fast field for
if val > self.val_max {
self.val_max = val;
}
if val < self.val_min {
self.val_min = val;
}
self.val_count += 1;
}
/// Extract the value associated to the fast field for
/// this document.
///
/// i64 are remapped to u64 using the logic
@@ -172,16 +180,12 @@ impl IntFastFieldWriter {
/// only the first one is taken in account.
fn extract_val(&self, doc: &Document) -> u64 {
match doc.get_first(self.field) {
Some(v) => {
match *v {
Value::U64(ref val) => { *val },
Value::I64(ref val) => common::i64_to_u64(*val),
_ => { panic!("Expected a u64field, got {:?} ", v) }
}
Some(v) => match *v {
Value::U64(ref val) => *val,
Value::I64(ref val) => common::i64_to_u64(*val),
_ => panic!("Expected a u64field, got {:?} ", v),
},
None => {
self.val_if_missing
}
None => self.val_if_missing,
}
}
@@ -194,18 +198,19 @@ impl IntFastFieldWriter {
/// Push the fast fields value to the `FastFieldWriter`.
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
let zero = 0;
let min = *self.vals.iter().min().unwrap_or(&zero);
let max = *self.vals.iter().max().unwrap_or(&min);
serializer.new_u64_fast_field(self.field, min, max)?;
for &val in &self.vals {
serializer.add_val(val)?;
let (min, max) = if self.val_min > self.val_max {
(0, 0)
} else {
(self.val_min, self.val_max)
};
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
let mut cursor = self.vals.as_slice();
while let Ok(VInt(val)) = VInt::deserialize(&mut cursor) {
single_field_serializer.add_val(val)?;
}
serializer.close_field()
single_field_serializer.close_field()
}
}

View File

@@ -14,11 +14,10 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
#[test]
#[ignore]
fn test_indexing() {
let mut schema_builder = SchemaBuilder::default();
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
@@ -41,14 +40,11 @@ fn test_indexing() {
let searcher = index.searcher();
// check that everything is correct.
check_index_content(&searcher, &committed_docs);
}
else {
if committed_docs.remove(&random_val) ||
uncommitted_docs.remove(&random_val) {
} else {
if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
}
else {
} else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u64(id_field, random_val);

View File

@@ -3,20 +3,19 @@ use std::sync::{Arc, RwLock};
use std::mem;
use std::ops::DerefMut;
// The DeleteQueue is similar in conceptually to a multiple
// consumer single producer broadcast channel.
//
//
// All consumer will receive all messages.
//
//
// Consumer of the delete queue are holding a `DeleteCursor`,
// which points to a specific place of the `DeleteQueue`.
//
//
// New consumer can be created in two ways
// - calling `delete_queue.cursor()` returns a cursor, that
// - calling `delete_queue.cursor()` returns a cursor, that
// will include all future delete operation (and no past operations).
// - cloning an existing cursor returns a new cursor, that
// is at the exact same position, and can now advance independantly
// is at the exact same position, and can now advance independently
// from the original cursor.
#[derive(Default)]
struct InnerDeleteQueue {
@@ -29,34 +28,28 @@ pub struct DeleteQueue {
inner: Arc<RwLock<InnerDeleteQueue>>,
}
impl DeleteQueue {
// Creates a new delete queue.
pub fn new() -> DeleteQueue {
let delete_queue = DeleteQueue {
inner: Arc::default(),
};
let next_block = NextBlock::from(delete_queue.clone());
{
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
delete_queue_wlock.last_block = Some(
Arc::new(Block {
operations: Arc::default(),
next: next_block,
})
);
delete_queue_wlock.last_block = Some(Arc::new(Block {
operations: Arc::default(),
next: next_block,
}));
}
delete_queue
}
// Creates a new cursor that makes it possible to
// Creates a new cursor that makes it possible to
// consume future delete operations.
//
//
// Past delete operations are not accessible.
pub fn cursor(&self) -> DeleteCursor {
let last_block = self.inner
@@ -64,9 +57,11 @@ impl DeleteQueue {
.expect("Read lock poisoned when opening delete queue cursor")
.last_block
.clone()
.expect("Failed to unwrap last_block. This should never happen
.expect(
"Failed to unwrap last_block. This should never happen
as the Option<> is only here to make
initialization possible");
initialization possible",
);
let operations_len = last_block.operations.len();
DeleteCursor {
block: last_block,
@@ -85,40 +80,37 @@ impl DeleteQueue {
// DeleteQueue is a linked list of blocks of
// delete operations.
//
//
// Writing happens by simply appending to a vec.
// `.flush()` takes this pending delete operations vec
// creates a new read-only block from it,
// creates a new read-only block from it,
// and appends it to the linked list.
//
// `.flush()` happens when, for instance,
//
// `.flush()` happens when, for instance,
// a consumer reaches the last read-only operations.
// It then ask the delete queue if there happen to
// It then ask the delete queue if there happen to
// be some unflushed operations.
//
fn flush(&self) -> Option<Arc<Block>> {
let mut self_wlock = self
.inner
let mut self_wlock = self.inner
.write()
.expect("Failed to acquire write lock on delete queue writer");
let delete_operations;
{
let writer: &mut Vec<DeleteOperation> = &mut self_wlock.writer;
if writer.is_empty() {
return None;
}
delete_operations = mem::replace(writer, vec!());
delete_operations = mem::replace(writer, vec![]);
}
let next_block = NextBlock::from(self.clone());
{
self_wlock.last_block = Some(
Arc::new(Block {
operations: Arc::new(delete_operations),
next: next_block,
})
);
self_wlock.last_block = Some(Arc::new(Block {
operations: Arc::new(delete_operations),
next: next_block,
}));
}
self_wlock.last_block.clone()
}
@@ -137,17 +129,14 @@ impl From<DeleteQueue> for NextBlock {
}
}
impl NextBlock {
impl NextBlock {
fn next_block(&self) -> Option<Arc<Block>> {
{
let next_read_lock = self.0
.read()
.expect("Failed to acquire write lock in delete queue");
match *next_read_lock {
InnerNextBlock::Closed(ref block) => {
return Some(block.clone());
}
_ => {}
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
return Some(Arc::clone(block));
}
}
let next_block;
@@ -157,21 +146,19 @@ impl NextBlock {
.expect("Failed to acquire write lock in delete queue");
match *next_write_lock {
InnerNextBlock::Closed(ref block) => {
return Some(block.clone());
return Some(Arc::clone(block));
}
InnerNextBlock::Writer(ref writer) => {
match writer.flush() {
Some(flushed_next_block) => {
next_block = flushed_next_block;
}
None => {
return None;
}
InnerNextBlock::Writer(ref writer) => match writer.flush() {
Some(flushed_next_block) => {
next_block = flushed_next_block;
}
}
None => {
return None;
}
},
}
*next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone());
return Some(next_block)
*next_write_lock.deref_mut() = InnerNextBlock::Closed(Arc::clone(&next_block));
Some(next_block)
}
}
}
@@ -181,40 +168,37 @@ struct Block {
next: NextBlock,
}
#[derive(Clone)]
pub struct DeleteCursor {
block: Arc<Block>,
pos: usize,
}
impl DeleteCursor {
impl DeleteCursor {
/// Skips operations and position it so that
/// - either all of the delete operations currently in the
/// - either all of the delete operations currently in the
/// queue are consumed and the next get will return None.
/// - the next get will return the first operation with an
/// `opstamp >= target_opstamp`.
pub fn skip_to(&mut self, target_opstamp: u64) {
// TODO Can be optimized as we work with blocks.
#[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))]
loop {
if let Some(operation) = self.get() {
if operation.opstamp >= target_opstamp {
break;
}
}
else {
} else {
break;
}
self.advance();
}
}
/// If the current block has been entirely
/// If the current block has been entirely
/// consumed, try to load the next one.
///
/// Return `true`, if after this attempt,
///
/// Return `true`, if after this attempt,
/// the cursor is on a block that has not
/// been entirely consumed.
/// Return `false`, if we have reached the end of the queue.
@@ -229,24 +213,20 @@ impl DeleteCursor {
self.pos = 0;
true
}
None => {
false
}
None => false,
}
}
else {
} else {
true
}
}
/// Advance to the next delete operation.
/// Returns true iff there is such an operation.
pub fn advance(&mut self) -> bool {
if self.load_block_if_required() {
self.pos += 1;
true
}
else {
} else {
false
}
}
@@ -256,34 +236,27 @@ impl DeleteCursor {
pub fn get(&mut self) -> Option<&DeleteOperation> {
if self.load_block_if_required() {
Some(&self.block.operations[self.pos])
}
else {
} else {
None
}
}
}
#[cfg(test)]
mod tests {
use super::{DeleteQueue, DeleteOperation};
use schema::{Term, Field};
use super::{DeleteOperation, DeleteQueue};
use schema::{Field, Term};
#[test]
fn test_deletequeue() {
let delete_queue = DeleteQueue::new();
let make_op = |i: usize| {
let field = Field(1u32);
DeleteOperation {
opstamp: i as u64,
term: Term::from_field_u64(field, i as u64)
term: Term::from_field_u64(field, i as u64),
}
};
@@ -299,7 +272,7 @@ mod tests {
operations_it.advance();
assert!(operations_it.get().is_none());
operations_it.advance();
let mut snapshot2 = delete_queue.cursor();
assert!(snapshot2.get().is_none());
delete_queue.push(make_op(3));
@@ -310,7 +283,7 @@ mod tests {
assert!(operations_it.get().is_none());
operations_it.advance();
}
{
{
let mut operations_it = snapshot.clone();
assert_eq!(operations_it.get().unwrap().opstamp, 1);
operations_it.advance();
@@ -320,6 +293,5 @@ mod tests {
operations_it.advance();
assert!(operations_it.get().is_none());
}
}
}
}

View File

@@ -2,8 +2,6 @@ use Directory;
use directory::error::OpenWriteError;
use core::LOCKFILE_FILEPATH;
/// The directory lock is a mechanism used to
/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
///
@@ -15,8 +13,8 @@ pub struct DirectoryLock {
impl DirectoryLock {
pub fn lock(mut directory: Box<Directory>) -> Result<DirectoryLock, OpenWriteError> {
try!(directory.open_write(&*LOCKFILE_FILEPATH));
Ok(DirectoryLock { directory: directory })
directory.open_write(&*LOCKFILE_FILEPATH)?;
Ok(DirectoryLock { directory })
}
}
@@ -26,4 +24,4 @@ impl Drop for DirectoryLock {
error!("Failed to remove the lock file. {:?}", e);
}
}
}
}

View File

@@ -1,10 +1,9 @@
use std::sync::Arc;
use DocId;
// Doc to opstamp is used to identify which
// document should be deleted.
//
//
// Since the docset matching the query of a delete operation
// is not computed right when the delete operation is received,
// we need to find a way to evaluate, for each document,
@@ -14,13 +13,13 @@ use DocId;
//
// The doc to opstamp mapping stores precisely an array
// indexed by doc id and storing the opstamp of the document.
//
//
// This mapping is (for the moment) strictly increasing
// because of the way document ids are allocated.
#[derive(Clone)]
pub enum DocToOpstampMapping {
WithMap(Arc<Vec<u64>>),
None
None,
}
impl From<Vec<u64>> for DocToOpstampMapping {
@@ -30,9 +29,8 @@ impl From<Vec<u64>> for DocToOpstampMapping {
}
impl DocToOpstampMapping {
/// Given an opstamp, return the limit doc id L
/// such that, for every doc id D,
/// such that, for every doc id D,
// D >= L iff opstamp(D) >= `target_opstamp`.
//
// The edge case opstamp = some doc opstamp is in practise
@@ -41,8 +39,7 @@ impl DocToOpstampMapping {
match *self {
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
match doc_opstamps.binary_search(&target_opstamp) {
Ok(doc_id) => doc_id as DocId,
Err(doc_id) => doc_id as DocId,
Ok(doc_id) | Err(doc_id) => doc_id as DocId,
}
}
DocToOpstampMapping::None => DocId::max_value(),
@@ -58,23 +55,26 @@ mod tests {
#[test]
fn test_doc_to_opstamp_mapping_none() {
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1), u32::max_value());
assert_eq!(
doc_to_opstamp_mapping.compute_doc_limit(1),
u32::max_value()
);
}
#[test]
fn test_doc_to_opstamp_mapping_complex() {
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!());
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0);
}
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64));
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1);
}
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64, 12u64, 17u64, 23u64));
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64, 12u64, 17u64, 23u64]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
for i in 2u64..13u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1);
@@ -90,4 +90,4 @@ mod tests {
}
}
}
}
}

View File

@@ -9,11 +9,12 @@ use core::SegmentReader;
use indexer::stamper::Stamper;
use datastruct::stacker::Heap;
use directory::FileProtection;
use Error;
use error::{Error, ErrorKind, Result, ResultExt};
use Directory;
use fastfield::write_delete_bitset;
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
use futures::Canceled;
use datastruct::stacker::hashmap::split_memory;
use futures::Future;
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
use indexer::MergePolicy;
@@ -21,13 +22,12 @@ use indexer::operation::DeleteOperation;
use indexer::SegmentEntry;
use indexer::SegmentWriter;
use postings::DocSet;
use postings::SegmentPostingsOption;
use Result;
use schema::IndexRecordOption;
use schema::Document;
use schema::Schema;
use schema::Term;
use std::mem;
use std::mem::swap;
use std::mem::swap;
use std::thread::JoinHandle;
use super::directory_lock::DirectoryLock;
use super::operation::AddOperation;
@@ -41,7 +41,8 @@ pub const MARGIN_IN_BYTES: u32 = 1_000_000u32;
// We impose the memory per thread to be at least 3 MB.
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
// Add document will block if the number of docs waiting in the queue to be indexed reaches PIPELINE_MAX_SIZE_IN_DOCS
// Add document will block if the number of docs waiting in the queue to be indexed
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
type DocumentSender = chan::Sender<AddOperation>;
@@ -51,14 +52,13 @@ type DocumentReceiver = chan::Receiver<AddOperation>;
///
/// It manages a small number of indexing threads, as well as a shared
/// indexing queue.
/// Each indexing thread builds its own independant `Segment`, via
/// Each indexing thread builds its own independent `Segment`, via
/// a `SegmentWriter` object.
pub struct IndexWriter {
// the lock is just used to bind the
// the lock is just used to bind the
// lifetime of the lock with that of the IndexWriter.
_directory_lock: DirectoryLock,
_directory_lock: DirectoryLock,
index: Index,
heap_size_in_bytes_per_thread: usize,
@@ -86,8 +86,6 @@ pub struct IndexWriter {
impl !Send for IndexWriter {}
impl !Sync for IndexWriter {}
/// Open a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
@@ -96,7 +94,7 @@ impl !Sync for IndexWriter {}
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// num_threads specifies the number of indexing workers that
/// `num_threads` specifies the number of indexing workers that
/// should work at the same time.
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
@@ -105,81 +103,81 @@ impl !Sync for IndexWriter {}
pub fn open_index_writer(
index: &Index,
num_threads: usize,
heap_size_in_bytes_per_thread: usize) -> Result<IndexWriter> {
heap_size_in_bytes_per_thread: usize,
) -> Result<IndexWriter> {
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
panic!(format!("The heap size per thread needs to be at least {}.",
HEAP_SIZE_LIMIT));
panic!(format!(
"The heap size per thread needs to be at least {}.",
HEAP_SIZE_LIMIT
));
}
let directory_lock = try!(DirectoryLock::lock(index.directory().box_clone()));
let directory_lock = DirectoryLock::lock(index.directory().box_clone())?;
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new();
let current_opstamp = index.opstamp();
let stamper = Stamper::new(current_opstamp);
let segment_updater = SegmentUpdater::new(index.clone(),
stamper.clone(),
delete_queue.cursor())?;
let segment_updater =
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let mut index_writer = IndexWriter {
_directory_lock: directory_lock,
heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread,
heap_size_in_bytes_per_thread,
index: index.clone(),
document_receiver: document_receiver,
document_sender: document_sender,
document_receiver,
document_sender,
segment_updater: segment_updater,
segment_updater,
workers_join_handle: vec!(),
num_threads: num_threads,
workers_join_handle: vec![],
num_threads,
delete_queue: delete_queue,
delete_queue,
committed_opstamp: current_opstamp,
stamper: stamper,
stamper,
generation: 0,
worker_id: 0,
};
try!(index_writer.start_workers());
index_writer.start_workers()?;
Ok(index_writer)
}
pub fn compute_deleted_bitset(
delete_bitset: &mut BitSet,
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: DocToOpstampMapping,
target_opstamp: u64) -> Result<bool> {
doc_opstamps: &DocToOpstampMapping,
target_opstamp: u64,
) -> Result<bool> {
let mut might_have_changed = false;
#[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))]
loop {
if let Some(delete_op) = delete_cursor.get() {
if delete_op.opstamp > target_opstamp {
break;
}
else {
} else {
// A delete operation should only affect
// documents that were inserted before it.
//
//
// Limit doc helps identify the first document
// that may be affected by the delete operation.
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
if let Some(mut docset) =
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)
{
while docset.advance() {
let deleted_doc = docset.doc();
if deleted_doc < limit_doc {
@@ -189,8 +187,7 @@ pub fn compute_deleted_bitset(
}
}
}
}
else {
} else {
break;
}
delete_cursor.advance();
@@ -203,8 +200,8 @@ pub fn compute_deleted_bitset(
pub fn advance_deletes(
mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: u64) -> Result<Option<FileProtection>> {
target_opstamp: u64,
) -> Result<Option<FileProtection>> {
let mut file_protect: Option<FileProtection> = None;
{
@@ -214,26 +211,24 @@ pub fn advance_deletes(
return Ok(file_protect);
}
}
let segment_reader = SegmentReader::open(segment.clone())?;
let segment_reader = SegmentReader::open(&segment)?;
let max_doc = segment_reader.max_doc();
let mut delete_bitset: BitSet =
match segment_entry.delete_bitset() {
Some(ref previous_delete_bitset) =>
(*previous_delete_bitset).clone(),
None =>
BitSet::with_capacity(max_doc as usize)
};
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
None => BitSet::with_capacity(max_doc as usize),
};
let delete_cursor = segment_entry.delete_cursor();
compute_deleted_bitset(
&mut delete_bitset,
&segment_reader,
delete_cursor,
DocToOpstampMapping::None,
target_opstamp)?;
&DocToOpstampMapping::None,
target_opstamp,
)?;
for doc in 0u32..max_doc {
if segment_reader.is_deleted(doc) {
delete_bitset.insert(doc as usize);
@@ -253,25 +248,29 @@ pub fn advance_deletes(
Ok(file_protect)
}
fn index_documents(heap: &mut Heap,
segment: Segment,
schema: &Schema,
generation: usize,
document_iterator: &mut Iterator<Item=AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor)
-> Result<bool> {
fn index_documents(
heap: &mut Heap,
table_size: usize,
segment: &Segment,
schema: &Schema,
generation: usize,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
heap.clear();
let segment_id = segment.id();
let mut segment_writer = SegmentWriter::for_segment(heap, segment.clone(), &schema)?;
let mut segment_writer = SegmentWriter::for_segment(heap, table_size, segment.clone(), schema)?;
for doc in document_iterator {
try!(segment_writer.add_document(&doc, &schema));
segment_writer.add_document(&doc, schema)?;
// There are two possible conditions to close the segment.
// One is the memory arena dedicated to the segment is
// One is the memory arena dedicated to the segment is
// getting full.
if segment_writer.is_buffer_full() {
info!("Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc());
info!(
"Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
// The second is the term dictionary hash table
@@ -279,17 +278,19 @@ fn index_documents(heap: &mut Heap,
//
// Tantivy does not resize its hashtable. When it reaches
// capacity, we just stop indexing new document.
if segment_writer.is_termdic_saturated() {
info!("Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc());
if segment_writer.is_term_saturated() {
info!(
"Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc()
);
break;
}
}
let num_docs = segment_writer.max_doc();
// this is ensured by the call to peek before starting
// the worker thread.
assert!(num_docs > 0);
assert!(num_docs > 0);
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
@@ -297,7 +298,7 @@ fn index_documents(heap: &mut Heap,
segment_meta.set_max_doc(num_docs);
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
@@ -305,57 +306,61 @@ fn index_documents(heap: &mut Heap,
&mut deleted_bitset,
&segment_reader,
&mut delete_cursor,
doc_to_opstamps,
&doc_to_opstamps,
last_docstamp,
)?;
let segment_entry = SegmentEntry::new(
segment_meta,
delete_cursor,
{ if may_have_deletes { Some(deleted_bitset) }
else { None } }
);
Ok(
segment_updater
.add_segment(generation, segment_entry)
)
}
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
});
Ok(segment_updater.add_segment(generation, segment_entry))
}
impl IndexWriter {
/// The index writer
pub fn wait_merging_threads(mut self) -> Result<()> {
// this will stop the indexing thread,
// dropping the last reference to the segment_updater.
drop(self.document_sender);
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec!());
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
for join_handle in former_workers_handles {
try!(join_handle.join()
join_handle
.join()
.expect("Indexing Worker thread panicked")
.map_err(|e| {
Error::ErrorInThread(format!("Error in indexing worker thread. {:?}", e))
}));
.chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?;
}
drop(self.workers_join_handle);
let result = self.segment_updater
.wait_merging_thread()
.map_err(|_|
Error::ErrorInThread("Failed to join merging thread.".to_string())
);
if let &Err(ref e) = &result {
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
if let Err(ref e) = result {
error!("Some merging thread failed {:?}", e);
}
result
}
/// Registers the segment described by `segment_meta` with the
/// segment updater.
///
/// The meta is wrapped in a `SegmentEntry` together with a cursor
/// obtained from this writer's delete queue and no delete bitset
/// (`None`), and is then passed to the segment updater along with
/// the writer's current generation. Whatever the updater returns
/// is discarded.
#[doc(hidden)]
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
// Cursor taken from the delete queue at the time of the call.
let delete_cursor = self.delete_queue.cursor();
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
self.segment_updater
.add_segment(self.generation, segment_entry);
}
/// Returns a new `Segment`, as produced by the segment updater's
/// own `new_segment()`; this method only delegates to it.
#[doc(hidden)]
pub fn new_segment(&self) -> Segment {
self.segment_updater.new_segment()
}
/// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline.
///
@@ -363,48 +368,50 @@ impl IndexWriter {
let schema = self.index.schema();
let document_receiver_clone = self.document_receiver.clone();
let mut segment_updater = self.segment_updater.clone();
let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread);
let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread);
info!("heap size {}, table_size {}", heap_size, table_size);
let mut heap = Heap::with_capacity(heap_size);
let generation = self.generation;
let mut delete_cursor = self.delete_queue.cursor();
let join_handle: JoinHandle<Result<()>> =
thread::Builder::new()
.name(format!("indexing thread {} for gen {}", self.worker_id, generation))
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
.name(format!(
"indexing thread {} for gen {}",
self.worker_id, generation
))
.spawn(move || {
loop {
let mut document_iterator =
document_receiver_clone.clone().into_iter().peekable();
let mut document_iterator = document_receiver_clone.clone()
.into_iter()
.peekable();
// the peeking here is to avoid
// creating a new segment's files
// if no documents are available.
//
// this is a valid guarantee as the
// this is a valid guarantee as the
// peeked document now belongs to
// our local iterator.
if let Some(operation) = document_iterator.peek() {
delete_cursor.skip_to(operation.opstamp);
}
else {
} else {
// No more documents.
// Happens when there is a commit, or if the `IndexWriter`
// was dropped.
return Ok(())
return Ok(());
}
let segment = segment_updater.new_segment();
index_documents(&mut heap,
segment,
&schema,
generation,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone())?;
index_documents(
&mut heap,
table_size,
&segment,
&schema,
generation,
&mut document_iterator,
&mut segment_updater,
delete_cursor.clone(),
)?;
}
})?;
self.worker_id += 1;
@@ -421,10 +428,10 @@ impl IndexWriter {
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
self.segment_updater.set_merge_policy(merge_policy);
}
fn start_workers(&mut self) -> Result<()> {
for _ in 0..self.num_threads {
try!(self.add_indexing_worker());
self.add_indexing_worker()?;
}
Ok(())
}
@@ -436,7 +443,10 @@ impl IndexWriter {
}
/// Merges a given list of segments
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> impl Future<Item=SegmentMeta, Error=Canceled> {
pub fn merge(
&mut self,
segment_ids: &[SegmentId],
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
self.segment_updater.start_merge(segment_ids)
}
@@ -449,8 +459,10 @@ impl IndexWriter {
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) -> DocumentReceiver {
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
let (mut document_sender, mut document_receiver): (
DocumentSender,
DocumentReceiver,
) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
swap(&mut self.document_sender, &mut document_sender);
swap(&mut self.document_receiver, &mut document_receiver);
document_receiver
@@ -477,17 +489,12 @@ impl IndexWriter {
let heap_size_in_bytes_per_thread = self.heap_size_in_bytes_per_thread;
drop(self);
for _ in receiver_clone {}
let index_writer = open_index_writer(
&index,
num_threads,
heap_size_in_bytes_per_thread)?;
let index_writer = open_index_writer(&index, num_threads, heap_size_in_bytes_per_thread)?;
Ok(index_writer)
}
/// Commits all of the pending changes
///
/// A call to commit blocks.
@@ -503,7 +510,6 @@ impl IndexWriter {
/// that made it in the commit.
///
pub fn commit(&mut self) -> Result<u64> {
// here, because we join all of the worker threads,
// all of the segment update for this commit have been
// sent.
@@ -522,25 +528,26 @@ impl IndexWriter {
self.recreate_document_channel();
let mut former_workers_join_handle = Vec::new();
swap(&mut former_workers_join_handle,
&mut self.workers_join_handle);
swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
for worker_handle in former_workers_join_handle {
let indexing_worker_result = try!(worker_handle.join()
.map_err(|e| Error::ErrorInThread(format!("{:?}", e))));
try!(indexing_worker_result);
let indexing_worker_result = worker_handle
.join()
.map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?;
indexing_worker_result?;
// add a new worker for the next generation.
try!(self.add_indexing_worker());
self.add_indexing_worker()?;
}
// wait for the segment update thread to have processed the info
self.segment_updater
.commit(self.committed_opstamp)?;
self.segment_updater.commit(self.committed_opstamp)?;
Ok(self.committed_opstamp)
}
}
/// Delete all documents containing a given term.
///
@@ -548,20 +555,17 @@ impl IndexWriter {
/// were added in previous commits, and documents
/// that were added previously in the same commit.
///
/// Like adds, the deletion itself will be visible
/// Like adds, the deletion itself will be visible
/// only after calling `commit()`.
pub fn delete_term(&mut self, term: Term) -> u64 {
let opstamp = self.stamper.stamp();
let delete_operation = DeleteOperation {
opstamp: opstamp,
term: term,
};
let delete_operation = DeleteOperation { opstamp, term };
self.delete_queue.push(delete_operation);
opstamp
}
/// Returns the opstamp of the last successful commit.
///
///
/// This is, for instance, the opstamp the index will
/// rollback to if there is a failure like a power surge.
///
@@ -583,18 +587,12 @@ impl IndexWriter {
/// have been added since the creation of the index.
pub fn add_document(&mut self, document: Document) -> u64 {
let opstamp = self.stamper.stamp();
let add_operation = AddOperation {
opstamp: opstamp,
document: document,
};
let add_operation = AddOperation { opstamp, document };
self.document_sender.send(add_operation);
opstamp
}
}
#[cfg(test)]
mod tests {
@@ -602,7 +600,7 @@ mod tests {
use schema::{self, Document};
use Index;
use Term;
use Error;
use error::*;
use env_logger;
#[test]
@@ -611,20 +609,27 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer(40_000_000).unwrap();
match index.writer(40_000_000) {
Err(Error::FileAlreadyExists(_)) => {}
Err(Error(ErrorKind::FileAlreadyExists(_), _)) => {}
_ => panic!("Expected FileAlreadyExists error"),
}
}
#[test]
fn test_set_merge_policy() {
let schema_builder = schema::SchemaBuilder::default();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(40_000_000).unwrap();
assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }");
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
level_log_size: 0.75 }"
);
let merge_policy = box NoMergePolicy::default();
index_writer.set_merge_policy(merge_policy);
assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "NoMergePolicy");
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"NoMergePolicy"
);
}
#[test]
@@ -633,12 +638,12 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
{
let _index_writer = index.writer(40_000_000).unwrap();
// the lock should be released when the
// the lock should be released when the
// index_writer leaves the scope.
}
let _index_writer_two = index.writer(40_000_000).unwrap();
}
#[test]
fn test_commit_and_rollback() {
let mut schema_builder = schema::SchemaBuilder::default();
@@ -661,7 +666,7 @@ mod tests {
}
index_writer = index_writer.rollback().unwrap();
assert_eq!(index_writer.commit_opstamp(), 0u64);
assert_eq!(num_docs_containing("a"), 0);
@@ -685,7 +690,6 @@ mod tests {
index.searcher();
}
#[test]
fn test_with_merges() {
let _ = env_logger::init();
@@ -714,14 +718,14 @@ mod tests {
}
// this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed");
index_writer.wait_merging_threads().expect("waiting merging thread failed");
index_writer
.wait_merging_threads()
.expect("waiting merging thread failed");
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 200);
assert!(index.searchable_segments().unwrap().len() < 8);
}
}
}

View File

@@ -1,5 +1,4 @@
extern crate itertools;
use super::merge_policy::{MergePolicy, MergeCandidate};
use super::merge_policy::{MergeCandidate, MergePolicy};
use core::SegmentMeta;
use std::cmp;
use std::f64;
@@ -8,8 +7,7 @@ const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_MERGE_SIZE: usize = 8;
/// LogMergePolicy tries to merge segments that have a similar number of
/// `LogMergePolicy` tries to merge segments that have a similar number of
/// documents.
#[derive(Debug, Clone)]
pub struct LogMergePolicy {
@@ -24,7 +22,7 @@ impl LogMergePolicy {
}
/// Set the minimum number of segments that may be merged together.
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
self.min_merge_size = min_merge_size;
}
@@ -52,15 +50,17 @@ impl MergePolicy for LogMergePolicy {
return Vec::new();
}
let mut size_sorted_tuples = segments.iter()
let mut size_sorted_tuples = segments
.iter()
.map(|x| x.num_docs())
.enumerate()
.collect::<Vec<(usize, u32)>>();
size_sorted_tuples.sort_by(|x, y| y.cmp(x));
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples.into_iter()
.map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2()))
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
.into_iter()
.map(|(ind, num_docs)| (ind, f64::from(self.clip_min_size(num_docs)).log2()))
.collect();
let (first_ind, first_score) = size_sorted_log_tuples[0];
@@ -77,14 +77,10 @@ impl MergePolicy for LogMergePolicy {
levels
.iter()
.filter(|level| level.len() >= self.min_merge_size)
.map(|ind_vec| {
MergeCandidate(ind_vec.iter()
.map(|&ind| segments[ind].id())
.collect())
})
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
.collect()
}
fn box_clone(&self) -> Box<MergePolicy> {
box self.clone()
}
@@ -104,7 +100,7 @@ impl Default for LogMergePolicy {
mod tests {
use super::*;
use indexer::merge_policy::MergePolicy;
use core::{SegmentMeta, SegmentId};
use core::{SegmentId, SegmentMeta};
fn test_merge_policy() -> LogMergePolicy {
let mut log_merge_policy = LogMergePolicy::default();
@@ -128,9 +124,7 @@ mod tests {
#[test]
fn test_log_merge_policy_pair() {
let test_input = vec![seg_meta(10),
seg_meta(10),
seg_meta(10)];
let test_input = vec![seg_meta(10), seg_meta(10), seg_meta(10)];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);
}
@@ -138,12 +132,23 @@ mod tests {
#[test]
fn test_log_merge_policy_levels() {
// multiple levels all get merged correctly
let test_input = vec![seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000)];
// 2 MergeCandidates expected:
// * one with the 6 * 10-docs segments
// * one with the 3 * 1000-docs segments
// no MergeCandidate expected for the 2 * 10_000-docs segments as min_merge_size=3
let test_input = vec![
seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000),
seg_meta(10000),
seg_meta(10000),
seg_meta(10),
seg_meta(10),
seg_meta(10),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
@@ -151,24 +156,28 @@ mod tests {
#[test]
fn test_log_merge_policy_within_levels() {
// multiple levels all get merged correctly
let test_input = vec![seg_meta(10),
seg_meta(11),
seg_meta(12),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000)];
let test_input = vec![
seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
seg_meta(11), // log2(11) = ~3.46
seg_meta(12), // log2(12) = ~3.58
seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
seg_meta(1000), // log2(1000) = ~9.97
seg_meta(1000),
]; // log2(1000) = ~9.97
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
#[test]
fn test_log_merge_policy_small_segments() {
// multiple levels all get merged correctly
let test_input = vec![seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2)];
// segments under min_layer_size are merged together
let test_input = vec![
seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);
}

View File

@@ -3,27 +3,25 @@ use core::SegmentMeta;
use std::marker;
use std::fmt::Debug;
/// Set of segments suggested for a merge.
/// Set of segments suggested for a merge.
#[derive(Debug, Clone)]
pub struct MergeCandidate(pub Vec<SegmentId>);
/// The Merge policy defines which segments should be merged.
///
/// The `MergePolicy` defines which segments should be merged.
///
/// Every time the list of segments changes, the segment updater
/// asks the merge policy if some segments should be merged.
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
/// Given the list of segment metas, returns the list of merge candidates.
/// Given the list of segment metas, returns the list of merge candidates.
///
/// This call happens on the segment updater thread, and will block
/// other segment updates, so all implementations should happen rapidly.
/// This call happens on the segment updater thread, and will block
/// other segment updates, so all implementations should happen rapidly.
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
/// Returns a boxed clone of the MergePolicy.
fn box_clone(&self) -> Box<MergePolicy>;
}
/// Never merge segments.
/// Never merge segments.
#[derive(Debug)]
pub struct NoMergePolicy;
@@ -37,13 +35,12 @@ impl MergePolicy for NoMergePolicy {
fn compute_merge_candidates(&self, _segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
Vec::new()
}
fn box_clone(&self) -> Box<MergePolicy> {
box NoMergePolicy
}
}
#[cfg(test)]
pub mod tests {
@@ -51,8 +48,7 @@ pub mod tests {
use core::SegmentId;
use core::SegmentMeta;
/// Merge policy useful for test purposes.
/// `MergePolicy` useful for test purposes.
///
/// Every time there is more than one segment,
/// it will suggest to merge them.
@@ -66,15 +62,14 @@ pub mod tests {
.map(|segment_meta| segment_meta.id())
.collect::<Vec<SegmentId>>();
if segment_ids.len() > 1 {
vec!(MergeCandidate(segment_ids))
}
else {
vec!()
vec![MergeCandidate(segment_ids)]
} else {
vec![]
}
}
fn box_clone(&self) -> Box<MergePolicy> {
box MergeWheneverPossible
}
}
}
}

View File

@@ -1,22 +1,25 @@
use {Error, Result};
use error::{ErrorKind, Result};
use core::SegmentReader;
use core::Segment;
use DocId;
use core::SerializableSegment;
use schema::FieldValue;
use indexer::SegmentSerializer;
use postings::PostingsSerializer;
use postings::InvertedIndexSerializer;
use fastfield::U64FastFieldReader;
use itertools::Itertools;
use postings::Postings;
use postings::DocSet;
use core::TermIterator;
use fastfield::DeleteBitSet;
use schema::{Schema, Field};
use schema::{Field, Schema};
use termdict::TermMerger;
use fastfield::FastFieldSerializer;
use fastfield::FastFieldReader;
use store::StoreWriter;
use std::cmp::{min, max};
use std::cmp::{max, min};
use termdict::TermDictionary;
use schema::Term;
use termdict::TermStreamer;
pub struct IndexMerger {
schema: Schema,
@@ -24,42 +27,18 @@ pub struct IndexMerger {
max_doc: u32,
}
/// Scratch buffer used to turn a list of absolute positions into
/// the list of differences between consecutive positions.
struct DeltaPositionComputer {
    buffer: Vec<u32>,
}

impl DeltaPositionComputer {
    /// Creates a computer whose scratch buffer starts with 512 zeroed slots.
    fn new() -> DeltaPositionComputer {
        let buffer = vec![0u32; 512];
        DeltaPositionComputer { buffer }
    }

    /// Returns, for each entry of `positions`, its difference with the
    /// previous entry (the first entry is compared against `0`).
    ///
    /// The returned slice borrows the internal buffer and has the same
    /// length as `positions`. The subtraction is a plain `u32` subtraction,
    /// so `positions` is expected to be non-decreasing.
    fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] {
        let num_positions = positions.len();
        if self.buffer.len() < num_positions {
            // Grow the scratch buffer so that every delta has a slot.
            self.buffer.resize(num_positions, 0u32);
        }
        let mut previous = 0u32;
        for (slot, &current) in self.buffer[..num_positions].iter_mut().zip(positions) {
            *slot = current - previous;
            previous = current;
        }
        &self.buffer[..num_positions]
    }
}
fn compute_min_max_val(u64_reader: &U64FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u64, u64)> {
fn compute_min_max_val(
u64_reader: &U64FastFieldReader,
max_doc: DocId,
delete_bitset: &DeleteBitSet,
) -> Option<(u64, u64)> {
if max_doc == 0 {
None
}
else if !delete_bitset.has_deletes() {
// no deleted documents,
} else if !delete_bitset.has_deletes() {
// no deleted documents,
// we can use the previous min_val, max_val.
Some((u64_reader.min_value(), u64_reader.max_value()))
}
else {
} else {
// some deleted documents,
// we need to recompute the max / min
(0..max_doc)
@@ -70,87 +49,131 @@ fn compute_min_max_val(u64_reader: &U64FastFieldReader, max_doc: DocId, delete_b
}
}
fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option<U64FastFieldReader> {
fn extract_fieldnorm_reader(
segment_reader: &SegmentReader,
field: Field,
) -> Option<U64FastFieldReader> {
segment_reader.get_fieldnorms_reader(field)
}
fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option<U64FastFieldReader> {
segment_reader
.fast_fields_reader()
.open_reader(field)
fn extract_fast_field_reader(
segment_reader: &SegmentReader,
field: Field,
) -> Option<U64FastFieldReader> {
segment_reader.get_fast_field_reader(field).ok()
}
/// Reusable scratch space for converting a list of positions into
/// the list of gaps between consecutive positions.
struct DeltaComputer {
    buffer: Vec<u32>,
}

impl DeltaComputer {
    /// Builds a `DeltaComputer` with 512 zero-initialised scratch slots.
    fn new() -> DeltaComputer {
        let buffer = vec![0u32; 512];
        DeltaComputer { buffer }
    }

    /// Computes the delta between each position and the one before it
    /// (the first position is compared against `0`).
    ///
    /// The result lives in the internal buffer and has exactly
    /// `positions.len()` entries. The subtraction is a plain `u32`
    /// subtraction, so `positions` is expected to be non-decreasing.
    fn compute_delta(&mut self, positions: &[u32]) -> &[u32] {
        let num_positions = positions.len();
        if self.buffer.len() < num_positions {
            // Grow the scratch space so that every delta has a slot.
            self.buffer.resize(num_positions, 0u32);
        }
        let mut previous = 0u32;
        self.buffer
            .iter_mut()
            .zip(positions)
            .for_each(|(slot, &position)| {
                *slot = position - previous;
                previous = position;
            });
        &self.buffer[..num_positions]
    }
}
impl IndexMerger {
pub fn open(schema: Schema, segments: &[Segment]) -> Result<IndexMerger> {
let mut readers = vec!();
let mut readers = vec![];
let mut max_doc: u32 = 0u32;
for segment in segments {
if segment.meta().num_docs() > 0 {
let reader = SegmentReader::open(segment.clone())?;
let reader = SegmentReader::open(segment)?;
max_doc += reader.num_docs();
readers.push(reader);
}
}
Ok(IndexMerger {
schema: schema,
readers: readers,
max_doc: max_doc,
schema,
readers,
max_doc,
})
}
fn write_fieldnorms(&self,
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
#[inline(never)]
fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
let fieldnorm_fastfields: Vec<Field> = self.schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fieldnorm_fastfields, &extract_fieldnorm_reader, fast_field_serializer)
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(
fieldnorm_fastfields,
&extract_fieldnorm_reader,
fast_field_serializer,
)
}
#[inline(never)]
fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
let fast_fields: Vec<Field> = self.schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_int_fast())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fast_fields, &extract_fast_field_reader, fast_field_serializer)
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_int_fast())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(
fast_fields,
&extract_fast_field_reader,
fast_field_serializer,
)
}
// used both to merge field norms and regular u64 fast fields.
fn generic_write_fast_field(&self,
fn generic_write_fast_field(
&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U64FastFieldReader>,
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> {
for field in fields {
let mut u64_readers = vec!();
let mut u64_readers = vec![];
let mut min_val = u64::max_value();
let mut max_val = u64::min_value();
for reader in &self.readers {
match field_reader_extractor(reader, field) {
Some(u64_reader) => {
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset()) {
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(
&u64_reader,
reader.max_doc(),
reader.delete_bitset(),
) {
// the segment has some non-deleted documents
min_val = min(min_val, seg_min_val);
max_val = max(max_val, seg_max_val);
u64_readers.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
}
u64_readers.push((
reader.max_doc(),
u64_reader,
reader.delete_bitset(),
));
}
}
None => {
let error_msg = format!("Failed to find a u64_reader for field {:?}", field);
let error_msg =
format!("Failed to find a u64_reader for field {:?}", field);
error!("{}", error_msg);
return Err(Error::SchemaError(error_msg))
bail!(ErrorKind::SchemaError(error_msg));
}
}
}
if u64_readers.is_empty() {
@@ -160,51 +183,65 @@ impl IndexMerger {
}
assert!(min_val <= max_val);
try!(fast_field_serializer.new_u64_fast_field(field, min_val, max_val));
let mut fast_single_field_serializer =
fast_field_serializer.new_u64_fast_field(field, min_val, max_val)?;
for (max_doc, u64_reader, delete_bitset) in u64_readers {
for doc_id in 0..max_doc {
if !delete_bitset.is_deleted(doc_id) {
let val = u64_reader.get(doc_id);
try!(fast_field_serializer.add_val(val));
fast_single_field_serializer.add_val(val)?;
}
}
}
try!(fast_field_serializer.close_field());
fast_single_field_serializer.close_field()?;
}
Ok(())
}
fn write_postings(
&self,
postings_serializer: &mut PostingsSerializer) -> Result<()> {
let mut merged_terms = TermIterator::from(&self.readers[..]);
let mut delta_position_computer = DeltaPositionComputer::new();
let mut max_doc = 0;
#[inline(never)]
fn write_postings(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
let mut delta_computer = DeltaComputer::new();
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> = Vec::with_capacity(self.readers.len());
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
}
else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
}
let mut indexed_fields = vec![];
for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
if field_entry.is_indexed() {
indexed_fields.push(Field(field_ord as u32));
}
merged_doc_id_map.push(segment_local_map);
}
let mut last_field: Option<Field> = None;
while merged_terms.advance() {
for indexed_field in indexed_fields {
let field_readers = self.readers
.iter()
.map(|reader| reader.inverted_index(indexed_field))
.collect::<Vec<_>>();
let field_term_streams = field_readers
.iter()
.map(|field_reader| field_reader.terms().stream())
.collect();
let mut merged_terms = TermMerger::new(field_term_streams);
let mut max_doc = 0;
// map from segment doc ids to the resulting merged segment doc id.
let mut merged_doc_id_map: Vec<Vec<Option<DocId>>> =
Vec::with_capacity(self.readers.len());
for reader in &self.readers {
let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize);
for doc_id in 0..reader.max_doc() {
if reader.is_deleted(doc_id) {
segment_local_map.push(None);
} else {
segment_local_map.push(Some(max_doc));
max_doc += 1u32;
}
}
merged_doc_id_map.push(segment_local_map);
}
// Create the total list of doc ids
// by stacking the doc ids from the different segment.
//
@@ -212,74 +249,103 @@ impl IndexMerger {
// segment are stacked so that :
// - Segment 0's doc ids become doc id [0, seg.max_doc]
// - Segment 1's doc ids become [seg0.max_doc, seg0.max_doc + seg.max_doc]
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc + seg2.max_doc]
// - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc,
// seg0.max_doc + seg1.max_doc + seg2.max_doc]
// ...
let term = merged_terms.term();
let mut term_written = false;
let segment_postings = merged_terms
.segment_ords()
.iter()
.cloned()
.flat_map(|segment_ord| {
self.readers[segment_ord]
.read_postings_all_info(&term)
.map(|segment_postings| (segment_ord, segment_postings))
})
.collect::<Vec<_>>();
// We can remove the term if all documents which
// contained it have been deleted.
if segment_postings.len() > 0 {
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
while segment_postings.advance() {
if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] {
if !term_written {
let current_field = term.field();
if last_field != Some(current_field) {
postings_serializer.new_field(current_field);
last_field = Some(current_field);
}
let mut field_serializer = serializer.new_field(indexed_field)?;
let field_entry = self.schema.get_field_entry(indexed_field);
// ... retrieve the index record option configured for this field.
let segment_postings_option =
field_entry.field_type().get_index_record_option().expect(
"Encountered a field that is not supposed to be
indexed. Have you modified the schema?",
);
while merged_terms.advance() {
let term = Term::wrap(merged_terms.key());
// Let's compute the list of non-empty posting lists
let segment_postings: Vec<_> = merged_terms
.current_kvs()
.iter()
.flat_map(|heap_item| {
let segment_ord = heap_item.segment_ord;
let term_info = heap_item.streamer.value();
let segment_reader = &self.readers[heap_item.segment_ord];
let inverted_index = segment_reader.inverted_index(term.field());
let mut segment_postings = inverted_index
.read_postings_from_terminfo(term_info, segment_postings_option);
if segment_postings.advance() {
Some((segment_ord, segment_postings))
} else {
None
}
})
.collect();
// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.
//
// These segments are non-empty and advance has already been called.
if !segment_postings.is_empty() {
// If not, the `term` will be entirely removed.
// We know that there is at least one document containing
// the term, so we add it.
field_serializer.new_term(term.as_ref())?;
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
loop {
// `.advance()` has been called once before the loop.
// Hence we cannot use a `while segment_postings.advance()` loop.
if let Some(remapped_doc_id) =
old_to_new_doc_id[segment_postings.doc() as usize]
{
// we make sure to only write the term iff
// there is at least one document.
postings_serializer.new_term(term.as_slice())?;
term_written = true;
}
let delta_positions: &[u32] =
delta_position_computer.compute_delta_positions(segment_postings.positions());
try!(postings_serializer.write_doc(
let positions: &[u32] = segment_postings.positions();
let term_freq = segment_postings.term_freq();
let delta_positions = delta_computer.compute_delta(positions);
field_serializer.write_doc(
remapped_doc_id,
segment_postings.term_freq(),
delta_positions));
term_freq,
delta_positions,
)?;
}
if !segment_postings.advance() {
break;
}
}
}
}
if term_written {
try!(postings_serializer.close_term());
// closing the term.
field_serializer.close_term()?;
}
}
field_serializer.close()?;
}
Ok(())
}
#[inline(never)]
fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> {
for reader in &self.readers {
let store_reader = reader.get_store_reader();
for doc_id in 0..reader.max_doc() {
if !reader.is_deleted(doc_id) {
let doc = try!(store_reader.get(doc_id));
let field_values: Vec<&FieldValue> = doc.field_values()
.iter()
.collect();
try!(store_writer.store(&field_values));
let doc = store_reader.get(doc_id)?;
let field_values: Vec<&FieldValue> = doc.field_values().iter().collect();
store_writer.store(&field_values)?;
}
}
}
}
Ok(())
}
@@ -287,11 +353,11 @@ impl IndexMerger {
impl SerializableSegment for IndexMerger {
fn write(&self, mut serializer: SegmentSerializer) -> Result<u32> {
try!(self.write_postings(serializer.get_postings_serializer()));
try!(self.write_fieldnorms(serializer.get_fieldnorms_serializer()));
try!(self.write_fast_fields(serializer.get_fast_field_serializer()));
try!(self.write_storable_fields(serializer.get_store_writer()));
try!(serializer.close());
self.write_postings(serializer.get_postings_serializer())?;
self.write_fieldnorms(serializer.get_fieldnorms_serializer())?;
self.write_fast_fields(serializer.get_fast_field_serializer())?;
self.write_storable_fields(serializer.get_store_writer())?;
serializer.close()?;
Ok(self.max_doc)
}
}
@@ -301,8 +367,9 @@ mod tests {
use schema;
use schema::Document;
use schema::Term;
use schema::TextFieldIndexing;
use query::TermQuery;
use schema::{Field, FieldValue};
use schema::Field;
use core::Index;
use fastfield::U64FastFieldReader;
use Searcher;
@@ -310,16 +377,19 @@ mod tests {
use collector::tests::FastFieldTestCollector;
use collector::tests::TestCollector;
use query::BooleanQuery;
use postings::SegmentPostingsOption;
use schema::TextIndexingOptions;
use schema::IndexRecordOption;
use futures::Future;
#[test]
fn test_index_merger_no_deletes() {
let mut schema_builder = schema::SchemaBuilder::default();
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
.set_stored();
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
@@ -368,11 +438,14 @@ mod tests {
}
}
{
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
}
{
@@ -385,14 +458,22 @@ mod tests {
collector.docs()
};
{
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec!(1, 2, 4,));
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec!(0, 3,));
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
vec!(4,));
assert_eq!(get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec!(0, 1, 2, 3, 4,));
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2, 4]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0, 3]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
vec![4]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2, 3, 4]
);
}
{
let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
@@ -421,15 +502,17 @@ mod tests {
assert!(searcher.search(&query, &mut collector).is_ok());
collector.vals()
};
assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
vec!(5, 7, 13,));
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
vec![5, 7, 13]
);
}
}
}
fn search_term(searcher: &Searcher, term: Term) -> Vec<u64> {
fn search_term(searcher: &Searcher, term: Term) -> Vec<u64> {
let mut collector = FastFieldTestCollector::for_field(Field(1));
let term_query = TermQuery::new(term, SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
searcher.search(&term_query, &mut collector).unwrap();
collector.vals()
}
@@ -437,9 +520,10 @@ mod tests {
#[test]
fn test_index_merger_with_deletes() {
let mut schema_builder = schema::SchemaBuilder::default();
let text_fieldtype = schema::TextOptions
::default()
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast();
@@ -448,21 +532,19 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let empty_vec = Vec::<u64>::new();
{ // a first commit
index_writer.add_document(
doc!(
{
// a first commit
index_writer.add_document(doc!(
text_field => "a b d",
score_field => 1u64
));
index_writer.add_document(
doc!(
index_writer.add_document(doc!(
text_field => "b c",
score_field => 2u64
));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(
doc!(
index_writer.add_document(doc!(
text_field => "c d",
score_field => 3u64
));
@@ -472,31 +554,40 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!(1));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!(1));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(1, 3));
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
vec![1]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
vec![1]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![1, 3]
);
}
{ // a second commit
index_writer.add_document(
doc!(
{
// a second commit
index_writer.add_document(doc!(
text_field => "a d e",
score_field => 4_000u64
));
index_writer.add_document(
doc!(
index_writer.add_document(doc!(
text_field => "e f",
score_field => 5_000u64
));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(
doc!(
index_writer.add_document(doc!(
text_field => "f g",
score_field => 6_000u64
));
index_writer.add_document(
doc!(
index_writer.add_document(doc!(
text_field => "g h",
score_field => 7_000u64
));
@@ -510,71 +601,154 @@ mod tests {
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 1);
assert_eq!(score_field_reader.max_value(), 3);
let score_field_reader: U64FastFieldReader = searcher.segment_reader(1).get_fast_field_reader(score_field).unwrap();
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(1)
.get_fast_field_reader(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 4000);
assert_eq!(score_field_reader.max_value(), 7000);
}
{ // merging the segments
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
index_writer.merge(&segment_ids)
.wait()
.expect("Merging failed");
{
// merging the segments
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
vec![3]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 3);
assert_eq!(score_field_reader.max_value(), 7000);
}
{
{
// test a commit with only deletes
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 3);
assert_eq!(score_field_reader.max_value(), 7000);
}
{ // Test merging a single segment in order to remove deletes.
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
index_writer.merge(&segment_ids)
.wait()
.expect("Merging failed");
{
// Test merging a single segment in order to remove deletes.
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
@@ -582,31 +756,57 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 2);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "a")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "b")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "c")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "d")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "e")),
empty_vec
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "f")),
vec![6_000]
);
assert_eq!(
search_term(&searcher, Term::from_field_text(text_field, "g")),
vec![6_000, 7_000]
);
let score_field_reader: U64FastFieldReader = searcher
.segment_reader(0)
.get_fast_field_reader(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 6000);
assert_eq!(score_field_reader.max_value(), 7000);
}
{ // Test removing all docs
{
// Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g"));
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
index_writer.merge(&segment_ids)
.wait()
.expect("Merging failed");
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 0);
}
}
}

View File

@@ -19,8 +19,8 @@ pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_writer::SegmentWriter;
pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_policy::{NoMergePolicy, MergeCandidate, MergePolicy};
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::segment_manager::SegmentManager;
/// Alias for the default merge policy, which is the LogMergePolicy.
/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;

View File

@@ -1,7 +1,6 @@
use schema::Document;
use schema::Term;
/// Timestamped Delete operation.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct DeleteOperation {

View File

@@ -4,15 +4,14 @@ use indexer::delete_queue::DeleteCursor;
use core::SegmentId;
use std::fmt;
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum SegmentState {
Ready,
InMerge,
InMerge,
}
impl SegmentState {
pub fn letter_code(&self,) -> char {
pub fn letter_code(&self) -> char {
match *self {
SegmentState::InMerge => 'M',
SegmentState::Ready => 'R',
@@ -20,49 +19,46 @@ impl SegmentState {
}
}
/// A segment entry describes the state of
/// A segment entry describes the state of
/// a given segment, at a given instant.
///
/// In addition to segment meta,
/// In addition to segment `meta`,
/// it contains a few transient states
/// - state expresses whether the segment is already in the
/// - `state` expresses whether the segment is already in the
/// middle of a merge
/// - delete_bitset is a bitset describing
/// - `delete_bitset` is a bitset describing
/// documents that were deleted during the commit
/// itself.
/// - Delete cursor, is the position in the delete queue.
/// - `delete_cursor` is the position in the delete queue.
/// Deletes happening before the cursor are reflected either
/// in the .del file or in the delete_bitset.
/// in the .del file or in the `delete_bitset`.
#[derive(Clone)]
pub struct SegmentEntry {
/// Metadata of the segment; the segment id is read from it.
meta: SegmentMeta,
/// Whether the segment is `Ready` or currently `InMerge`.
state: SegmentState,
/// Bitset of the `DocId`s flagged as deleted, if one was supplied.
delete_bitset: Option<BitSet>,
/// Position of this segment in the delete queue.
delete_cursor: DeleteCursor,
}
impl SegmentEntry {
/// Create a new `SegmentEntry`
pub fn new(segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>) -> SegmentEntry {
pub fn new(
segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>,
) -> SegmentEntry {
SegmentEntry {
meta: segment_meta,
state: SegmentState::Ready,
delete_bitset: delete_bitset,
delete_cursor: delete_cursor,
delete_bitset,
delete_cursor,
}
}
/// Return a reference to the segment entry deleted bitset.
///
/// `DocId` in this bitset are flagged as deleted.
pub fn delete_bitset(&self,) -> Option<&BitSet> {
pub fn delete_bitset(&self) -> Option<&BitSet> {
self.delete_bitset.as_ref()
}
@@ -71,13 +67,12 @@ impl SegmentEntry {
self.meta = segment_meta;
}
/// Return a mutable reference to the segment entry's delete cursor.
pub fn delete_cursor(&mut self) -> &mut DeleteCursor {
&mut self.delete_cursor
}
/// Return the `SegmentEntry`.
/// Return the `SegmentEntry`.
///
/// The state describes whether the segment is available for
/// a merge or not.
@@ -89,19 +84,17 @@ impl SegmentEntry {
pub fn segment_id(&self) -> SegmentId {
self.meta.id()
}
/// Accessor to the `SegmentMeta`
pub fn meta(&self) -> &SegmentMeta {
&self.meta
}
/// Mark the `SegmentEntry` as in merge.
///
/// Only segments that are not already
/// Only segments that are not already
/// in a merge are eligible for future merge.
pub fn start_merge(&mut self,) {
pub fn start_merge(&mut self) {
self.state = SegmentState::InMerge;
}
@@ -110,14 +103,13 @@ impl SegmentEntry {
/// If a merge fails, it is important to switch
/// the segment back to an idle state, so that it
/// may be eligible for future merges.
pub fn cancel_merge(&mut self,) {
pub fn cancel_merge(&mut self) {
self.state = SegmentState::Ready;
}
/// Returns true iff a segment should
/// be considered for a merge.
pub fn is_ready(&self,) -> bool {
pub fn is_ready(&self) -> bool {
self.state == SegmentState::Ready
}
}

View File

@@ -1,7 +1,7 @@
use super::segment_register::SegmentRegister;
use std::sync::RwLock;
use core::SegmentMeta;
use core::{META_FILEPATH, LOCKFILE_FILEPATH};
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
use core::SegmentId;
use indexer::SegmentEntry;
use std::path::PathBuf;
@@ -14,15 +14,13 @@ use indexer::delete_queue::DeleteCursor;
struct SegmentRegisters {
uncommitted: SegmentRegister,
committed: SegmentRegister,
writing: HashSet<SegmentId>,
writing: HashSet<SegmentId>,
}
/// The segment manager stores the list of segments
/// as well as their state.
///
/// It guarantees the atomicity of the
/// It guarantees the atomicity of the
/// changes (merges especially)
#[derive(Default)]
pub struct SegmentManager {
@@ -32,19 +30,29 @@ pub struct SegmentManager {
impl Debug for SegmentManager {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
let lock = self.read();
write!(f, "{{ uncommitted: {:?}, committed: {:?} }}", lock.uncommitted, lock.committed)
write!(
f,
"{{ uncommitted: {:?}, committed: {:?} }}",
lock.uncommitted, lock.committed
)
}
}
pub fn get_mergeable_segments(segment_manager: &SegmentManager,) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
pub fn get_mergeable_segments(
segment_manager: &SegmentManager,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read();
(registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments())
(
registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments(),
)
}
impl SegmentManager {
pub fn from_segments(segment_metas: Vec<SegmentMeta>, delete_cursor: DeleteCursor) -> SegmentManager {
pub fn from_segments(
segment_metas: Vec<SegmentMeta>,
delete_cursor: &DeleteCursor,
) -> SegmentManager {
SegmentManager {
registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(),
@@ -55,20 +63,14 @@ impl SegmentManager {
}
/// Returns all of the segment entries (committed or uncommitted)
pub fn segment_entries(&self,) -> Vec<SegmentEntry> {
let mut segment_entries = self.read()
.uncommitted
.segment_entries();
segment_entries.extend(
self.read()
.committed
.segment_entries()
);
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
let mut segment_entries = self.read().uncommitted.segment_entries();
segment_entries.extend(self.read().committed.segment_entries());
segment_entries
}
/// Returns the overall number of segments in the `SegmentManager`
pub fn num_segments(&self,) -> usize {
pub fn num_segments(&self) -> usize {
let registers_lock = self.read();
registers_lock.committed.len() + registers_lock.uncommitted.len()
}
@@ -78,19 +80,14 @@ impl SegmentManager {
let mut files = HashSet::new();
files.insert(META_FILEPATH.clone());
files.insert(LOCKFILE_FILEPATH.clone());
let segment_metas: Vec<SegmentMeta> =
registers_lock.committed
.get_all_segments()
.into_iter()
.chain(registers_lock.uncommitted
.get_all_segments()
.into_iter())
.chain(registers_lock.writing
.iter()
.cloned()
.map(SegmentMeta::new))
.collect();
let segment_metas: Vec<SegmentMeta> = registers_lock
.committed
.get_all_segments()
.into_iter()
.chain(registers_lock.uncommitted.get_all_segments().into_iter())
.chain(registers_lock.writing.iter().cloned().map(SegmentMeta::new))
.collect();
for segment_meta in segment_metas {
files.extend(segment_meta.list_files());
}
@@ -102,18 +99,22 @@ impl SegmentManager {
registers
.committed
.segment_entry(segment_id)
.or_else(|| registers.uncommitted.segment_entry(segment_id))
.or_else(|| registers.uncommitted.segment_entry(segment_id))
}
// Lock poisoning should never happen :
// The lock is acquired and released within this class,
// and the operations cannot panic.
fn read(&self,) -> RwLockReadGuard<SegmentRegisters> {
self.registers.read().expect("Failed to acquire read lock on SegmentManager.")
// and the operations cannot panic.
fn read(&self) -> RwLockReadGuard<SegmentRegisters> {
self.registers
.read()
.expect("Failed to acquire read lock on SegmentManager.")
}
fn write(&self,) -> RwLockWriteGuard<SegmentRegisters> {
self.registers.write().expect("Failed to acquire write lock on SegmentManager.")
fn write(&self) -> RwLockWriteGuard<SegmentRegisters> {
self.registers
.write()
.expect("Failed to acquire write lock on SegmentManager.")
}
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
@@ -124,42 +125,44 @@ impl SegmentManager {
registers_lock.committed.add_segment_entry(segment_entry);
}
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) {
let mut registers_lock = self.write();
if registers_lock.uncommitted.contains_all(segment_ids) {
for segment_id in segment_ids {
registers_lock.uncommitted.start_merge(segment_id);
}
}
else if registers_lock.committed.contains_all(segment_ids) {
} else if registers_lock.committed.contains_all(segment_ids) {
for segment_id in segment_ids {
registers_lock.committed.start_merge(segment_id);
}
}
else {
} else {
error!("Merge operation sent for segments that are not all uncommited or commited.");
}
}
pub fn cancel_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId) {
pub fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId,
) {
let mut registers_lock = self.write();
// we mark all segments as ready for merge.
{
let target_segment_register: &mut SegmentRegister;
target_segment_register = {
if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.uncommitted
}
else if registers_lock.committed.contains_all(&before_merge_segment_ids) {
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.committed
}
else {
} else {
warn!("couldn't find segment in SegmentManager");
return;
}
@@ -174,7 +177,6 @@ impl SegmentManager {
registers_lock.writing.remove(&after_merge_segment_id);
}
pub fn write_segment(&self, segment_id: SegmentId) {
let mut registers_lock = self.write();
registers_lock.writing.insert(segment_id);
@@ -185,19 +187,27 @@ impl SegmentManager {
registers_lock.writing.remove(&segment_entry.segment_id());
registers_lock.uncommitted.add_segment_entry(segment_entry);
}
pub fn end_merge(&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry) {
pub fn end_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry,
) {
let mut registers_lock = self.write();
registers_lock.writing.remove(&after_merge_segment_entry.segment_id());
let mut target_register: &mut SegmentRegister = {
if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) {
registers_lock
.writing
.remove(&after_merge_segment_entry.segment_id());
let target_register: &mut SegmentRegister = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.uncommitted
}
else if registers_lock.committed.contains_all(&before_merge_segment_ids) {
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");
@@ -208,13 +218,9 @@ impl SegmentManager {
target_register.remove_segment(segment_id);
}
target_register.add_segment_entry(after_merge_segment_entry);
}
pub fn committed_segment_metas(&self,) -> Vec<SegmentMeta> {
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
let registers_lock = self.read();
registers_lock.committed.segment_metas()
}

View File

@@ -9,32 +9,29 @@ use indexer::delete_queue::DeleteCursor;
/// The segment register keeps track
/// of the list of segments, their size as well
/// as the state they are in.
///
/// It is consumed by indexes to get the list of
///
/// It is consumed by indexes to get the list of
/// segments that are currently searchable,
/// and by the index merger to identify
/// and by the index merger to identify
/// merge candidates.
#[derive(Default)]
pub struct SegmentRegister {
segment_states: HashMap<SegmentId, SegmentEntry>,
segment_states: HashMap<SegmentId, SegmentEntry>,
}
impl Debug for SegmentRegister {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
try!(write!(f, "SegmentRegister("));
write!(f, "SegmentRegister(")?;
for (k, v) in &self.segment_states {
try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code()));
write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code())?;
}
try!(write!(f, ")"));
write!(f, ")")?;
Ok(())
}
}
impl SegmentRegister {
pub fn clear(&mut self,) {
pub fn clear(&mut self) {
self.segment_states.clear();
}
@@ -42,29 +39,26 @@ impl SegmentRegister {
self.segment_states.len()
}
pub fn get_all_segments(&self,) -> Vec<SegmentMeta> {
pub fn get_all_segments(&self) -> Vec<SegmentMeta> {
self.segment_states
.values()
.map(|segment_entry| segment_entry.meta().clone())
.collect()
}
pub fn get_mergeable_segments(&self,) -> Vec<SegmentMeta> {
pub fn get_mergeable_segments(&self) -> Vec<SegmentMeta> {
self.segment_states
.values()
.filter(|segment_entry| segment_entry.is_ready())
.map(|segment_entry| segment_entry.meta().clone())
.collect()
}
pub fn segment_entries(&self,) -> Vec<SegmentEntry> {
self.segment_states
.values()
.cloned()
.collect()
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
self.segment_states.values().cloned().collect()
}
pub fn segment_metas(&self,) -> Vec<SegmentMeta> {
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
.values()
.map(|segment_entry| segment_entry.meta().clone())
@@ -72,28 +66,26 @@ impl SegmentRegister {
segment_ids.sort_by_key(|meta| meta.id());
segment_ids
}
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
self.segment_states
.get(&segment_id)
.map(|segment_entry| segment_entry.clone())
self.segment_states.get(segment_id).cloned()
}
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
segment_ids
.iter()
.all(|segment_id| self.segment_states.contains_key(segment_id))
}
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
let segment_id = segment_entry.segment_id();
self.segment_states.insert(segment_id, segment_entry);
}
pub fn remove_segment(&mut self, segment_id: &SegmentId) {
self.segment_states.remove(segment_id);
}
}
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
@@ -106,25 +98,19 @@ impl SegmentRegister {
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.start_merge();
}
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: DeleteCursor) -> SegmentRegister {
}
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
let mut segment_states = HashMap::new();
for segment_meta in segment_metas {
let segment_id = segment_meta.id();
let segment_entry = SegmentEntry::new(
segment_meta,
delete_cursor.clone(),
None);
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone(), None);
segment_states.insert(segment_id, segment_entry);
}
SegmentRegister {
segment_states: segment_states
}
SegmentRegister { segment_states }
}
}
#[cfg(test)]
mod tests {
use indexer::SegmentState;
@@ -140,7 +126,7 @@ mod tests {
.map(|segment_meta| segment_meta.id())
.collect()
}
#[test]
fn test_segment_register() {
let delete_queue = DeleteQueue::new();
@@ -149,32 +135,56 @@ mod tests {
let segment_id_a = SegmentId::generate_random();
let segment_id_b = SegmentId::generate_random();
let segment_id_merged = SegmentId::generate_random();
{
let segment_meta = SegmentMeta::new(segment_id_a);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::Ready);
assert_eq!(segment_ids(&segment_register), vec!(segment_id_a));
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready
);
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
{
let segment_meta = SegmentMeta::new(segment_id_b);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::Ready);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready
);
segment_register.start_merge(&segment_id_a);
segment_register.start_merge(&segment_id_b);
assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::InMerge);
assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::InMerge);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge
);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge
);
segment_register.remove_segment(&segment_id_a);
segment_register.remove_segment(&segment_id_b);
{
let segment_meta_merged = SegmentMeta::new(segment_id_merged);
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_ids(&segment_register), vec!(segment_id_merged));
assert_eq!(segment_ids(&segment_register), vec![segment_id_merged]);
}
}
}

View File

@@ -4,8 +4,7 @@ use core::Segment;
use core::SegmentComponent;
use fastfield::FastFieldSerializer;
use store::StoreWriter;
use postings::PostingsSerializer;
use postings::InvertedIndexSerializer;
/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
@@ -13,31 +12,31 @@ pub struct SegmentSerializer {
store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer,
fieldnorms_serializer: FastFieldSerializer,
postings_serializer: PostingsSerializer,
postings_serializer: InvertedIndexSerializer,
}
impl SegmentSerializer {
/// Creates a new `SegmentSerializer`.
pub fn for_segment(segment: &mut Segment) -> Result<SegmentSerializer> {
let store_write = try!(segment.open_write(SegmentComponent::STORE));
let store_write = segment.open_write(SegmentComponent::STORE)?;
let fast_field_write = try!(segment.open_write(SegmentComponent::FASTFIELDS));
let fast_field_serializer = try!(FastFieldSerializer::new(fast_field_write));
let fast_field_write = segment.open_write(SegmentComponent::FASTFIELDS)?;
let fast_field_serializer = FastFieldSerializer::from_write(fast_field_write)?;
let fieldnorms_write = try!(segment.open_write(SegmentComponent::FIELDNORMS));
let fieldnorms_serializer = try!(FastFieldSerializer::new(fieldnorms_write));
let fieldnorms_write = segment.open_write(SegmentComponent::FIELDNORMS)?;
let fieldnorms_serializer = FastFieldSerializer::from_write(fieldnorms_write)?;
let postings_serializer = try!(PostingsSerializer::open(segment));
let postings_serializer = InvertedIndexSerializer::open(segment)?;
Ok(SegmentSerializer {
postings_serializer: postings_serializer,
postings_serializer,
store_writer: StoreWriter::new(store_write),
fast_field_serializer: fast_field_serializer,
fieldnorms_serializer: fieldnorms_serializer,
fast_field_serializer,
fieldnorms_serializer,
})
}
/// Accessor to the postings serializer.
pub fn get_postings_serializer(&mut self) -> &mut PostingsSerializer {
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer
}
@@ -57,11 +56,12 @@ impl SegmentSerializer {
}
/// Finalize the segment serialization.
#[inline(never)]
pub fn close(self) -> Result<()> {
try!(self.fast_field_serializer.close());
try!(self.postings_serializer.close());
try!(self.store_writer.close());
try!(self.fieldnorms_serializer.close());
self.fast_field_serializer.close()?;
self.postings_serializer.close()?;
self.store_writer.close()?;
self.fieldnorms_serializer.close()?;
Ok(())
}
}

View File

@@ -1,5 +1,3 @@
#![allow(for_kv_map)]
use core::Index;
use core::IndexMeta;
use core::META_FILEPATH;
@@ -9,19 +7,18 @@ use core::SegmentMeta;
use core::SerializableSegment;
use directory::Directory;
use indexer::stamper::Stamper;
use Error;
use error::{Error, ErrorKind, Result};
use futures_cpupool::CpuPool;
use futures::Future;
use futures::Canceled;
use futures::oneshot;
use directory::FileProtection;
use indexer::{MergePolicy, DefaultMergePolicy};
use indexer::{DefaultMergePolicy, MergePolicy};
use indexer::index_writer::advance_deletes;
use indexer::MergeCandidate;
use indexer::merger::IndexMerger;
use indexer::SegmentEntry;
use indexer::SegmentSerializer;
use Result;
use futures_cpupool::CpuFuture;
use serde_json;
use indexer::delete_queue::DeleteCursor;
@@ -32,13 +29,12 @@ use std::io::Write;
use std::mem;
use std::ops::DerefMut;
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, AtomicBool};
use std::sync::atomic::{AtomicBool, AtomicUsize};
use std::sync::atomic::Ordering;
use std::sync::RwLock;
use std::thread;
use std::thread::JoinHandle;
use super::segment_manager::{SegmentManager, get_mergeable_segments};
use super::segment_manager::{get_mergeable_segments, SegmentManager};
/// Save the index meta file.
/// This operation is atomic :
@@ -49,17 +45,12 @@ use super::segment_manager::{SegmentManager, get_mergeable_segments};
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(schema: Schema,
opstamp: u64,
directory: &mut Directory)
-> Result<()> {
save_metas(vec!(), schema, opstamp, directory)
pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
save_metas(vec![], schema, opstamp, directory)
}
/// Save the index meta file.
/// This operation is atomic :
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
@@ -67,93 +58,93 @@ pub fn save_new_metas(schema: Schema,
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_metas(segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
directory: &mut Directory)
-> Result<()> {
pub fn save_metas(
segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
directory: &mut Directory,
) -> Result<()> {
let metas = IndexMeta {
segments: segment_metas,
schema: schema,
opstamp: opstamp,
schema,
opstamp,
};
let mut w = try!(serde_json::to_vec_pretty(&metas));
try!(write!(&mut w, "\n"));
let res = directory.atomic_write(&META_FILEPATH, &w[..])?;
let mut buffer = serde_json::to_vec_pretty(&metas)?;
write!(&mut buffer, "\n")?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
Ok(res)
Ok(())
}
// The segment update runner is in charge of processing all
// of the `SegmentUpdate`s.
//
// All this processing happens on a single thread
// consuming a common queue.
// consuming a common queue.
#[derive(Clone)]
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
fn perform_merge(segment_ids: &[SegmentId],
segment_updater: &SegmentUpdater,
mut merged_segment: Segment,
target_opstamp: u64) -> Result<SegmentEntry> {
fn perform_merge(
segment_ids: &[SegmentId],
segment_updater: &SegmentUpdater,
mut merged_segment: Segment,
target_opstamp: u64,
) -> Result<SegmentEntry> {
// first we need to apply deletes to our segment.
info!("Start merge: {:?}", segment_ids);
let ref index = segment_updater.0.index;
let schema = index.schema();
let mut segment_entries = vec!();
let mut file_protections: Vec<FileProtection> = vec!();
let index = &segment_updater.0.index;
let schema = index.schema();
let mut segment_entries = vec![];
let mut file_protections: Vec<FileProtection> = vec![];
for segment_id in segment_ids {
if let Some(mut segment_entry) = segment_updater.0
.segment_manager
.segment_entry(segment_id) {
if let Some(mut segment_entry) = segment_updater.0.segment_manager.segment_entry(segment_id)
{
let segment = index.segment(segment_entry.meta().clone());
if let Some(file_protection) = advance_deletes(segment, &mut segment_entry, target_opstamp)? {
if let Some(file_protection) =
advance_deletes(segment, &mut segment_entry, target_opstamp)?
{
file_protections.push(file_protection);
}
segment_entries.push(segment_entry);
}
else {
error!("Error, had to abort merge as some of the segment is not managed anymore.a");
return Err(Error::InvalidArgument(format!("Segment {:?} requested for merge is not managed.", segment_id)));
} else {
error!("Error, had to abort merge as some of the segment is not managed anymore.");
let msg = format!(
"Segment {:?} requested for merge is not managed.",
segment_id
);
bail!(ErrorKind::InvalidArgument(msg));
}
}
let delete_cursor = segment_entries[0].delete_cursor().clone();
let segments: Vec<Segment> = segment_entries
.iter()
.map(|segment_entry| {
index.segment(segment_entry.meta().clone())
})
.map(|segment_entry| index.segment(segment_entry.meta().clone()))
.collect();
// An IndexMerger is like a "view" of our merged segments.
let merger: IndexMerger = IndexMerger::open(schema, &segments[..])?;
// ... we just serialize this index merger in our new segment
// to merge the two segments.
let segment_serializer =
SegmentSerializer::for_segment(&mut merged_segment)
let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)
.expect("Creating index serializer failed");
let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed");
let num_docs = merger
.write(segment_serializer)
.expect("Serializing merged index failed");
let mut segment_meta = SegmentMeta::new(merged_segment.id());
segment_meta.set_max_doc(num_docs);
let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
Ok(after_merge_segment_entry)
}
struct InnerSegmentUpdater {
pool: CpuPool,
index: Index,
@@ -162,30 +153,29 @@ struct InnerSegmentUpdater {
merging_thread_id: AtomicUsize,
merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
generation: AtomicUsize,
killed: AtomicBool,
killed: AtomicBool,
stamper: Stamper,
}
impl SegmentUpdater {
pub fn new(index: Index,
stamper: Stamper,
delete_cursor: DeleteCursor) -> Result<SegmentUpdater> {
pub fn new(
index: Index,
stamper: Stamper,
delete_cursor: &DeleteCursor,
) -> Result<SegmentUpdater> {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
Ok(
SegmentUpdater(Arc::new(InnerSegmentUpdater {
pool: CpuPool::new(1),
index: index,
segment_manager: segment_manager,
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper: stamper,
}))
)
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
pool: CpuPool::new(1),
index,
segment_manager,
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper,
})))
}
pub fn new_segment(&self) -> Segment {
@@ -200,21 +190,21 @@ impl SegmentUpdater {
}
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
*self.0.merge_policy.write().unwrap()= merge_policy;
*self.0.merge_policy.write().unwrap() = merge_policy;
}
fn get_merging_thread_id(&self) -> usize {
self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst)
}
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>(&self, f: F) -> CpuFuture<T, Error> {
fn run_async<T: 'static + Send, F: 'static + Send + FnOnce(SegmentUpdater) -> T>(
&self,
f: F,
) -> CpuFuture<T, Error> {
let me_clone = self.clone();
self.0.pool.spawn_fn(move || {
Ok(f(me_clone))
})
self.0.pool.spawn_fn(move || Ok(f(me_clone)))
}
pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
if generation >= self.0.generation.load(Ordering::Acquire) {
self.run_async(|segment_updater| {
@@ -223,33 +213,30 @@ impl SegmentUpdater {
true
}).forget();
true
}
else {
} else {
false
}
}
pub fn kill(&mut self,) {
pub fn kill(&mut self) {
self.0.killed.store(true, Ordering::Release);
}
fn is_alive(&self,) -> bool {
fn is_alive(&self) -> bool {
!self.0.killed.load(Ordering::Acquire)
}
/// Apply deletes up to the target opstamp to all segments.
///
/// The method returns copies of the segment entries,
/// updated with the delete information.
fn purge_deletes(&self, target_opstamp: u64) -> Result<Vec<SegmentEntry>> {
let mut segment_entries = self.0.segment_manager.segment_entries();
let mut segment_entries = self.0.segment_manager.segment_entries();
for segment_entry in &mut segment_entries {
let segment = self.0.index.segment(segment_entry.meta().clone());
advance_deletes(segment, segment_entry, target_opstamp)?;
}
Ok(segment_entries)
}
pub fn save_metas(&self, opstamp: u64) {
@@ -260,7 +247,8 @@ impl SegmentUpdater {
self.0.segment_manager.committed_segment_metas(),
index.schema(),
opstamp,
directory.box_clone().borrow_mut()).expect("Could not save metas.");
directory.box_clone().borrow_mut(),
).expect("Could not save metas.");
}
}
@@ -270,10 +258,12 @@ impl SegmentUpdater {
}).wait()
}
fn garbage_collect_files_exec(&self) {
let living_files = self.0.segment_manager.list_files();
fn garbage_collect_files_exec(&self) {
info!("Running garbage collection");
let mut index = self.0.index.clone();
index.directory_mut().garbage_collect(living_files);
index
.directory_mut()
.garbage_collect(|| self.0.segment_manager.list_files());
}
pub fn commit(&self, opstamp: u64) -> Result<()> {
@@ -290,28 +280,33 @@ impl SegmentUpdater {
}).wait()
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> impl Future<Item=SegmentMeta, Error=Canceled> {
pub fn start_merge(
&self,
segment_ids: &[SegmentId],
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
self.0.segment_manager.start_merge(segment_ids);
let segment_updater_clone = self.clone();
let segment_ids_vec = segment_ids.to_vec();
let segment_ids_vec = segment_ids.to_vec();
let merging_thread_id = self.get_merging_thread_id();
let (merging_future_send, merging_future_recv) = oneshot();
if segment_ids.is_empty() {
return merging_future_recv;
}
let target_opstamp = self.0.stamper.stamp();
let merging_join_handle = thread::spawn(move || {
// first we need to apply deletes to our segment.
let merged_segment = segment_updater_clone.new_segment();
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge(&segment_ids_vec, &segment_updater_clone, merged_segment, target_opstamp);
let merge_result = perform_merge(
&segment_ids_vec,
&segment_updater_clone,
merged_segment,
target_opstamp,
);
match merge_result {
Ok(after_merge_segment_entry) => {
@@ -319,11 +314,11 @@ impl SegmentUpdater {
segment_updater_clone
.end_merge(segment_ids_vec, after_merge_segment_entry)
.expect("Segment updater thread is corrupted.");
// the future may fail if the listener of the oneshot future
// the future may fail if the listener of the oneshot future
// has been destroyed.
//
// This is not a problem here, so we just ignore any
// This is not a problem here, so we just ignore any
// possible error.
let _merging_future_res = merging_future_send.send(merged_segment_meta);
}
@@ -337,16 +332,25 @@ impl SegmentUpdater {
// merging_future_send will be dropped, sending an error to the future.
}
}
segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id);
segment_updater_clone
.0
.merging_threads
.write()
.unwrap()
.remove(&merging_thread_id);
Ok(())
});
self.0.merging_threads.write().unwrap().insert(merging_thread_id, merging_join_handle);
self.0
.merging_threads
.write()
.unwrap()
.insert(merging_thread_id, merging_join_handle);
merging_future_recv
}
fn consider_merge_options(&self) {
let (committed_segments, uncommitted_segments) = get_mergeable_segments(&self.0.segment_manager);
let (committed_segments, uncommitted_segments) =
get_mergeable_segments(&self.0.segment_manager);
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
let merge_policy = self.get_merge_policy();
@@ -358,68 +362,85 @@ impl SegmentUpdater {
}
}
fn cancel_merge(&self,
fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentId) {
self.0.segment_manager.cancel_merge(&before_merge_segment_ids, after_merge_segment_entry);
after_merge_segment_entry: SegmentId,
) {
self.0
.segment_manager
.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
}
fn end_merge(&self,
fn end_merge(
&self,
before_merge_segment_ids: Vec<SegmentId>,
mut after_merge_segment_entry: SegmentEntry) -> Result<()> {
mut after_merge_segment_entry: SegmentEntry,
) -> Result<()> {
self.run_async(move |segment_updater| {
debug!("End merge {:?}", after_merge_segment_entry.meta());
info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
let mut _file_protection_opt = None;
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.0.index.opstamp();
if delete_operation.opstamp < committed_opstamp {
let segment = segment_updater.0.index.segment(after_merge_segment_entry.meta().clone());
match advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp) {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
match advance_deletes(
segment,
&mut after_merge_segment_entry,
committed_opstamp,
) {
Ok(file_protection_opt_res) => {
_file_protection_opt = file_protection_opt_res;
}
Err(e) => {
error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}", before_merge_segment_ids, e);
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
before_merge_segment_ids, e
);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
segment_updater.cancel_merge(&before_merge_segment_ids, after_merge_segment_entry.segment_id());
segment_updater.cancel_merge(
&before_merge_segment_ids,
after_merge_segment_entry.segment_id(),
);
return;
}
}
}
}
segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
segment_updater
.0
.segment_manager
.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
segment_updater.consider_merge_options();
info!("save metas");
segment_updater.save_metas(segment_updater.0.index.opstamp());
segment_updater.garbage_collect_files_exec();
}).wait()
}
/// Wait for current merging threads.
///
/// Upon termination of the current merging threads,
/// merge opportunity may appear.
//
//
/// We keep waiting until the merge policy judges that
/// no opportunity is available.
///
/// Note that it is not required to call this
/// Note that it is not required to call this
/// method in your application.
/// Terminating your application without letting
/// Terminating your application without letting
/// merge terminate is perfectly safe.
///
///
/// Obsolete files will eventually be cleaned up
/// by the directory garbage collector.
pub fn wait_merging_thread(&self) -> Result<()> {
let mut num_segments: usize;
loop {
num_segments = self.0.segment_manager.num_segments();
let mut new_merging_threads = HashMap::new();
@@ -432,9 +453,7 @@ impl SegmentUpdater {
merging_thread_handle
.join()
.map(|_| ())
.map_err(|_| {
Error::ErrorInThread("Merging thread failed.".to_string())
})?
.map_err(|_| ErrorKind::ErrorInThread("Merging thread failed.".into()))?;
}
// Our merging thread may have queued their completed
self.run_async(move |_| {}).wait()?;
@@ -444,15 +463,11 @@ impl SegmentUpdater {
if new_num_segments >= num_segments {
break;
}
}
}
Ok(())
}
}
#[cfg(test)]
mod tests {
@@ -467,7 +482,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.set_merge_policy(box MergeWheneverPossible);
@@ -479,7 +494,7 @@ mod tests {
}
assert!(index_writer.commit().is_ok());
}
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
@@ -487,7 +502,7 @@ mod tests {
}
assert!(index_writer.commit().is_ok());
}
{
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
@@ -504,8 +519,9 @@ mod tests {
assert_eq!(index.searcher().num_docs(), 302);
{
index_writer.wait_merging_threads()
.expect( "waiting for merging threads");
index_writer
.wait_merging_threads()
.expect("waiting for merging threads");
}
index.load_searchers().unwrap();

View File

@@ -1,7 +1,7 @@
use Result;
use DocId;
use std::io;
use schema::Schema;
use schema::Schema;
use schema::Term;
use core::Segment;
use core::SerializableSegment;
@@ -14,200 +14,232 @@ use datastruct::stacker::Heap;
use indexer::index_writer::MARGIN_IN_BYTES;
use super::operation::AddOperation;
use postings::MultiFieldPostingsWriter;
use tokenizer::BoxedTokenizer;
use schema::Value;
/// A `SegmentWriter` is in charge of creating a segment index from a
/// set of documents.
///
///
/// It creates the postings lists in anonymous memory.
/// The segment is laid on disk when the segment gets `finalized`.
pub struct SegmentWriter<'a> {
heap: &'a Heap,
heap: &'a Heap,
max_doc: DocId,
multifield_postings: MultiFieldPostingsWriter<'a>,
segment_serializer: SegmentSerializer,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FastFieldsWriter,
doc_opstamps: Vec<u64>,
multifield_postings: MultiFieldPostingsWriter<'a>,
segment_serializer: SegmentSerializer,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FastFieldsWriter,
doc_opstamps: Vec<u64>,
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
}
fn create_fieldnorms_writer(schema: &Schema) -> FastFieldsWriter {
let u64_fields: Vec<Field> = schema.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
FastFieldsWriter::new(u64_fields)
let u64_fields: Vec<Field> = schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
FastFieldsWriter::new(u64_fields)
}
impl<'a> SegmentWriter<'a> {
/// Creates a new `SegmentWriter`
///
/// The arguments are defined as follows
///
/// - heap: most of the segment writer data (terms, and postings lists recorders)
/// is stored in a user-defined heap object. This makes it possible for the user to define
/// the flushing behavior as a buffer limit
/// - segment: The segment being written
/// - schema
pub fn for_segment(heap: &'a Heap,
mut segment: Segment,
schema: &Schema) -> Result<SegmentWriter<'a>> {
let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
let multifield_postings = MultiFieldPostingsWriter::new(schema, heap);
Ok(SegmentWriter {
heap: heap,
max_doc: 0,
multifield_postings: multifield_postings,
fieldnorms_writer: create_fieldnorms_writer(schema),
segment_serializer: segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
})
}
/// Lay on disk the current content of the `SegmentWriter`
///
/// Finalize consumes the `SegmentWriter`, so that it cannot
/// be used afterwards.
pub fn finalize(self) -> Result<Vec<u64>> {
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer)?;
Ok(self.doc_opstamps)
}
/// Returns true iff the segment writer's buffer has reached capacity.
///
/// The limit is defined as `the user defined heap size - an arbitrary margin of 10MB`
/// The `Segment` is `finalize`d when the buffer gets full.
///
/// Because, we cannot cut through a document, the margin is there to ensure that we rarely
/// exceeds the heap size.
pub fn is_buffer_full(&self,) -> bool {
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
}
/// Return true if the term dictionary hashmap is reaching capacity.
/// It is one of the condition that triggers a `SegmentWriter` to
/// be finalized.
pub(crate) fn is_termdic_saturated(&self,) -> bool {
self.multifield_postings.is_termdic_saturated()
}
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
pub fn add_document(&mut self, add_operation: &AddOperation, schema: &Schema) -> io::Result<()> {
let doc_id = self.max_doc;
let doc = &add_operation.document;
self.doc_opstamps.push(add_operation.opstamp);
for (field, field_values) in doc.get_sorted_field_values() {
let field_options = schema.get_field_entry(field);
match *field_options.field_type() {
FieldType::Str(ref text_options) => {
let num_tokens: u32 =
if text_options.get_indexing_options().is_tokenized() {
self.multifield_postings.index_text(doc_id, field, &field_values)
}
else {
let num_field_values = field_values.len() as u32;
for field_value in field_values {
let term = Term::from_field_text(field, field_value.value().text());
self.multifield_postings.suscribe(doc_id, &term);
}
num_field_values
};
self.fieldnorms_writer
.get_field_writer(field)
.map(|field_norms_writer| {
field_norms_writer.add_val(num_tokens as u64)
});
}
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_u64(field_value.field(), field_value.value().u64_value());
self.multifield_postings.suscribe(doc_id, &term);
}
}
}
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(field_value.field(), field_value.value().i64_value());
self.multifield_postings.suscribe(doc_id, &term);
}
}
}
}
}
self.fieldnorms_writer.fill_val_up_to(doc_id);
self.fast_field_writers.add_document(&doc);
let stored_fieldvalues: Vec<&FieldValue> = doc
.field_values()
.iter()
.filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
.collect();
let doc_writer = self.segment_serializer.get_store_writer();
try!(doc_writer.store(&stored_fieldvalues));
self.max_doc += 1;
Ok(())
/// Builds a `SegmentWriter` that writes into `segment`.
///
/// # Arguments
///
/// - `heap`: user-defined heap in which most of the segment writer data
///   (terms, and postings lists recorders) is stored. Its size lets the
///   user define the flushing behavior as a buffer limit.
/// - `table_bits`: forwarded as-is to `MultiFieldPostingsWriter::new`
///   (presumably the log2 size of the term hash table -- TODO confirm).
/// - `segment`: the segment being written.
/// - `schema`: the schema used to set up the postings, fieldnorms and
///   fast field writers, and to pick one optional tokenizer per field.
///
/// # Errors
///
/// Returns an error if `SegmentSerializer::for_segment` fails.
pub fn for_segment(
    heap: &'a Heap,
    table_bits: usize,
    mut segment: Segment,
    schema: &Schema,
) -> Result<SegmentWriter<'a>> {
    let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
    let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap);
    // One slot per schema field, in schema order. Only text fields that
    // expose indexing options can get a tokenizer, which is looked up by
    // name among the tokenizers of the index; every other field gets `None`.
    let tokenizers = schema
        .fields()
        .iter()
        .map(|field_entry| {
            if let FieldType::Str(ref text_options) = *field_entry.field_type() {
                text_options
                    .get_indexing_options()
                    .and_then(|text_indexing| {
                        let tokenizer_name = &text_indexing.tokenizer();
                        segment.index().tokenizers().get(tokenizer_name)
                    })
            } else {
                None
            }
        })
        .collect();
    Ok(SegmentWriter {
        heap,
        max_doc: 0,
        multifield_postings,
        fieldnorms_writer: create_fieldnorms_writer(schema),
        segment_serializer,
        fast_field_writers: FastFieldsWriter::from_schema(schema),
        doc_opstamps: Vec::with_capacity(1_000),
        tokenizers,
    })
}
/// Max doc is
/// - the number of documents in the segment assuming there is no deletes
/// - the maximum document id (including deleted documents) + 1
///
/// Currently, **tantivy** does not handle deletes anyway,
/// so `max_doc == num_docs`
pub fn max_doc(&self,) -> u32 {
self.max_doc
}
/// Number of documents in the index.
/// Deleted documents are not counted.
///
/// Currently, **tantivy** does not handle deletes anyway,
/// so `max_doc == num_docs`
#[allow(dead_code)]
pub fn num_docs(&self,) -> u32 {
self.max_doc
}
/// Lay on disk the current content of the `SegmentWriter`
///
/// Finalize consumes the `SegmentWriter`, so that it cannot
/// be used afterwards.
///
/// The postings, fast fields and fieldnorms are handed to the private
/// `write` function together with the segment serializer. On success,
/// the opstamps recorded by `add_document` (one per added document,
/// in insertion order) are returned.
///
/// # Errors
///
/// Propagates any error returned by `write`.
pub fn finalize(self) -> Result<Vec<u64>> {
// The serializer is moved into `write`, while the other writers are
// only borrowed; `doc_opstamps` is moved out of `self` afterwards.
write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
)?;
Ok(self.doc_opstamps)
}
/// Returns true iff the segment writer's buffer has reached capacity.
///
/// The buffer is considered full as soon as the heap has no more than
/// `MARGIN_IN_BYTES` free bytes left, i.e. the limit is
/// `the user defined heap size - an arbitrary margin` (10MB according to
/// the original note -- check `MARGIN_IN_BYTES`).
/// The `Segment` is `finalize`d when the buffer gets full.
///
/// Because we cannot cut through a document, the margin is there to
/// ensure that we rarely exceed the heap size.
pub fn is_buffer_full(&self) -> bool {
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
}
/// Returns true if the term dictionary hashmap is reaching capacity.
/// It is one of the conditions that trigger a `SegmentWriter` to
/// be finalized.
///
/// This simply delegates to the `is_term_saturated` method of the
/// underlying `MultiFieldPostingsWriter`.
pub(crate) fn is_term_saturated(&self) -> bool {
self.multifield_postings.is_term_saturated()
}
/// Indexes a new document.
///
/// As a user, you should rather use `IndexWriter`'s add_document.
///
/// The document receives the current `max_doc` as its doc id and its
/// opstamp is recorded. Then, for every field of the document that the
/// schema declares as indexed:
///
/// - text fields are run through the tokenizer associated with the field
///   and the resulting token stream is added to the postings. The count
///   returned by `index_text` is recorded in the fieldnorms writer of
///   the field, if there is one. Values that are not `Value::Str` are
///   ignored, and a field without tokenizer records `0`;
/// - `u64` / `i64` fields whose options are indexed get one term per
///   value subscribed to the postings.
///
/// Finally, the fieldnorms are filled up to this doc id, the fast fields
/// are updated, the stored field values are handed to the store writer,
/// and `max_doc` is incremented.
///
/// # Errors
///
/// Returns the `io::Error` raised by the store writer, if any. In that
/// case `max_doc` is not incremented.
///
/// # Panics
///
/// Indexing `self.tokenizers` panics if the field id is out of bounds
/// (presumably only possible for a field that does not belong to the
/// schema given to `for_segment` -- to be confirmed).
pub fn add_document(
    &mut self,
    add_operation: &AddOperation,
    schema: &Schema,
) -> io::Result<()> {
    let doc_id = self.max_doc;
    let doc = &add_operation.document;
    self.doc_opstamps.push(add_operation.opstamp);
    for (field, field_values) in doc.get_sorted_field_values() {
        let field_options = schema.get_field_entry(field);
        if !field_options.is_indexed() {
            continue;
        }
        match *field_options.field_type() {
            FieldType::Str(_) => {
                let num_tokens =
                    if let Some(ref mut tokenizer) = self.tokenizers[field.0 as usize] {
                        // Only textual values can be tokenized; values of
                        // any other type are skipped.
                        let texts: Vec<&str> = field_values
                            .iter()
                            .filter_map(|field_value| match *field_value.value() {
                                Value::Str(ref text) => Some(text.as_str()),
                                _ => None,
                            })
                            .collect();
                        let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
                        self.multifield_postings
                            .index_text(doc_id, field, &mut token_stream)
                    } else {
                        0
                    };
                // Not every field has a fieldnorm writer: record the
                // count only when one exists.
                if let Some(field_norms_writer) = self.fieldnorms_writer.get_field_writer(field) {
                    field_norms_writer.add_val(u64::from(num_tokens));
                }
            }
            FieldType::U64(ref int_option) => {
                if int_option.is_indexed() {
                    for field_value in field_values {
                        let term = Term::from_field_u64(
                            field_value.field(),
                            field_value.value().u64_value(),
                        );
                        self.multifield_postings.subscribe(doc_id, &term);
                    }
                }
            }
            FieldType::I64(ref int_option) => {
                if int_option.is_indexed() {
                    for field_value in field_values {
                        let term = Term::from_field_i64(
                            field_value.field(),
                            field_value.value().i64_value(),
                        );
                        self.multifield_postings.subscribe(doc_id, &term);
                    }
                }
            }
        }
    }
    self.fieldnorms_writer.fill_val_up_to(doc_id);
    self.fast_field_writers.add_document(doc);
    let stored_fieldvalues: Vec<&FieldValue> = doc.field_values()
        .iter()
        .filter(|field_value| schema.get_field_entry(field_value.field()).is_stored())
        .collect();
    let doc_writer = self.segment_serializer.get_store_writer();
    doc_writer.store(&stored_fieldvalues)?;
    self.max_doc += 1;
    Ok(())
}
/// Max doc is
/// - the number of documents in the segment assuming there are no deletes
/// - the maximum document id (including deleted documents) + 1
///
/// The `SegmentWriter` only keeps this single counter (`num_docs`
/// returns the very same value), so here `max_doc == num_docs`.
pub fn max_doc(&self) -> u32 {
self.max_doc
}
/// Number of documents in the index.
/// Deleted documents are not meant to be counted.
///
/// The `SegmentWriter` keeps no separate count of deleted documents:
/// this returns the same counter as `max_doc`, so here
/// `max_doc == num_docs`.
#[allow(dead_code)]
pub fn num_docs(&self) -> u32 {
self.max_doc
}
}
// This method is used as a trick to workaround the borrow checker
fn write<'a>(
multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FastFieldsWriter,
mut serializer: SegmentSerializer) -> Result<()> {
try!(multifield_postings.serialize(serializer.get_postings_serializer()));
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
try!(serializer.close());
fn write(
multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FastFieldsWriter,
mut serializer: SegmentSerializer,
) -> Result<()> {
multifield_postings.serialize(serializer.get_postings_serializer())?;
fast_field_writers.serialize(serializer.get_fast_field_serializer())?;
fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer())?;
serializer.close()?;
Ok(())
Ok(())
}
impl<'a> SerializableSegment for SegmentWriter<'a> {
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
let max_doc = self.max_doc;
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer)?;
Ok(max_doc)
}
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
let max_doc = self.max_doc;
write(
&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer,
)?;
Ok(max_doc)
}
}

View File

@@ -1,17 +1,15 @@
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
}
pub fn stamp(&self,) -> u64 {
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst)
}
}
}

View File

@@ -1,26 +1,26 @@
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![allow(unknown_lints)] // for the clippy lint options
#![allow(module_inception)]
#![cfg_attr(feature = "cargo-clippy", allow(module_inception))]
#![cfg_attr(feature = "cargo-clippy", allow(inline_always))]
#![feature(box_syntax)]
#![feature(optin_builtin_traits)]
#![feature(conservative_impl_trait)]
#![feature(integer_atomics)]
#![cfg_attr(test, feature(test))]
#![cfg_attr(test, feature(step_by))]
#![cfg_attr(test, feature(iterator_step_by))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![allow(unknown_lints)]
#![allow(new_without_default)]
#![warn(missing_docs)]
//! # `tantivy`
//!
//! Tantivy is a search engine library.
//! Tantivy is a search engine library.
//! Think `Lucene`, but in Rust.
//!
//! A good place for you to get started is to check out
//! the example code ( [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) / [source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs))
//! the example code (
//! [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) /
//! [source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs))
#[macro_use]
extern crate lazy_static;
@@ -32,120 +32,103 @@ extern crate serde_derive;
extern crate log;
#[macro_use]
extern crate version;
extern crate fst;
extern crate byteorder;
extern crate memmap;
extern crate regex;
extern crate tempfile;
extern crate error_chain;
extern crate atomicwrites;
extern crate tempdir;
extern crate serde;
extern crate bincode;
extern crate serde_json;
extern crate time;
extern crate lz4;
extern crate uuid;
extern crate num_cpus;
extern crate combine;
extern crate itertools;
extern crate chan;
extern crate crossbeam;
extern crate bit_set;
extern crate byteorder;
extern crate chan;
extern crate combine;
extern crate crossbeam;
extern crate fst;
extern crate futures;
extern crate futures_cpupool;
extern crate itertools;
extern crate lz4;
extern crate memmap;
extern crate num_cpus;
extern crate owning_ref;
extern crate regex;
extern crate rust_stemmers;
extern crate serde;
extern crate serde_json;
extern crate stable_deref_trait;
extern crate tempdir;
extern crate time;
extern crate uuid;
#[macro_use]
extern crate version;
#[cfg(test)]
extern crate env_logger;
#[cfg(feature="simdcompression")]
#[cfg(feature = "simdcompression")]
extern crate libc;
#[cfg(windows)]
extern crate winapi;
#[cfg(test)] extern crate test;
#[cfg(test)] extern crate rand;
#[cfg(test)]
extern crate rand;
#[cfg(test)]
extern crate test;
extern crate tinysegmenter;
#[cfg(test)]
mod functional_test;
#[macro_use]
mod macros {
macro_rules! get(
($e:expr) => (match $e { Some(e) => e, None => return None })
);
mod macros;
macro_rules! doc(
() => (Document::default()); // avoids a warning due to the useless `mut`.
($($field:ident => $value:expr),*) => {{
let mut document = Document::default();
$(
document.add(FieldValue::new($field, $value.into()));
)*
document
}};
);
}
pub use error::Error;
pub use error::{Error, ErrorKind, ResultExt};
/// Tantivy result.
pub type Result<T> = std::result::Result<T, Error>;
mod core;
mod compression;
mod store;
mod indexer;
mod common;
#[allow(unused_doc_comment)]
mod error;
mod analyzer;
pub mod tokenizer;
mod datastruct;
/// Query module
pub mod termdict;
pub mod store;
pub mod query;
/// Directory module
pub mod directory;
/// Collector module
pub mod collector;
/// Postings module (also called inverted index)
pub mod postings;
/// Schema
pub mod schema;
pub mod fastfield;
pub use directory::Directory;
pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher};
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
pub use indexer::IndexWriter;
pub use schema::{Term, Document};
pub use core::SegmentReader;
pub use schema::{Document, Term};
pub use core::{InvertedIndexReader, SegmentReader};
pub use self::common::TimerTree;
pub use postings::DocSet;
pub use postings::Postings;
pub use postings::SegmentPostingsOption;
pub use core::TermIterator;
pub use core::SegmentComponent;
pub use common::{i64_to_u64, u64_to_i64};
/// Expose the current version of tantivy, as well
/// whether it was compiled with the simd compression.
pub fn version() -> &'static str {
if cfg!(feature="simdcompression") {
concat!(version!(), "-simd")
}
else {
concat!(version!(), "-nosimd")
if cfg!(feature = "simdcompression") {
concat!(version!(), "-simd")
} else {
concat!(version!(), "-nosimd")
}
}
/// Tantivy's makes it possible to personalize when
/// the indexer should merge its segments
/// Defines tantivy's merging strategy
pub mod merge_policy {
pub use indexer::MergePolicy;
pub use indexer::LogMergePolicy;
@@ -153,47 +136,48 @@ pub mod merge_policy {
pub use indexer::DefaultMergePolicy;
}
/// u32 identifying a document within a segment.
/// Documents have their doc id assigned incrementally,
/// A `u32` identifying a document within a segment.
/// Documents have their `DocId` assigned incrementally,
/// as they are added in the segment.
pub type DocId = u32;
/// f32 the score of a document.
/// An `f32` that represents the relevance of the document to the query.
///
/// This is modelled internally as a `f32`. The
/// larger the number, the more relevant the document
/// to the search
pub type Score = f32;
/// A segment local id identifies a segment.
/// A `SegmentLocalId` identifies a segment.
/// It only makes sense for a given searcher.
pub type SegmentLocalId = u32;
impl DocAddress {
/// Return the segment ordinal.
/// The segment ordinal is an id identifying the segment
/// hosting the document. It is only meaningful, in the context
/// of a searcher.
pub fn segment_ord(&self,) -> SegmentLocalId {
pub fn segment_ord(&self) -> SegmentLocalId {
self.0
}
/// Return the segment local `DocId`
pub fn doc(&self,) -> DocId {
pub fn doc(&self) -> DocId {
self.1
}
}
/// `DocAddress` contains all the necessary information
/// `DocAddress` contains all the necessary information
/// to identify a document given a `Searcher` object.
///
/// It consists in an id identifying its segment, and
///
/// It consists in an id identifying its segment, and
/// its segment-local `DocId`.
///
///
/// The id used for the segment is actually an ordinal
/// in the list of segments held by a `Searcher`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct DocAddress(pub SegmentLocalId, pub DocId);
#[cfg(test)]
mod tests {
@@ -201,13 +185,36 @@ mod tests {
use Index;
use core::SegmentReader;
use query::BooleanQuery;
use postings::SegmentPostingsOption;
use schema::IndexRecordOption;
use schema::*;
use DocSet;
use IndexWriter;
use fastfield::{FastFieldReader, U64FastFieldReader, I64FastFieldReader};
use fastfield::{FastFieldReader, I64FastFieldReader, U64FastFieldReader};
use Postings;
use rand::{Rng, SeedableRng, XorShiftRng};
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0..u32::max_value())
.filter(|_| rng.next_f32() < ratio)
.take(n)
.collect()
}
/// Same as `generate_array_with_seed`, with the seed value fixed to 4.
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
generate_array_with_seed(n, ratio, 4)
}
fn sample_with_seed(n: u32, ratio: f32, seed_val: u32) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0..n).filter(|_| rng.next_f32() < ratio).collect()
}
/// Same as `sample_with_seed`, with the seed value fixed to 4.
pub fn sample(n: u32, ratio: f32) -> Vec<u32> {
sample_with_seed(n, ratio, 4)
}
#[test]
fn test_indexing() {
@@ -232,11 +239,10 @@ mod tests {
}
assert!(index_writer.commit().is_ok());
}
}
#[test]
fn test_docfreq() {
fn test_docfreq1() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
@@ -274,8 +280,7 @@ mod tests {
assert_eq!(searcher.doc_freq(&term_d), 0);
}
}
#[test]
fn test_fieldnorm() {
let mut schema_builder = SchemaBuilder::default();
@@ -308,29 +313,36 @@ mod tests {
}
}
#[test]
fn test_delete_postings1() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let term_abcd = Term::from_field_text(text_field, "abcd");
let term_a = Term::from_field_text(text_field, "a");
let term_b = Term::from_field_text(text_field, "b");
let term_c = Term::from_field_text(text_field, "c");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{ // 0
{
// 0
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc);
}
{ // 1
{
// 1
let doc = doc!(text_field=>" a c");
index_writer.add_document(doc);
}
{ // 2
{
// 2
let doc = doc!(text_field=>" b c");
index_writer.add_document(doc);
}
{ // 3
{
// 3
let doc = doc!(text_field=>" b d");
index_writer.add_document(doc);
}
@@ -340,11 +352,13 @@ mod tests {
{
index_writer.delete_term(Term::from_field_text(text_field, "a"));
}
{ // 4
{
// 4
let doc = doc!(text_field=>" b c");
index_writer.add_document(doc);
}
{ // 5
{
// 5
let doc = doc!(text_field=>" a");
index_writer.add_document(doc);
}
@@ -354,15 +368,24 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
let inverted_index = reader.inverted_index(text_field);
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -373,11 +396,13 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{ // 0
{
// 0
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc);
}
{ // 1
{
// 1
index_writer.delete_term(Term::from_field_text(text_field, "c"));
}
index_writer.rollback().unwrap();
@@ -386,15 +411,25 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 5);
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -405,14 +440,14 @@ mod tests {
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
{
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc);
}
{
{
index_writer.delete_term(Term::from_field_text(text_field, "c"));
}
index_writer = index_writer.rollback().unwrap();
index_writer = index_writer.rollback().unwrap();
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.commit().unwrap();
}
@@ -420,13 +455,22 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap();
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 3);
assert!(postings.advance());
@@ -434,7 +478,9 @@ mod tests {
assert!(!postings.advance());
}
{
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "c")).unwrap();
let mut postings = inverted_index
.read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 4);
assert!(!postings.advance());
@@ -442,23 +488,24 @@ mod tests {
}
}
#[test]
fn test_indexed_u64() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(
doc!(field=>1u64)
);
index_writer.add_document(doc!(field=>1u64));
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher.segment_reader(0).read_postings(&term, SegmentPostingsOption::NoFreq).unwrap();
let mut postings = searcher
.segment_reader(0)
.inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert!(!postings.advance());
@@ -469,20 +516,20 @@ mod tests {
let mut schema_builder = SchemaBuilder::default();
let value_field = schema_builder.add_i64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let negative_val = -1i64;
index_writer.add_document(
doc!(value_field => negative_val)
);
index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
.segment_reader(0)
.read_postings(&term, SegmentPostingsOption::NoFreq).unwrap();
.inverted_index(term.field())
.read_postings(&term, IndexRecordOption::Basic)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert!(!postings.advance());
@@ -494,15 +541,15 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
let doc = doc!(text_field=>val);
index_writer.add_document(doc);
};
let remove_document = |index_writer: &mut IndexWriter, val: &'static str| {
let delterm = Term::from_field_text(text_field, val);
index_writer.delete_term(delterm);
@@ -543,8 +590,17 @@ mod tests {
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none());
let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "af")).unwrap();
let inverted_index = reader.inverted_index(text_field);
let term_abcd = Term::from_field_text(text_field, "abcd");
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
let term_af = Term::from_field_text(text_field, "af");
let mut postings = inverted_index
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert_eq!(postings.term_freq(), 3);
@@ -587,34 +643,42 @@ mod tests {
};
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(text_field, "a"))),
vec!(1, 2));
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
vec![1, 2]
);
}
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(text_field, "af"))),
vec!(0));
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
vec![0]
);
}
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(text_field, "b"))),
vec!(0, 1, 2));
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
vec![0, 1, 2]
);
}
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(text_field, "c"))),
vec!(1, 2));
get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
vec![1, 2]
);
}
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(text_field, "d"))),
vec!(2));
get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
vec![2]
);
}
{
assert_eq!(
get_doc_ids(vec!(Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"), )),
vec!(0, 1, 2));
get_doc_ids(vec![
Term::from_field_text(text_field, "b"),
Term::from_field_text(text_field, "a"),
]),
vec![0, 1, 2]
);
}
}
}
@@ -651,7 +715,9 @@ mod tests {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let other_text_field = schema_builder.add_text_field("text2", TEXT);
let document = doc!(text_field => "tantivy", text_field => "some other value", other_text_field => "short");
let document = doc!(text_field => "tantivy",
text_field => "some other value",
other_text_field => "short");
assert_eq!(document.len(), 3);
let values = document.get_all(text_field);
assert_eq!(values.len(), 2);
@@ -683,30 +749,34 @@ mod tests {
let searcher = index.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
{
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(text_field);
let fast_field_reader_res =
segment_reader.get_fast_field_reader::<U64FastFieldReader>(text_field);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(stored_int_field);
let fast_field_reader_res =
segment_reader.get_fast_field_reader::<U64FastFieldReader>(stored_int_field);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(fast_field_signed);
let fast_field_reader_res =
segment_reader.get_fast_field_reader::<U64FastFieldReader>(fast_field_signed);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_res = segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
let fast_field_reader_res =
segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)
}
{
let fast_field_reader_res = segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
let fast_field_reader_res =
segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)
}
}
}

66
src/macros.rs Normal file
View File

@@ -0,0 +1,66 @@
/// Unwraps an `Option` expression.
///
/// Evaluates to the inner value on `Some`; on `None`, makes the
/// *enclosing function* return `None`. It can therefore only be used
/// inside functions that return an `Option`.
macro_rules! get {
    ($opt:expr) => {
        match $opt {
            Some(value) => value,
            None => return None,
        }
    };
}
/// `doc!` is a shortcut that helps building `Document`
/// objects.
///
/// Assuming that `field1` and `field2` are `Field` instances.
/// You can create a document with a value of `value1` for `field1`
/// `value2` for `field2`, as follows :
///
/// ```c
/// doc!(
///     field1 => value1,
///     field2 => value2,
/// )
/// ```
///
/// A single trailing comma after the last `field => value` pair is
/// accepted, as in the snippet above.
///
/// The value can be a `u64`, a `&str`, a `i64`, or a `String`.
///
/// # Warning
///
/// The document created this way is not validated against a schema.
/// Nothing prevents its user from creating an invalid document missing a
/// field, or associating a `String` to a `u64` field for instance.
///
/// # Example
///
/// ```
/// #[macro_use]
/// extern crate tantivy;
///
/// use tantivy::schema::{SchemaBuilder, TEXT, FAST};
///
/// //...
///
/// # fn main() {
/// let mut schema_builder = SchemaBuilder::new();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let author = schema_builder.add_text_field("text", TEXT);
/// let likes = schema_builder.add_u64_field("num_u64", FAST);
/// let schema = schema_builder.build();
/// let doc = doc!(
///     title => "Life Aquatic",
///     author => "Wes Anderson",
///     likes => 4u64
/// );
/// # }
/// ```
#[macro_export]
macro_rules! doc(
    () => {
        {
            ($crate::Document::default())
        }
    }; // avoids a warning due to the useless `mut`.
    ($($field:ident => $value:expr),*) => {
        {
            let mut document = $crate::Document::default();
            $(
                document.add($crate::schema::FieldValue::new($field, $value.into()));
            )*
            document
        }
    };
    // A trailing comma after the last pair is allowed: retry the
    // expansion with the trailing comma stripped.
    ($($field:ident => $value:expr),+ ,) => {
        doc!( $( $field => $value ),* )
    };
);

View File

@@ -3,7 +3,6 @@ use std::borrow::Borrow;
use std::borrow::BorrowMut;
use std::cmp::Ordering;
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
#[derive(PartialEq, Eq, Debug)]
pub enum SkipResult {
@@ -16,7 +15,6 @@ pub enum SkipResult {
End,
}
/// Represents an iterable set of sorted doc ids.
pub trait DocSet {
/// Goes to the next element.
@@ -52,6 +50,32 @@ pub trait DocSet {
}
}
/// Copies the next doc ids of the `DocSet` into `buffer`.
///
/// The `DocSet` is advanced once per slot. If enough `DocId`s are
/// available, the whole buffer is filled and its length is returned.
///
/// If the `DocSet` is exhausted first, only the leading slots are
/// written and the number of written slots is returned; the remaining
/// slots are left untouched.
///
/// # Warning
///
/// This method only exists for specific high-performance use cases
/// in which batching helps. The normal way to go through the
/// `DocId`s is to call `.advance()`.
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
    let mut num_filled = 0;
    for slot in buffer.iter_mut() {
        if !self.advance() {
            break;
        }
        *slot = self.doc();
        num_filled += 1;
    }
    num_filled
}
/// Returns the current document
fn doc(&self) -> DocId;
@@ -65,8 +89,11 @@ pub trait DocSet {
None
}
}
}
/// Returns a best-effort hint of the
/// length of the docset.
fn size_hint(&self) -> usize;
}
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
fn advance(&mut self) -> bool {
@@ -83,6 +110,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
let unboxed: &TDocSet = self.borrow();
unboxed.doc()
}
fn size_hint(&self) -> usize {
let unboxed: &TDocSet = self.borrow();
unboxed.size_hint()
}
}
impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
@@ -100,4 +132,9 @@ impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
let unref: &TDocSet = *self;
unref.doc()
}
fn size_hint(&self) -> usize {
let unref: &TDocSet = *self;
unref.size_hint()
}
}

View File

@@ -1,126 +0,0 @@
use compression::BlockDecoder;
use common::VInt;
use common::BinarySerializable;
use compression::{CompositeDecoder, VIntDecoder};
use postings::SegmentPostingsOption;
use compression::NUM_DOCS_PER_BLOCK;
/// `FreqHandler` is in charge of decompressing
/// frequencies and/or positions.
///
/// Which of these are actually decoded is decided by `option`,
/// which is set once by the constructor used.
pub struct FreqHandler {
// Decoder holding the term frequencies of the current block;
// `freq(idx)` reads its output.
freq_decoder: BlockDecoder,
// All positions of the term. They are loaded delta-encoded and are
// decoded in place, block by block, by `fill_positions_offset`.
positions: Vec<u32>,
// Selects what `read_freq_block` / `read_freq_vint` decode.
option: SegmentPostingsOption,
// `positions_offsets[idx]..positions_offsets[idx + 1]` is the range
// in `positions` belonging to the doc at offset `idx` of the block.
positions_offsets: [usize; NUM_DOCS_PER_BLOCK + 1],
}
/// Decodes the position data of a term.
///
/// `data` starts with a `VInt` giving the number of values to decode;
/// the bytes that follow are handed to a `CompositeDecoder`, which is
/// then converted into the resulting `Vec<u32>`.
///
/// # Panics
///
/// Panics if the leading `VInt` cannot be deserialized.
fn read_positions(data: &[u8]) -> Vec<u32> {
    let mut remaining: &[u8] = data;
    let num_positions = VInt::deserialize(&mut remaining).unwrap().0 as usize;
    let mut decoder = CompositeDecoder::new();
    decoder.uncompress_unsorted(remaining, num_positions);
    decoder.into()
}
impl FreqHandler {
/// Returns a `FreqHandler` that just decodes `DocId`s.
///
/// No frequency data is ever read in this mode: `read_freq_block`
/// returns its input untouched and `read_freq_vint` does nothing.
pub fn new_without_freq() -> FreqHandler {
FreqHandler {
// NOTE(review): presumably pre-fills the decoder output with 1 so
// that `freq()` reports a term frequency of 1 — confirm against
// `BlockDecoder::with_val`.
freq_decoder: BlockDecoder::with_val(1u32),
positions: Vec::new(),
option: SegmentPostingsOption::NoFreq,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
/// Returns a `FreqHandler` that decodes `DocId`s and term frequencies.
///
/// No positions are loaded; `positions` stays empty.
pub fn new_with_freq() -> FreqHandler {
FreqHandler {
freq_decoder: BlockDecoder::new(),
positions: Vec::new(),
option: SegmentPostingsOption::Freq,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
/// Returns a `FreqHandler` that decodes `DocId`s, term frequencies, and term positions.
///
/// `position_data` is decoded eagerly by `read_positions`, so this
/// panics if that data does not start with a readable `VInt`.
pub fn new_with_freq_and_position(position_data: &[u8]) -> FreqHandler {
let positions = read_positions(position_data);
FreqHandler {
freq_decoder: BlockDecoder::new(),
positions: positions,
option: SegmentPostingsOption::FreqAndPositions,
positions_offsets: [0; NUM_DOCS_PER_BLOCK + 1],
}
}
/// Recomputes `positions_offsets` for the freshly decoded block of
/// frequencies and turns the matching delta-encoded positions into
/// absolute positions, in place.
fn fill_positions_offset(&mut self) {
// Continue from the last offset slot left by the previous call
// (it is 0 right after construction).
let mut cur_position: usize = self.positions_offsets[NUM_DOCS_PER_BLOCK];
let mut i: usize = 0;
self.positions_offsets[i] = cur_position;
let mut last_cur_position = cur_position;
for &doc_freq in self.freq_decoder.output_array() {
i += 1;
let mut cumulated_pos = 0u32;
// this next loop decodes delta positions into normal positions.
// The running sum restarts at 0 for every document.
for j in last_cur_position..(last_cur_position + (doc_freq as usize)) {
cumulated_pos += self.positions[j];
self.positions[j] = cumulated_pos;
}
// The doc owns `doc_freq` positions, so the next doc starts right after them.
cur_position += doc_freq as usize;
self.positions_offsets[i] = cur_position;
last_cur_position = cur_position;
}
}
/// Accessor to term frequency
///
/// idx is the offset of the current doc in the block.
/// NOTE(review): presumably expected to be in `0..NUM_DOCS_PER_BLOCK`;
/// the bound itself is enforced by `BlockDecoder::output`, not here.
pub fn freq(&self, idx: usize) -> u32 {
self.freq_decoder.output(idx)
}
/// Accessor to the positions
///
/// idx is the offset of the current doc in the block.
/// It must be strictly smaller than `NUM_DOCS_PER_BLOCK`, as
/// `positions_offsets[idx + 1]` is read; a larger value panics.
pub fn positions(&self, idx: usize) -> &[u32] {
let start = self.positions_offsets[idx];
let stop = self.positions_offsets[idx + 1];
&self.positions[start..stop]
}
/// Decompresses a complete frequency block
///
/// Returns the slice handed back by the decoder (presumably the
/// bytes following the block). In `NoFreq` mode nothing is decoded
/// and `data` is returned as is.
pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
match self.option {
SegmentPostingsOption::NoFreq => data,
SegmentPostingsOption::Freq => self.freq_decoder.uncompress_block_unsorted(data),
SegmentPostingsOption::FreqAndPositions => {
let remaining: &'a [u8] = self.freq_decoder.uncompress_block_unsorted(data);
// Frequencies are needed first: they give the number of
// positions owned by each doc of the block.
self.fill_positions_offset();
remaining
}
}
}
/// Decompresses an incomplete frequency block
///
/// `num_els` is the number of frequencies to decode from `data`.
/// In `NoFreq` mode this is a no-op.
pub fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
match self.option {
SegmentPostingsOption::NoFreq => {}
SegmentPostingsOption::Freq => {
self.freq_decoder.uncompress_vint_unsorted(data, num_els);
}
SegmentPostingsOption::FreqAndPositions => {
self.freq_decoder.uncompress_vint_unsorted(data, num_els);
self.fill_positions_offset();
}
}
}
}

View File

@@ -10,12 +10,13 @@ pub struct IntersectionDocSet<TDocSet: DocSet> {
}
impl<TDocSet: DocSet> From<Vec<TDocSet>> for IntersectionDocSet<TDocSet> {
fn from(docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> {
fn from(mut docsets: Vec<TDocSet>) -> IntersectionDocSet<TDocSet> {
assert!(docsets.len() >= 2);
docsets.sort_by_key(|docset| docset.size_hint());
IntersectionDocSet {
docsets: docsets,
docsets,
finished: false,
doc: DocId::max_value(),
doc: 0u32,
}
}
}
@@ -29,39 +30,51 @@ impl<TDocSet: DocSet> IntersectionDocSet<TDocSet> {
}
}
impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
fn size_hint(&self) -> usize {
self.docsets
.iter()
.map(|docset| docset.size_hint())
.min()
.unwrap() // safe as docsets cannot be empty.
}
#[allow(never_loop)]
fn advance(&mut self) -> bool {
if self.finished {
return false;
}
let num_docsets = self.docsets.len();
let mut count_matching = 0;
let mut doc_candidate = 0;
let mut ord = 0;
loop {
let mut doc_set = &mut self.docsets[ord];
match doc_set.skip_next(doc_candidate) {
SkipResult::Reached => {
count_matching += 1;
if count_matching == num_docsets {
self.doc = doc_candidate;
return true;
let mut candidate_doc = self.doc;
let mut candidate_ord = self.docsets.len();
'outer: loop {
for (ord, docset) in self.docsets.iter_mut().enumerate() {
if ord != candidate_ord {
// `candidate_ord` is already at the
// right position.
//
// Calling `skip_next` would advance this docset
// and miss it.
match docset.skip_next(candidate_doc) {
SkipResult::Reached => {}
SkipResult::OverStep => {
// this is not in the intersection,
// let's update our candidate.
candidate_doc = docset.doc();
candidate_ord = ord;
continue 'outer;
}
SkipResult::End => {
self.finished = true;
return false;
}
}
}
SkipResult::End => {
self.finished = true;
return false;
}
SkipResult::OverStep => {
count_matching = 1;
doc_candidate = doc_set.doc();
}
}
ord += 1;
if ord == num_docsets {
ord = 0;
}
self.doc = candidate_doc;
return true;
}
}
@@ -69,3 +82,50 @@ impl<TDocSet: DocSet> DocSet for IntersectionDocSet<TDocSet> {
self.doc
}
}
#[cfg(test)]
mod tests {
    use postings::{DocSet, IntersectionDocSet, VecPostings};

    /// The intersection of two, then three, posting lists yields
    /// exactly their common doc ids, in increasing order.
    #[test]
    fn test_intersection() {
        {
            let postings = vec![
                VecPostings::from(vec![1, 3, 9]),
                VecPostings::from(vec![3, 4, 9, 18]),
            ];
            let mut docset = IntersectionDocSet::from(postings);
            assert!(docset.advance());
            assert_eq!(docset.doc(), 3);
            assert!(docset.advance());
            assert_eq!(docset.doc(), 9);
            assert!(!docset.advance());
        }
        {
            let postings = vec![
                VecPostings::from(vec![1, 3, 9]),
                VecPostings::from(vec![3, 4, 9, 18]),
                VecPostings::from(vec![1, 5, 9, 111]),
            ];
            let mut docset = IntersectionDocSet::from(postings);
            assert!(docset.advance());
            assert_eq!(docset.doc(), 9);
            assert!(!docset.advance());
        }
    }

    /// Doc id `0` must be reported when it belongs to every docset.
    #[test]
    fn test_intersection_zero() {
        let postings = vec![VecPostings::from(vec![0]), VecPostings::from(vec![0])];
        let mut docset = IntersectionDocSet::from(postings);
        assert!(docset.advance());
        assert_eq!(docset.doc(), 0);
    }

    /// Docsets that only overlap pairwise have an empty intersection.
    #[test]
    fn test_intersection_empty() {
        let postings = vec![
            VecPostings::from(vec![1, 3]),
            VecPostings::from(vec![1, 4]),
            VecPostings::from(vec![3, 9]),
        ];
        let mut docset = IntersectionDocSet::from(postings);
        assert!(!docset.advance());
    }
}

View File

@@ -1,9 +1,12 @@
/*!
Postings module (also called inverted index)
*/
/// Postings module
///
/// Postings, also called inverted lists, are the key data structure
/// of full-text search.
mod postings;
mod recorder;
mod serializer;
@@ -12,46 +15,43 @@ mod term_info;
mod vec_postings;
mod segment_postings;
mod intersection;
mod freq_handler;
mod docset;
mod segment_postings_option;
pub use self::docset::{SkipResult, DocSet};
use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
pub use self::serializer::PostingsSerializer;
pub use self::docset::{DocSet, SkipResult};
use self::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::term_info::TermInfo;
pub use self::postings::Postings;
#[cfg(test)]
pub use self::vec_postings::VecPostings;
pub use self::segment_postings::SegmentPostings;
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
pub use self::intersection::IntersectionDocSet;
pub use self::freq_handler::FreqHandler;
pub use self::segment_postings_option::SegmentPostingsOption;
pub use common::HasLen;
#[cfg(test)]
mod tests {
use super::*;
use schema::{Document, TEXT, STRING, SchemaBuilder, Term};
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
use core::SegmentComponent;
use indexer::SegmentWriter;
use core::SegmentReader;
use core::Index;
use schema::IndexRecordOption;
use std::iter;
use datastruct::stacker::Heap;
use fastfield::FastFieldReader;
use query::TermQuery;
use schema::Field;
use test::Bencher;
use test::{self, Bencher};
use indexer::operation::AddOperation;
use rand::{XorShiftRng, Rng, SeedableRng};
use tests;
use rand::{Rng, SeedableRng, XorShiftRng};
#[test]
pub fn test_position_write() {
let mut schema_builder = SchemaBuilder::default();
@@ -59,36 +59,43 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut segment = index.new_segment();
let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
posting_serializer.new_field(text_field);
posting_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..3u32 {
let positions = vec!(1,2,3,2);
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap();
{
let mut field_serializer = posting_serializer.new_field(text_field).unwrap();
field_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..120u32 {
let delta_positions = vec![1, 2, 3, 2];
field_serializer
.write_doc(doc_id, 2, &delta_positions)
.unwrap();
}
field_serializer.close_term().unwrap();
}
posting_serializer.close_term().unwrap();
posting_serializer.close().unwrap();
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
assert!(read.len() <= 16);
assert!(read.len() <= 140);
}
#[test]
pub fn test_position_and_fieldnorm() {
pub fn test_position_and_fieldnorm1() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let segment = index.new_segment();
let heap = Heap::with_capacity(10_000_000);
{
let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema).unwrap();
let mut segment_writer =
SegmentWriter::for_segment(&heap, 18, segment.clone(), &schema).unwrap();
{
let mut doc = Document::default();
// checking that position works if the field has two values
doc.add_text(text_field, "a b a c a d a a.");
doc.add_text(text_field, "d d d d a"); // checking that position works if the field has two values.
doc.add_text(text_field, "d d d d a");
let op = AddOperation {
opstamp: 0u64,
document: doc,
document: doc,
};
segment_writer.add_document(&op, &schema).unwrap();
}
@@ -97,7 +104,7 @@ mod tests {
doc.add_text(text_field, "b a");
let op = AddOperation {
opstamp: 1u64,
document: doc,
document: doc,
};
segment_writer.add_document(&op, &schema).unwrap();
}
@@ -108,38 +115,47 @@ mod tests {
doc.add_text(text_field, &text);
let op = AddOperation {
opstamp: 2u64,
document: doc,
document: doc,
};
segment_writer.add_document(&op, &schema).unwrap();
}
segment_writer.finalize().unwrap();
}
{
let segment_reader = SegmentReader::open(segment).unwrap();
let segment_reader = SegmentReader::open(&segment).unwrap();
{
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap();
assert_eq!(fieldnorm_reader.get(0), 8 + 5);
assert_eq!(fieldnorm_reader.get(1), 2);
for i in 2 .. 1000 {
for i in 2..1000 {
assert_eq!(fieldnorm_reader.get(i), (i + 1) as u64);
}
}
{
let term_a = Term::from_field_text(text_field, "abcdef");
assert!(segment_reader.read_postings_all_info(&term_a).is_none());
assert!(
segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
}
{
let term_a = Term::from_field_text(text_field, "a");
let mut postings_a = segment_reader.read_postings_all_info(&term_a).unwrap();
let mut postings_a = segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert_eq!(postings_a.len(), 1000);
assert!(postings_a.advance());
assert_eq!(postings_a.doc(), 0);
assert_eq!(postings_a.term_freq(), 6);
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
assert_eq!(postings_a.positions(), [0, 2, 4, 6, 7, 13]);
assert!(postings_a.advance());
assert_eq!(postings_a.doc(), 1u32);
assert_eq!(postings_a.term_freq(), 1);
for i in 2u32 .. 1000u32 {
for i in 2u32..1000u32 {
assert!(postings_a.advance());
assert_eq!(postings_a.term_freq(), 1);
assert_eq!(postings_a.positions(), [i]);
@@ -149,9 +165,12 @@ mod tests {
}
{
let term_e = Term::from_field_text(text_field, "e");
let mut postings_e = segment_reader.read_postings_all_info(&term_e).unwrap();
let mut postings_e = segment_reader
.inverted_index(term_e.field())
.read_postings(&term_e, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert_eq!(postings_e.len(), 1000 - 2);
for i in 2u32 .. 1000u32 {
for i in 2u32..1000u32 {
assert!(postings_e.advance());
assert_eq!(postings_e.term_freq(), i);
let positions = postings_e.positions();
@@ -165,7 +184,7 @@ mod tests {
}
}
}
#[test]
pub fn test_position_and_fieldnorm2() {
let mut schema_builder = SchemaBuilder::default();
@@ -187,41 +206,193 @@ mod tests {
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, "a"),
IndexRecordOption::Basic,
);
let searcher = index.searcher();
let mut term_weight = term_query.specialized_weight(&*searcher);
term_weight.segment_postings_options = SegmentPostingsOption::FreqAndPositions;
term_weight.index_record_option = IndexRecordOption::WithFreqsAndPositions;
let segment_reader = &searcher.segment_readers()[0];
let mut term_scorer = term_weight.specialized_scorer(segment_reader).unwrap();
assert!(term_scorer.advance());
assert_eq!(term_scorer.doc(), 1u32);
assert_eq!(term_scorer.postings().positions(), &[1u32, 4]);
}
#[test]
fn test_intersection() {
{
let left = VecPostings::from(vec!(1, 3, 9));
let right = VecPostings::from(vec!(3, 4, 9, 18));
let mut intersection = IntersectionDocSet::from(vec!(left, right));
assert!(intersection.advance());
assert_eq!(intersection.doc(), 3);
assert!(intersection.advance());
assert_eq!(intersection.doc(), 9);
assert!(!intersection.advance());
fn test_skip_next() {
let term_0 = Term::from_field_u64(Field(0), 0);
let term_1 = Term::from_field_u64(Field(0), 1);
let term_2 = Term::from_field_u64(Field(0), 2);
let num_docs = 300u32;
let index = {
let mut schema_builder = SchemaBuilder::default();
let value_field = schema_builder.add_u64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
for i in 0..num_docs {
let mut doc = Document::default();
doc.add_u64(value_field, 2);
doc.add_u64(value_field, (i % 2) as u64);
index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
index
};
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
// check that the basic usage works
for i in 0..num_docs - 1 {
for j in i + 1..num_docs {
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
assert_eq!(segment_postings.doc(), i);
assert_eq!(segment_postings.skip_next(j), SkipResult::Reached);
assert_eq!(segment_postings.doc(), j);
}
}
{
let a = VecPostings::from(vec!(1, 3, 9));
let b = VecPostings::from(vec!(3, 4, 9, 18));
let c = VecPostings::from(vec!(1, 5, 9, 111));
let mut intersection = IntersectionDocSet::from(vec!(a, b, c));
assert!(intersection.advance());
assert_eq!(intersection.doc(), 9);
assert!(!intersection.advance());
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
// check that `skip_next` advances the iterator
assert!(segment_postings.advance());
assert_eq!(segment_postings.doc(), 0);
assert_eq!(segment_postings.skip_next(1), SkipResult::Reached);
assert_eq!(segment_postings.doc(), 1);
assert_eq!(segment_postings.skip_next(1), SkipResult::OverStep);
assert_eq!(segment_postings.doc(), 2);
// check that going beyond the end is handled
assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End);
}
// check that filtering works
{
let mut segment_postings = segment_reader
.inverted_index(term_0.field())
.read_postings(&term_0, IndexRecordOption::Basic)
.unwrap();
for i in 0..num_docs / 2 {
assert_eq!(segment_postings.skip_next(i * 2), SkipResult::Reached);
assert_eq!(segment_postings.doc(), i * 2);
}
let mut segment_postings = segment_reader
.inverted_index(term_0.field())
.read_postings(&term_0, IndexRecordOption::Basic)
.unwrap();
for i in 0..num_docs / 2 - 1 {
assert_eq!(segment_postings.skip_next(i * 2 + 1), SkipResult::OverStep);
assert_eq!(segment_postings.doc(), (i + 1) * 2);
}
}
// delete some of the documents
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
// make sure seeking still works
for i in 0..num_docs {
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
if i % 2 == 0 {
assert_eq!(segment_postings.skip_next(i), SkipResult::OverStep);
assert_eq!(segment_postings.doc(), i + 1);
} else {
assert_eq!(segment_postings.skip_next(i), SkipResult::Reached);
assert_eq!(segment_postings.doc(), i);
}
}
// now try with a longer sequence
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
let mut last = 2; // start the sequence at (2, 3): starting lower would seek to doc 3 twice
let mut cur = 3;
loop {
match segment_postings.skip_next(cur) {
SkipResult::End => break,
SkipResult::Reached => assert_eq!(segment_postings.doc(), cur),
SkipResult::OverStep => assert_eq!(segment_postings.doc(), cur + 1),
}
let next = cur + last;
last = cur;
cur = next;
}
assert_eq!(cur, 377);
}
// delete everything else
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
// finally, check that it's empty
{
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
assert_eq!(segment_postings.skip_next(0), SkipResult::End);
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End);
}
}
lazy_static! {
static ref TERM_A: Term = {
let field = Field(0);
@@ -231,34 +402,40 @@ mod tests {
let field = Field(0);
Term::from_field_text(field, "b")
};
static ref TERM_C: Term = {
let field = Field(0);
Term::from_field_text(field, "c")
};
static ref TERM_D: Term = {
let field = Field(0);
Term::from_field_text(field, "d")
};
static ref INDEX: Index = {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
let seed: &[u32; 4] = &[1, 2, 3, 4];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
let index = Index::create_in_ram(schema);
let mut count_a = 0;
let mut count_b = 0;
let posting_list_size = 100_000;
let posting_list_size = 1_000_000;
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
for _ in 0 .. {
if count_a >= posting_list_size &&
count_b >= posting_list_size {
break;
}
for _ in 0 .. posting_list_size {
let mut doc = Document::default();
if count_a < posting_list_size && rng.gen_weighted_bool(15) {
count_a += 1;
if rng.gen_weighted_bool(15) {
doc.add_text(text_field, "a");
}
if count_b < posting_list_size && rng.gen_weighted_bool(10) {
count_b += 1;
if rng.gen_weighted_bool(10) {
doc.add_text(text_field, "b");
}
if rng.gen_weighted_bool(5) {
doc.add_text(text_field, "c");
}
if rng.gen_weighted_bool(1) {
doc.add_text(text_field, "d");
}
index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
@@ -267,27 +444,122 @@ mod tests {
index
};
}
#[bench]
fn bench_segment_postings(b: &mut Bencher) {
let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap();
while segment_postings.advance() {}
});
}
}
#[bench]
fn bench_segment_intersection(b: &mut Bencher) {
let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let segment_postings_a = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
let segment_postings_b = segment_reader.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq).unwrap();
let mut intersection = IntersectionDocSet::from(vec!(segment_postings_a, segment_postings_b));
let segment_postings_a = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap();
let segment_postings_b = segment_reader
.inverted_index(TERM_B.field())
.read_postings(&*TERM_B, IndexRecordOption::Basic)
.unwrap();
let segment_postings_c = segment_reader
.inverted_index(TERM_C.field())
.read_postings(&*TERM_C, IndexRecordOption::Basic)
.unwrap();
let segment_postings_d = segment_reader
.inverted_index(TERM_D.field())
.read_postings(&*TERM_D, IndexRecordOption::Basic)
.unwrap();
let mut intersection = IntersectionDocSet::from(vec![
segment_postings_a,
segment_postings_b,
segment_postings_c,
segment_postings_d,
]);
while intersection.advance() {}
});
}
}
/// Shared body of the `bench_skip_next_*` benchmarks: measures
/// `skip_next` over the postings of `TERM_A` in the first segment.
///
/// `p` is forwarded to `tests::sample` together with the number of docs
/// of the segment. NOTE(review): presumably `p` is the probability of
/// keeping each doc id — confirm against `tests::sample`.
fn bench_skip_next(p: f32, b: &mut Bencher) {
let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0);
let docs = tests::sample(segment_reader.num_docs(), p);
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap();
// Preparation pass (not measured): build the list of skip targets.
// A sampled doc is kept when it is not behind the current position
// of the postings; the postings are then moved to it.
let mut existing_docs = Vec::new();
segment_postings.advance();
for doc in &docs {
if *doc >= segment_postings.doc() {
existing_docs.push(*doc);
if segment_postings.skip_next(*doc) == SkipResult::End {
break;
}
}
}
// Measured part: open fresh postings and replay the skip targets.
b.iter(|| {
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap();
for doc in &existing_docs {
if segment_postings.skip_next(*doc) == SkipResult::End {
break;
}
}
});
}
/// Runs `bench_skip_next` with `p = 0.001`.
#[bench]
fn bench_skip_next_p01(b: &mut Bencher) {
bench_skip_next(0.001, b);
}
/// Runs `bench_skip_next` with `p = 0.01`.
#[bench]
fn bench_skip_next_p1(b: &mut Bencher) {
bench_skip_next(0.01, b);
}
/// Runs `bench_skip_next` with `p = 0.1`.
#[bench]
fn bench_skip_next_p10(b: &mut Bencher) {
bench_skip_next(0.1, b);
}
/// Runs `bench_skip_next` with `p = 0.9`.
#[bench]
fn bench_skip_next_p90(b: &mut Bencher) {
bench_skip_next(0.9, b);
}
/// Measures a full `advance()` scan over the postings of `TERM_A`
/// in the first segment, reading `doc()` at every step.
#[bench]
fn bench_iterate_segment_postings(b: &mut Bencher) {
let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
// `black_box` hides the constant from the optimizer.
let n: u32 = test::black_box(17);
let mut segment_postings = segment_reader
.inverted_index(TERM_A.field())
.read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap();
// Fold every doc id into `s` and return it from the closure,
// presumably so that the loop cannot be optimized away.
let mut s = 0u32;
while segment_postings.advance() {
s += (segment_postings.doc() & n) % 1024;
}
s
});
}
}

View File

@@ -1,47 +1,45 @@
use DocId;
use schema::Term;
use schema::FieldValue;
use postings::PostingsSerializer;
use postings::{FieldSerializer, InvertedIndexSerializer};
use std::io;
use postings::Recorder;
use analyzer::SimpleTokenizer;
use Result;
use schema::{Schema, Field};
use analyzer::StreamingIterator;
use schema::{Field, Schema};
use tokenizer::Token;
use std::marker::PhantomData;
use schema::extract_field_from_term_bytes;
use std::ops::DerefMut;
use datastruct::stacker::{HashMap, Heap};
use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder};
use schema::FieldEntry;
use schema::FieldType;
use schema::TextIndexingOptions;
use tokenizer::TokenStream;
use schema::IndexRecordOption;
fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box<PostingsWriter + 'a> {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
match text_options.get_indexing_options() {
TextIndexingOptions::TokenizedWithFreq => {
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
}
TextIndexingOptions::TokenizedWithFreqAndPosition => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
}
_ => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
}
}
FieldType::U64(_) => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
FieldType::I64(_) => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
}
/// Builds the boxed `PostingsWriter` suited to the way `field_entry` is indexed.
///
/// The recorder type is chosen from the field's `IndexRecordOption`:
/// * `WithFreqs` -> `TermFrequencyRecorder`
/// * `WithFreqsAndPositions` -> `TFAndPositionRecorder`
/// * `Basic`, a text field without indexing options, or a `U64` / `I64`
///   field -> `NothingRecorder`
fn posting_from_field_entry<'a>(
    field_entry: &FieldEntry,
    heap: &'a Heap,
) -> Box<PostingsWriter + 'a> {
    // First reduce the field type to an optional record option...
    let record_option = match *field_entry.field_type() {
        FieldType::Str(ref text_options) => text_options
            .get_indexing_options()
            .map(|indexing_options| indexing_options.index_option()),
        FieldType::U64(_) | FieldType::I64(_) => None,
    };
    // ...then pick the recorder specialization in a single place.
    match record_option {
        Some(IndexRecordOption::WithFreqs) => {
            SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
        }
        Some(IndexRecordOption::WithFreqsAndPositions) => {
            SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
        }
        Some(IndexRecordOption::Basic) | None => {
            SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
        }
    }
}
pub struct MultiFieldPostingsWriter<'a> {
heap: &'a Heap,
term_index: HashMap<'a>,
@@ -49,58 +47,48 @@ pub struct MultiFieldPostingsWriter<'a> {
}
impl<'a> MultiFieldPostingsWriter<'a> {
/// Create a new `MultiFieldPostingsWriter` given
/// a schema and a heap.
pub fn new(schema: &Schema, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
let capacity = heap.capacity();
let hashmap_size = hashmap_size_in_bits(capacity);
let term_index = HashMap::new(hashmap_size, heap);
let mut per_field_postings_writers: Vec<_> = vec!();
for field_entry in schema.fields() {
let field_entry = posting_from_field_entry(&field_entry, heap);
per_field_postings_writers.push(field_entry);
}
pub fn new(schema: &Schema, table_bits: usize, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
let term_index = HashMap::new(table_bits, heap);
let per_field_postings_writers: Vec<_> = schema
.fields()
.iter()
.map(|field_entry| posting_from_field_entry(field_entry, heap))
.collect();
MultiFieldPostingsWriter {
heap: heap,
term_index: term_index,
per_field_postings_writers: per_field_postings_writers
heap,
term_index,
per_field_postings_writers,
}
}
pub fn index_text(&mut self,
doc: DocId,
field: Field,
field_values: &[&FieldValue])
-> u32 {
pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
postings_writer.index_text(&mut self.term_index, doc, field, field_values, self.heap)
postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
}
pub fn suscribe(&mut self, doc: DocId, term: &Term) {
pub fn subscribe(&mut self, doc: DocId, term: &Term) {
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
postings_writer.suscribe(&mut self.term_index, doc, 0u32, term, self.heap)
}
/// Serialize the inverted index.
/// It pushes all term, one field at a time, towards the
/// It pushes all term, one field at a time, towards the
/// postings serializer.
pub fn serialize(&self, serializer: &mut PostingsSerializer) -> Result<()> {
let mut term_offsets: Vec<(&[u8], u32)> = self.term_index
.iter()
.collect();
#[allow(needless_range_loop)]
pub fn serialize(&self, serializer: &mut InvertedIndexSerializer) -> Result<()> {
let mut term_offsets: Vec<(&[u8], u32)> = self.term_index.iter().collect();
term_offsets.sort_by_key(|&(k, _v)| k);
let mut offsets: Vec<(Field, usize)> = vec!();
let mut offsets: Vec<(Field, usize)> = vec![];
let term_offsets_it = term_offsets
.iter()
.map(|&(ref key, _)| {
extract_field_from_term_bytes(&key)
})
.cloned()
.map(|(key, _)| Term::wrap(key).field())
.enumerate();
let mut prev_field = Field(u32::max_value());
for (offset, field) in term_offsets_it {
if field != prev_field {
@@ -111,24 +99,25 @@ impl<'a> MultiFieldPostingsWriter<'a> {
offsets.push((Field(0), term_offsets.len()));
for i in 0..(offsets.len() - 1) {
let (field, start) = offsets[i];
let (_, stop) = offsets[i+1];
let (_, stop) = offsets[i + 1];
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
let mut field_serializer = serializer.new_field(field)?;
postings_writer.serialize(
field,
&term_offsets[start..stop],
serializer,
self.heap)?;
&mut field_serializer,
self.heap,
)?;
field_serializer.close()?;
}
Ok(())
}
/// Return true iff the term dictionary is saturated.
pub fn is_termdic_saturated(&self) -> bool {
pub fn is_term_saturated(&self) -> bool {
self.term_index.is_saturated()
}
}
/// The `PostingsWriter` is in charge of receiving documents
/// and building a `Segment` in anonymous memory.
///
@@ -141,39 +130,41 @@ pub trait PostingsWriter {
/// * term - the term
/// * heap - heap used to store the postings information as well as the terms
/// in the hashmap.
fn suscribe(&mut self, term_index: &mut HashMap, doc: DocId, pos: u32, term: &Term, heap: &Heap);
fn suscribe(
&mut self,
term_index: &mut HashMap,
doc: DocId,
pos: u32,
term: &Term,
heap: &Heap,
);
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(&self, field: Field, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
fn serialize(
&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;
/// Tokenizes a text and subscribes all of its tokens.
fn index_text<'a>(&mut self,
term_index: &mut HashMap,
doc_id: DocId,
field: Field,
field_values: &[&'a FieldValue],
heap: &Heap)
-> u32 {
let mut pos = 0u32;
let mut num_tokens: u32 = 0u32;
fn index_text(
&mut self,
term_index: &mut HashMap,
doc_id: DocId,
field: Field,
token_stream: &mut TokenStream,
heap: &Heap,
) -> u32 {
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
for field_value in field_values {
let mut tokens = SimpleTokenizer.tokenize(field_value.value().text());
// right now num_tokens and pos are redundant, but it should
// change when we get proper analyzers
while let Some(token) = tokens.next() {
term.set_text(token);
self.suscribe(term_index, doc_id, pos, &term, heap);
pos += 1u32;
num_tokens += 1u32;
}
pos += 1;
// THIS is to avoid phrase query accross field repetition.
// span queries might still match though :|
}
num_tokens
let mut sink = |token: &Token| {
term.set_text(token.text.as_str());
self.suscribe(term_index, doc_id, token.position as u32, &term, heap);
};
token_stream.process(&mut sink)
}
}
@@ -184,25 +175,11 @@ pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
_recorder_type: PhantomData<Rec>,
}
/// Derives the `HashMap` size, expressed as a number of bits, from a `Heap`
/// capacity.
///
/// The target table size is `(heap_capacity / 100) * 2`. The result is the
/// smallest bit count in `10..32` whose power of two is strictly greater than
/// that target, or `32` if no such bit count exists.
fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
    let target_table_size = (heap_capacity / 100) * 2;
    (10usize..32)
        .find(|&num_bits| (1u32 << num_bits) > target_table_size)
        .unwrap_or(32)
}
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
/// constructor
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
SpecializedPostingsWriter {
heap: heap,
heap,
_recorder_type: PhantomData,
}
}
@@ -214,8 +191,15 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
}
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
fn suscribe(&mut self, term_index: &mut HashMap, doc: DocId, position: u32, term: &Term, heap: &Heap) {
fn suscribe(
&mut self,
term_index: &mut HashMap,
doc: DocId,
position: u32,
term: &Term,
heap: &Heap,
) {
debug_assert!(term.as_slice().len() >= 4);
let recorder: &mut Rec = term_index.get_or_create(term);
let current_doc = recorder.current_doc();
if current_doc != doc {
@@ -227,27 +211,18 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
recorder.record_position(position, heap);
}
fn serialize(&self,
field: Field,
fn serialize(
&self,
term_addrs: &[(&[u8], u32)],
serializer: &mut PostingsSerializer,
heap: &Heap) -> io::Result<()> {
serializer.new_field(field);
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
for &(term_bytes, addr) in term_addrs {
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
try!(serializer.new_term(&term_bytes));
try!(recorder.serialize(addr, serializer, heap));
try!(serializer.close_term());
}
serializer.new_term(term_bytes)?;
recorder.serialize(addr, serializer, heap)?;
serializer.close_term()?;
}
Ok(())
}
}
/// Checks `hashmap_size_in_bits` against a few known heap capacities.
#[test]
fn test_hashmap_size() {
    // (heap capacity, expected number of bits)
    let expectations: [(u32, usize); 4] =
        [(10, 10), (0, 10), (100_000, 11), (300_000_000, 23)];
    for &(heap_capacity, expected_bits) in &expectations {
        assert_eq!(hashmap_size_in_bits(heap_capacity), expected_bits);
    }
}

View File

@@ -1,10 +1,10 @@
use DocId;
use std::io;
use postings::PostingsSerializer;
use std::{self, io};
use postings::FieldSerializer;
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
const POSITION_END: u32 = 4294967295;
const POSITION_END: u32 = std::u32::MAX;
/// Recorder is in charge of recording relevant information about
/// the presence of a term in a document.
@@ -27,15 +27,15 @@ pub trait Recorder: HeapAllocable {
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &Heap);
/// Pushes the postings information to the serializer.
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()>;
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()>;
}
/// Only records the doc ids
#[repr(C, packed)]
pub struct NothingRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
@@ -64,21 +64,20 @@ impl Recorder for NothingRecorder {
fn close_doc(&mut self, _heap: &Heap) {}
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
for doc in self.stack.iter(self_addr, heap) {
try!(serializer.write_doc(doc, 0u32, &EMPTY_ARRAY));
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
}
Ok(())
}
}
/// Recorder encoding document ids, and term frequencies
#[repr(C, packed)]
pub struct TermFrequencyRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
@@ -96,7 +95,6 @@ impl HeapAllocable for TermFrequencyRecorder {
}
impl Recorder for TermFrequencyRecorder {
fn current_doc(&self) -> DocId {
self.current_doc
}
@@ -116,33 +114,29 @@ impl Recorder for TermFrequencyRecorder {
self.current_tf = 0;
}
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self.stack
.iter(self_addr, heap)
.chain(Some(self.current_tf).into_iter());
loop {
if let Some(doc) = doc_iter.next() {
if let Some(term_freq) = doc_iter.next() {
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
continue;
}
}
return Ok(());
while let Some(doc) = doc_iter.next() {
let term_freq = doc_iter
.next()
.expect("The IndexWriter recorded a doc without a term freq.");
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
}
Ok(())
}
}
/// Recorder encoding term frequencies as well as positions.
#[repr(C, packed)]
pub struct TFAndPositionRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
@@ -175,33 +169,26 @@ impl Recorder for TFAndPositionRecorder {
self.stack.push(POSITION_END, heap);
}
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
fn serialize(
&self,
self_addr: u32,
serializer: &mut FieldSerializer,
heap: &Heap,
) -> io::Result<()> {
let mut doc_positions = Vec::with_capacity(100);
let mut positions_iter = self.stack.iter(self_addr, heap);
while let Some(doc) = positions_iter.next() {
let mut prev_position = 0;
doc_positions.clear();
loop {
match positions_iter.next() {
Some(position) => {
if position == POSITION_END {
break;
} else {
doc_positions.push(position - prev_position);
prev_position = position;
}
}
None => {
// the last document has not been closed...
break;
}
for position in &mut positions_iter {
if position == POSITION_END {
break;
} else {
doc_positions.push(position - prev_position);
prev_position = position;
}
}
try!(serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions));
serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?;
}
Ok(())
}

View File

@@ -1,121 +1,545 @@
use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder};
use compression::{BlockDecoder, CompressedIntStream, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use DocId;
use postings::{Postings, FreqHandler, DocSet, HasLen};
use std::num::Wrapping;
use postings::{DocSet, HasLen, Postings, SkipResult};
use std::cmp;
use fst::Streamer;
use fastfield::DeleteBitSet;
use std::cell::UnsafeCell;
use directory::{ReadOnlySource, SourceRead};
const EMPTY_POSITIONS: [u32; 0] = [0u32; 0];
const EMPTY_DATA: [u8; 0] = [0u8; 0];
/// Lazily decodes the positions of the current document from a
/// `CompressedIntStream`.
struct PositionComputer {
// Number of ints that must be skipped in `positions_stream`
// before the positions of the current document can be read.
//
// `None` means the positions are already loaded in
// the `positions` vec.
position_to_skip: Option<usize>,
// Buffer holding the positions returned by `positions()`.
positions: Vec<u32>,
// Stream the position ints are skipped in / read from.
positions_stream: CompressedIntStream,
}
impl PositionComputer {
    /// Wraps `positions_stream`, starting with no pending skip and an empty
    /// positions buffer.
    pub fn new(positions_stream: CompressedIntStream) -> PositionComputer {
        PositionComputer {
            position_to_skip: None,
            positions: Vec::new(),
            positions_stream,
        }
    }

    /// Registers `num_skip` more ints to skip before the next read.
    ///
    /// If no skip was pending, the pending skip becomes `Some(0)` and
    /// `num_skip` itself is not counted; otherwise `num_skip` is added to the
    /// pending amount.
    pub fn add_skip(&mut self, num_skip: usize) {
        let pending_skip = match self.position_to_skip {
            Some(prev_skip) => prev_skip + num_skip,
            None => 0,
        };
        self.position_to_skip = Some(pending_skip);
    }

    /// Returns the first `term_freq` positions of the buffer.
    ///
    /// When a skip is pending, the stream is first skipped by that amount,
    /// `term_freq` ints are read from it, and they are replaced by their
    /// running sum (the stream values are treated as deltas). The pending
    /// skip is then cleared, so repeated calls reuse the buffer.
    pub fn positions(&mut self, term_freq: usize) -> &[u32] {
        if let Some(num_skip) = self.position_to_skip {
            self.positions.resize(term_freq, 0u32);
            self.positions_stream.skip(num_skip);
            self.positions_stream.read(&mut self.positions[..term_freq]);
            // Turn the deltas into cumulated values, in place.
            let mut running_total = 0u32;
            for position in &mut self.positions[..term_freq] {
                running_total += *position;
                *position = running_total;
            }
            self.position_to_skip = None;
        }
        &self.positions[..term_freq]
    }
}
/// `SegmentPostings` represents the inverted list or postings associated to
/// a term in a `Segment`.
///
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
/// Positions on the other hand, are optionally entirely decoded upfront.
pub struct SegmentPostings<'a> {
len: usize,
doc_offset: u32,
block_decoder: BlockDecoder,
freq_handler: FreqHandler,
remaining_data: &'a [u8],
cur: Wrapping<usize>,
pub struct SegmentPostings {
block_cursor: BlockSegmentPostings,
cur: usize,
delete_bitset: DeleteBitSet,
position_computer: Option<UnsafeCell<PositionComputer>>,
}
impl<'a> SegmentPostings<'a> {
fn load_next_block(&mut self) {
let num_remaining_docs = self.len - self.cur.0;
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
self.remaining_data = self.block_decoder
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
} else {
self.remaining_data = self.block_decoder
.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs);
}
}
impl SegmentPostings {
/// Reads a Segment postings from an &[u8]
///
/// * `len` - number of document in the posting lists.
/// * `data` - data array. The complete data is not necessarily used.
/// * `freq_handler` - the freq handler is in charge of decoding
/// frequencies and/or positions
pub fn from_data(len: u32,
data: &'a [u8],
delete_bitset: &'a DeleteBitSet,
freq_handler: FreqHandler) -> SegmentPostings<'a> {
pub fn from_block_postings(
segment_block_postings: BlockSegmentPostings,
delete_bitset: DeleteBitSet,
positions_stream_opt: Option<CompressedIntStream>,
) -> SegmentPostings {
let position_computer =
positions_stream_opt.map(|stream| UnsafeCell::new(PositionComputer::new(stream)));
SegmentPostings {
len: len as usize,
doc_offset: 0,
block_decoder: BlockDecoder::new(),
freq_handler: freq_handler,
remaining_data: data,
cur: Wrapping(usize::max_value()),
delete_bitset: delete_bitset.clone(),
block_cursor: segment_block_postings,
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
delete_bitset,
position_computer,
}
}
/// Returns an empty segment postings object
pub fn empty() -> SegmentPostings<'static> {
pub fn empty() -> SegmentPostings {
let empty_block_cursor = BlockSegmentPostings::empty();
SegmentPostings {
len: 0,
doc_offset: 0,
block_decoder: BlockDecoder::new(),
freq_handler: FreqHandler::new_without_freq(),
remaining_data: &EMPTY_DATA,
block_cursor: empty_block_cursor,
delete_bitset: DeleteBitSet::empty(),
cur: Wrapping(usize::max_value()),
cur: COMPRESSION_BLOCK_SIZE,
position_computer: None,
}
}
/// Index within a block is used as an address when
/// interacting with the `FreqHandler`
fn index_within_block(&self) -> usize {
self.cur.0 % NUM_DOCS_PER_BLOCK
fn position_add_skip<F: FnOnce() -> usize>(&self, num_skips_fn: F) {
if let Some(position_computer) = self.position_computer.as_ref() {
let num_skips = num_skips_fn();
unsafe {
(*position_computer.get()).add_skip(num_skips);
}
}
}
}
impl<'a> DocSet for SegmentPostings<'a> {
impl DocSet for SegmentPostings {
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
loop {
self.cur += Wrapping(1);
if self.cur.0 >= self.len {
return false;
}
if self.index_within_block() == 0 {
self.load_next_block();
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
}
self.position_add_skip(|| self.term_freq() as usize);
if !self.delete_bitset.is_deleted(self.doc()) {
return true;
}
}
}
/// Advances at least once, then moves the cursor forward to a doc that is
/// `>= target`.
///
/// Returns:
/// * `SkipResult::Reached` if the cursor lands on `target` itself,
/// * `SkipResult::OverStep` if it lands on a doc greater than `target`,
/// * `SkipResult::End` if the postings are exhausted first.
///
/// Along the way, the term frequencies of every doc that is crossed are
/// reported through `position_add_skip`.
fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() {
return SkipResult::End;
}
// in the following, thanks to the call to advance above,
// we know that the position is not loaded and we need
// to skip every doc_freq we cross.
// skip blocks until one that might contain the target
loop {
// check if we need to go to the next block
let (current_doc, last_doc_in_block) = {
let block_docs = self.block_cursor.docs();
(block_docs[self.cur], block_docs[block_docs.len() - 1])
};
if target > last_doc_in_block {
// we add skip for the current term independently,
// so that position_add_skip will decide if it should
// just set itself to Some(0) or effectively
// add the term freq.
self.position_add_skip(|| {
let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
let sum_freq: u32 = freqs_skipped.iter().cloned().sum();
sum_freq as usize
});
if !self.block_cursor.advance() {
return SkipResult::End;
}
self.cur = 0;
} else {
if target < current_doc {
// We've passed the target after the first `advance` call
// or we're at the beginning of a block.
// Either way, we're on the first `DocId` greater than `target`
// NOTE(review): this early return does not consult
// `delete_bitset`; confirm that is intended when `cur == 0`
// right after a block change.
return SkipResult::OverStep;
}
break;
}
}
{
// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
let block_len = block_docs.len();
debug_assert!(target >= block_docs[self.cur]);
debug_assert!(target <= block_docs[block_len - 1]);
let mut start = self.cur;
let mut end = block_len;
let mut count = 1;
// Double the stride while the probed doc is still below `target`.
loop {
let new = start + count;
if new < end && block_docs[new] < target {
start = new;
count *= 2;
} else {
break;
}
}
end = cmp::min(start + count, end);
// now do a binary search
// (lower bound of `target` within `block_docs[start..end]`)
let mut count = end - start;
while count > 0 {
let step = count / 2;
let mid = start + step;
let doc = block_docs[mid];
if doc < target {
start = mid + 1;
count -= step + 1;
} else {
count = step;
}
}
// `doc` is now >= `target`
let doc = block_docs[start];
// Account for the term freqs of the docs in `[cur, start)` that
// were jumped over within this block.
self.position_add_skip(|| {
let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
let sum_freqs: u32 = freqs_skipped.iter().sum();
sum_freqs as usize
});
self.cur = start;
if !self.delete_bitset.is_deleted(doc) {
if doc == target {
return SkipResult::Reached;
} else {
return SkipResult::OverStep;
}
}
}
// The doc we landed on is deleted: fall back to `advance()` to move on
// to the next non-deleted doc, if any.
if self.advance() {
SkipResult::OverStep
} else {
SkipResult::End
}
}
/// Size hint for this doc set; simply forwards to `self.len()`.
fn size_hint(&self) -> usize {
self.len()
}
/// Return the current document's `DocId`.
#[inline]
fn doc(&self) -> DocId {
self.block_decoder.output(self.index_within_block())
let docs = self.block_cursor.docs();
debug_assert!(
self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling .doc()."
);
docs[self.cur]
}
}
impl<'a> HasLen for SegmentPostings<'a> {
impl HasLen for SegmentPostings {
fn len(&self) -> usize {
self.len
self.block_cursor.doc_freq()
}
}
impl<'a> Postings for SegmentPostings<'a> {
impl Postings for SegmentPostings {
fn term_freq(&self) -> u32 {
self.freq_handler.freq(self.index_within_block())
self.block_cursor.freq(self.cur)
}
fn positions(&self) -> &[u32] {
self.freq_handler.positions(self.index_within_block())
let term_freq = self.term_freq();
self.position_computer
.as_ref()
.map(|position_computer| unsafe {
(&mut *position_computer.get()).positions(term_freq as usize)
})
.unwrap_or(&EMPTY_POSITIONS[..])
}
}
/// `BlockSegmentPostings` is a cursor iterating over blocks
/// of documents.
///
/// # Warning
///
/// While it is useful for some very specific high-performance
/// use cases, you should prefer using `SegmentPostings` for most usage.
pub struct BlockSegmentPostings {
// Decoder holding the doc ids of the current block (see `docs()`).
doc_decoder: BlockDecoder,
// Decoder holding the term frequencies of the current block (see `freqs()`).
freq_decoder: BlockDecoder,
// Whether term frequencies are decoded when advancing to a new block.
has_freq: bool,
// Total number of docs in the posting list (see `doc_freq()`).
doc_freq: usize,
// Offset passed to the sorted doc decoders; updated to the last doc of
// each bitpacked block once it has been decoded.
doc_offset: DocId,
// Number of full bitpacked blocks that have not been decoded yet.
num_bitpacked_blocks: usize,
// Number of docs of the trailing vint-encoded block; set to 0 once that
// block has been decoded.
num_vint_docs: usize,
// Postings data that has not been consumed yet.
remaining_data: SourceRead,
}
impl BlockSegmentPostings {
    /// Creates a block cursor over `data` for a posting list of `doc_freq`
    /// documents.
    ///
    /// * `doc_freq` - number of documents in the posting list.
    /// * `data` - postings data, positioned at the start of the list.
    /// * `has_freq` - whether term frequencies should be decoded along with
    ///   the doc ids.
    ///
    /// The list is split into `doc_freq / COMPRESSION_BLOCK_SIZE` bitpacked
    /// blocks followed by `doc_freq % COMPRESSION_BLOCK_SIZE` vint-encoded
    /// docs.
    pub(crate) fn from_data(
        doc_freq: usize,
        data: SourceRead,
        has_freq: bool,
    ) -> BlockSegmentPostings {
        let num_bitpacked_blocks = doc_freq / COMPRESSION_BLOCK_SIZE;
        let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
        BlockSegmentPostings {
            num_bitpacked_blocks,
            num_vint_docs,
            doc_decoder: BlockDecoder::new(),
            freq_decoder: BlockDecoder::with_val(1),
            has_freq,
            remaining_data: data,
            doc_offset: 0,
            doc_freq,
        }
    }

    /// Resets the block segment postings on another position
    /// in the postings file.
    ///
    /// This is useful for enumerating through a list of terms,
    /// and consuming the associated posting lists while avoiding
    /// reallocating a `BlockSegmentPostings`.
    ///
    /// # Warning
    ///
    /// This does not reset the positions list.
    pub(crate) fn reset(&mut self, doc_freq: usize, postings_data: SourceRead) {
        // Same split as in `from_data`. The remainder is used rather than a
        // bitmask so that both constructors agree whatever the block size is.
        self.num_bitpacked_blocks = doc_freq / COMPRESSION_BLOCK_SIZE;
        self.num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
        self.remaining_data = postings_data;
        self.doc_offset = 0;
        self.doc_freq = doc_freq;
    }

    /// Returns the document frequency associated to this block postings.
    ///
    /// This `doc_freq` is simply the sum of the length of all of the blocks
    /// length, and it does not take in account deleted documents.
    pub fn doc_freq(&self) -> usize {
        self.doc_freq
    }

    /// Returns the array of docs in the current block.
    ///
    /// Before the first call to `.advance()`, the block
    /// returned by `.docs()` is empty.
    #[inline]
    pub fn docs(&self) -> &[DocId] {
        self.doc_decoder.output_array()
    }

    /// Return the document at index `idx` of the block.
    #[inline]
    pub fn doc(&self, idx: usize) -> u32 {
        self.doc_decoder.output(idx)
    }

    /// Return the array of `term freq` in the block.
    #[inline]
    pub fn freqs(&self) -> &[u32] {
        self.freq_decoder.output_array()
    }

    /// Return the frequency at index `idx` of the block.
    #[inline]
    pub fn freq(&self, idx: usize) -> u32 {
        self.freq_decoder.output(idx)
    }

    /// Returns the length of the current block.
    ///
    /// All blocks have a length of `COMPRESSION_BLOCK_SIZE`,
    /// except the last block that may have a length
    /// of any number between 1 and `COMPRESSION_BLOCK_SIZE - 1`
    #[inline]
    fn block_len(&self) -> usize {
        self.doc_decoder.output_len
    }

    /// Advance to the next block.
    ///
    /// Bitpacked blocks are consumed first; the trailing vint-encoded block,
    /// if any, comes last.
    ///
    /// Returns false iff there was no remaining blocks.
    pub fn advance(&mut self) -> bool {
        if self.num_bitpacked_blocks > 0 {
            let num_consumed_bytes = self.doc_decoder
                .uncompress_block_sorted(self.remaining_data.as_ref(), self.doc_offset);
            self.remaining_data.advance(num_consumed_bytes);
            if self.has_freq {
                let num_consumed_bytes = self.freq_decoder
                    .uncompress_block_unsorted(self.remaining_data.as_ref());
                self.remaining_data.advance(num_consumed_bytes);
            }
            // it will be used as the next offset.
            self.doc_offset = self.doc_decoder.output(COMPRESSION_BLOCK_SIZE - 1);
            self.num_bitpacked_blocks -= 1;
            true
        } else if self.num_vint_docs > 0 {
            let num_compressed_bytes = self.doc_decoder.uncompress_vint_sorted(
                self.remaining_data.as_ref(),
                self.doc_offset,
                self.num_vint_docs,
            );
            self.remaining_data.advance(num_compressed_bytes);
            if self.has_freq {
                self.freq_decoder
                    .uncompress_vint_unsorted(self.remaining_data.as_ref(), self.num_vint_docs);
            }
            // The vint block is the last one: mark it as consumed.
            self.num_vint_docs = 0;
            true
        } else {
            false
        }
    }

    /// Returns an empty block segment postings object:
    /// no data, no blocks, and a `doc_freq` of 0.
    pub fn empty() -> BlockSegmentPostings {
        BlockSegmentPostings {
            num_bitpacked_blocks: 0,
            num_vint_docs: 0,
            doc_decoder: BlockDecoder::new(),
            freq_decoder: BlockDecoder::with_val(1),
            has_freq: false,
            remaining_data: From::from(ReadOnlySource::empty()),
            doc_offset: 0,
            doc_freq: 0,
        }
    }
}
impl<'b> Streamer<'b> for BlockSegmentPostings {
    type Item = &'b [DocId];

    /// Advances to the next block and yields its doc ids, or `None` once
    /// there is no block left.
    fn next(&'b mut self) -> Option<&'b [DocId]> {
        if !self.advance() {
            return None;
        }
        Some(self.docs())
    }
}
// Unit tests for `SegmentPostings` and `BlockSegmentPostings`.
#[cfg(test)]
mod tests {
use DocSet;
use super::SegmentPostings;
use schema::SchemaBuilder;
use core::Index;
use schema::INT_INDEXED;
use schema::Term;
use fst::Streamer;
use schema::IndexRecordOption;
use common::HasLen;
use super::BlockSegmentPostings;
// An empty `SegmentPostings` never advances (even when asked twice)
// and reports a length of 0.
#[test]
fn test_empty_segment_postings() {
let mut postings = SegmentPostings::empty();
assert!(!postings.advance());
assert!(!postings.advance());
assert_eq!(postings.len(), 0);
}
// An empty `BlockSegmentPostings` has no block and a `doc_freq` of 0.
#[test]
fn test_empty_block_segment_postings() {
let mut postings = BlockSegmentPostings::empty();
assert!(!postings.advance());
assert_eq!(postings.doc_freq(), 0);
}
// Indexes 100_000 documents sharing the same term and checks that
// streaming the blocks yields consecutive doc ids starting at 0.
#[test]
fn test_block_segment_postings() {
let mut schema_builder = SchemaBuilder::default();
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
for _ in 0..100_000 {
let doc = doc!(int_field=>0u64);
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field);
let term = Term::from_field_u64(int_field, 0u64);
let term_info = inverted_index.get_term_info(&term).unwrap();
let mut block_segments =
inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic);
// Doc id expected at the start of the next block.
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
// checking that the `doc_freq` is correct
assert_eq!(block_segments.doc_freq(), 100_000);
while let Some(block) = block_segments.next() {
for (i, doc) in block.iter().cloned().enumerate() {
assert_eq!(offset + (i as u32), doc);
}
offset += block.len() as u32;
}
}
// Checks that a `BlockSegmentPostings` can be re-pointed to the posting
// list of another term through `reset_block_postings_from_terminfo`.
#[test]
fn test_reset_block_segment_postings() {
let mut schema_builder = SchemaBuilder::default();
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
// create two posting lists, one containing the even doc ids,
// the other containing the odd doc ids.
for i in 0..6 {
let doc = doc!(int_field=> (i % 2) as u64);
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let mut block_segments;
{
// Posting list of the term `0`: docs 0, 2 and 4.
let term = Term::from_field_u64(int_field, 0u64);
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
block_segments = inverted_index
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic);
}
assert!(block_segments.advance());
assert_eq!(block_segments.docs(), &[0, 2, 4]);
{
// Reuse the same cursor for the term `1`: docs 1, 3 and 5.
let term = Term::from_field_u64(int_field, 1u64);
let inverted_index = segment_reader.inverted_index(int_field);
let term_info = inverted_index.get_term_info(&term).unwrap();
inverted_index.reset_block_postings_from_terminfo(&term_info, &mut block_segments);
}
assert!(block_segments.advance());
assert_eq!(block_segments.docs(), &[1, 3, 5]);
}
}

View File

@@ -1,17 +0,0 @@
/// Describes how much information must be decoded when reading postings.
///
/// Since decoding information is not free, this makes it possible to
/// avoid this extra cost when the information is not required.
/// For instance, positions are useful when running phrase queries
/// but useless in other queries.
#[derive(Clone, Copy, Debug)]
pub enum SegmentPostingsOption {
/// Only the doc ids are decoded.
NoFreq,
/// Doc ids and term frequencies are decoded.
Freq,
/// Doc ids, term frequencies and positions are decoded.
FreqAndPositions,
}

View File

@@ -1,22 +1,19 @@
use Result;
use datastruct::FstMapBuilder;
use termdict::TermDictionaryBuilderImpl;
use super::TermInfo;
use schema::Field;
use schema::FieldEntry;
use schema::FieldType;
use schema::Schema;
use schema::TextIndexingOptions;
use directory::WritePtr;
use compression::{NUM_DOCS_PER_BLOCK, BlockEncoder, CompositeEncoder};
use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
use DocId;
use core::Segment;
use std::io;
use core::SegmentComponent;
use std::io::Write;
use std::io::{self, Write};
use compression::VIntEncoder;
use common::VInt;
use common::BinarySerializable;
use common::CountingWriter;
use common::CompositeWrite;
use termdict::TermDictionaryBuilder;
/// `PostingsSerializer` is in charge of serializing
/// postings on disk, in the
@@ -49,86 +46,126 @@ use common::BinarySerializable;
///
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct PostingsSerializer {
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>,
postings_write: WritePtr,
positions_write: WritePtr,
written_bytes_postings: usize,
written_bytes_positions: usize,
last_doc_id_encoded: u32,
positions_encoder: CompositeEncoder,
block_encoder: BlockEncoder,
doc_ids: Vec<DocId>,
term_freqs: Vec<u32>,
position_deltas: Vec<u32>,
pub struct InvertedIndexSerializer {
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
schema: Schema,
text_indexing_options: TextIndexingOptions,
term_open: bool,
current_term_info: TermInfo,
}
impl PostingsSerializer {
impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn new(terms_write: WritePtr,
postings_write: WritePtr,
positions_write: WritePtr,
schema: Schema)
-> Result<PostingsSerializer> {
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
Ok(PostingsSerializer {
terms_fst_builder: terms_fst_builder,
postings_write: postings_write,
positions_write: positions_write,
written_bytes_postings: 0,
written_bytes_positions: 0,
last_doc_id_encoded: 0u32,
positions_encoder: CompositeEncoder::new(),
block_encoder: BlockEncoder::new(),
doc_ids: Vec::new(),
term_freqs: Vec::new(),
position_deltas: Vec::new(),
schema: schema,
text_indexing_options: TextIndexingOptions::Unindexed,
term_open: false,
current_term_info: TermInfo::default(),
/// Assembles an `InvertedIndexSerializer` from already-opened composite
/// writers (terms, postings, positions) and the index schema.
///
/// The body itself has no failing step; it always returns `Ok`.
fn new(
    terms_write: CompositeWrite<WritePtr>,
    postings_write: CompositeWrite<WritePtr>,
    positions_write: CompositeWrite<WritePtr>,
    schema: Schema,
) -> Result<InvertedIndexSerializer> {
    let serializer = InvertedIndexSerializer {
        terms_write,
        postings_write,
        positions_write,
        schema,
    };
    Ok(serializer)
}
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<PostingsSerializer> {
let terms_write = try!(segment.open_write(SegmentComponent::TERMS));
let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
PostingsSerializer::new(terms_write,
postings_write,
positions_write,
segment.schema())
/// Opens an `InvertedIndexSerializer` writing into the given segment.
///
/// The TERMS, POSTINGS and POSITIONS components are opened for writing, in
/// that order, and each one is wrapped in a `CompositeWrite`. The first
/// failure to open a component is returned to the caller.
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
    use SegmentComponent::{POSITIONS, POSTINGS, TERMS};
    let terms_write = CompositeWrite::wrap(segment.open_write(TERMS)?);
    let postings_write = CompositeWrite::wrap(segment.open_write(POSTINGS)?);
    let positions_write = CompositeWrite::wrap(segment.open_write(POSITIONS)?);
    let schema = segment.schema();
    InvertedIndexSerializer::new(terms_write, postings_write, positions_write, schema)
}
/// Must be called before starting pushing terms of
/// Must be called before starting pushing terms of
/// a given field.
///
/// Loads the indexing options for the given field.
pub fn new_field(&mut self, field: Field) {
pub fn new_field(&mut self, field: Field) -> io::Result<FieldSerializer> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
self.text_indexing_options = match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
FieldType::U64(ref int_options) => {
if int_options.is_indexed() {
TextIndexingOptions::Unindexed
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
let positions_write = self.positions_write.for_field(field);
FieldSerializer::new(
field_entry.field_type().clone(),
term_dictionary_write,
postings_write,
positions_write,
)
}
/// Closes the serializer.
///
/// Consumes `self` and closes the three composite writers in order:
/// terms, then postings, then positions. The first error encountered is
/// returned immediately, in which case the remaining writers are not
/// closed by this method.
pub fn close(self) -> io::Result<()> {
self.terms_write.close()?;
self.postings_write.close()?;
self.positions_write.close()?;
Ok(())
}
}
/// The field serializer is in charge of
/// the serialization of a specific field.
///
/// It borrows (for `'a`) the three per-field counting writers and drives a
/// term dictionary builder, a postings serializer and, optionally, a
/// positions serializer on top of them.
pub struct FieldSerializer<'a> {
// Receives each term key (`new_term`) and its `TermInfo` value (`close_term`).
term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter<WritePtr>>,
// Encodes doc ids and, when enabled, term frequencies.
postings_serializer: PostingsSerializer<&'a mut CountingWriter<WritePtr>>,
// `Some` only when positions are enabled for the field, `None` otherwise.
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
// `TermInfo` of the term currently being written; its `doc_freq` is
// incremented by every `write_doc` call.
current_term_info: TermInfo,
// True between `new_term` and the matching `close_term`.
term_open: bool,
}
impl<'a> FieldSerializer<'a> {
fn new(
field_type: FieldType,
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>,
) -> io::Result<FieldSerializer<'a>> {
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
FieldType::Str(ref text_options) => {
if let Some(text_indexing_options) = text_options.get_indexing_options() {
let index_option = text_indexing_options.index_option();
(
index_option.is_termfreq_enabled(),
index_option.is_position_enabled(),
)
} else {
TextIndexingOptions::Untokenized
(false, false)
}
}
FieldType::I64(ref int_options) => {
if int_options.is_indexed() {
TextIndexingOptions::Unindexed
} else {
TextIndexingOptions::Untokenized
}
}
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
let positions_serializer_opt = if position_enabled {
Some(PositionSerializer::new(positions_write))
} else {
None
};
Ok(FieldSerializer {
term_dictionary_builder,
postings_serializer,
positions_serializer_opt,
current_term_info: TermInfo::default(),
term_open: false,
})
}
fn current_term_info(&self) -> TermInfo {
let (filepos, offset) = self.positions_serializer_opt
.as_ref()
.map(|positions_serializer| positions_serializer.addr())
.unwrap_or((0u64, 0u8));
TermInfo {
doc_freq: 0,
postings_offset: self.postings_serializer.addr(),
positions_offset: filepos,
positions_inner_offset: offset,
}
}
/// Starts the postings for a new term.
@@ -136,73 +173,16 @@ impl PostingsSerializer {
/// to the lexicographical order.
/// * doc_freq - return the number of document containing the term.
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
if self.term_open {
panic!("Called new_term, while the previous term was not closed.");
}
assert!(
!self.term_open,
"Called new_term, while the previous term was not closed."
);
self.term_open = true;
self.doc_ids.clear();
self.last_doc_id_encoded = 0;
self.term_freqs.clear();
self.position_deltas.clear();
self.current_term_info = TermInfo {
doc_freq: 0,
postings_offset: self.written_bytes_postings as u32,
positions_offset: self.written_bytes_positions as u32,
};
self.terms_fst_builder.insert_key(term)
self.postings_serializer.clear();
self.current_term_info = self.current_term_info();
self.term_dictionary_builder.insert_key(term)
}
/// Finish the serialization for this term postings.
///
/// If the current block is incomplete, it need to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
if self.term_open {
self.terms_fst_builder.insert_value(&self.current_term_info)?;
if !self.doc_ids.is_empty() {
// we have doc ids waiting to be written
// this happens when the number of doc ids is
// not a perfect multiple of our block size.
//
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self.block_encoder
.compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.written_bytes_postings += block_encoded.len();
try!(self.postings_write.write_all(block_encoded));
self.doc_ids.clear();
}
// ... Idem for term frequencies
if self.text_indexing_options.is_termfreq_enabled() {
let block_encoded = self.block_encoder
.compress_vint_unsorted(&self.term_freqs[..]);
for num in block_encoded {
self.written_bytes_postings +=
try!(num.serialize(&mut self.postings_write));
}
self.term_freqs.clear();
}
}
// On the other hand, positions are entirely buffered until the
// end of the term, at which point they are compressed and written.
if self.text_indexing_options.is_position_enabled() {
self.written_bytes_positions += try!(VInt(self.position_deltas.len() as u64)
.serialize(&mut self.positions_write));
let positions_encoded: &[u8] = self.positions_encoder
.compress_unsorted(&self.position_deltas[..]);
try!(self.positions_write.write_all(positions_encoded));
self.written_bytes_positions += positions_encoded.len();
self.position_deltas.clear();
}
self.term_open = false;
}
Ok(())
}
/// Serialize the information that a document contains the current term,
/// its term frequency, and the position deltas.
///
@@ -212,34 +192,89 @@ impl PostingsSerializer {
///
/// Term frequencies and positions may be ignored by the serializer depending
/// on the configuration of the field in the `Schema`.
pub fn write_doc(&mut self,
doc_id: DocId,
term_freq: u32,
position_deltas: &[u32])
-> io::Result<()> {
pub fn write_doc(
&mut self,
doc_id: DocId,
term_freq: u32,
position_deltas: &[u32],
) -> io::Result<()> {
self.current_term_info.doc_freq += 1;
self.postings_serializer.write_doc(doc_id, term_freq)?;
if let Some(ref mut positions_serializer) = self.positions_serializer_opt.as_mut() {
positions_serializer.write(position_deltas)?;
}
Ok(())
}
/// Finishes the serialization of the postings of the current term.
///
/// Does nothing when no term is open. Otherwise the accumulated
/// `TermInfo` is stored as the value of the term in the term dictionary,
/// and the postings serializer is asked to flush what it still buffers
/// for this term (an incomplete block is encoded using `VInt` encoding).
pub fn close_term(&mut self) -> io::Result<()> {
    if !self.term_open {
        return Ok(());
    }
    self.term_dictionary_builder
        .insert_value(&self.current_term_info)?;
    self.postings_serializer.close_term()?;
    // Only reached when both steps above succeeded.
    self.term_open = false;
    Ok(())
}
/// Closes the current field.
///
/// Consumes the serializer. Any term still open is closed first, then the
/// positions serializer (when present), the postings serializer and finally
/// the term dictionary builder are closed in that order. The first error
/// encountered is returned and the remaining steps are skipped.
pub fn close(mut self) -> io::Result<()> {
self.close_term()?;
if let Some(positions_serializer) = self.positions_serializer_opt {
positions_serializer.close()?;
}
self.postings_serializer.close()?;
self.term_dictionary_builder.finish()?;
Ok(())
}
}
// Serializes the doc ids (and optionally the term frequencies) of one term
// at a time into `postings_write`.
struct PostingsSerializer<W: Write> {
// Destination of the encoded postings; also used to compute `addr()`.
postings_write: CountingWriter<W>,
// Last doc id of the most recently written full block; passed to the
// encoder as the base for the next sorted block. Reset to 0 by `clear()`.
last_doc_id_encoded: u32,
block_encoder: BlockEncoder,
// Doc ids received through `write_doc` and not yet written out.
doc_ids: Vec<DocId>,
// Term frequencies not yet written out; only filled when `termfreq_enabled`.
term_freqs: Vec<u32>,
// Whether term frequencies are recorded and serialized at all.
termfreq_enabled: bool,
}
impl<W: Write> PostingsSerializer<W> {
/// Creates a serializer writing into `write`, with empty buffers.
///
/// `termfreq_enabled` controls whether term frequencies are recorded.
fn new(write: W, termfreq_enabled: bool) -> PostingsSerializer<W> {
    let postings_write = CountingWriter::wrap(write);
    let block_encoder = BlockEncoder::new();
    PostingsSerializer {
        postings_write,
        last_doc_id_encoded: 0u32,
        block_encoder,
        doc_ids: Vec::new(),
        term_freqs: Vec::new(),
        termfreq_enabled,
    }
}
fn write_doc(&mut self, doc_id: DocId, term_freq: u32) -> io::Result<()> {
self.doc_ids.push(doc_id);
if self.text_indexing_options.is_termfreq_enabled() {
if self.termfreq_enabled {
self.term_freqs.push(term_freq as u32);
}
if self.text_indexing_options.is_position_enabled() {
self.position_deltas.extend_from_slice(position_deltas);
}
if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
if self.doc_ids.len() == COMPRESSION_BLOCK_SIZE {
{
// encode the doc ids
let block_encoded: &[u8] = self.block_encoder
.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
self.last_doc_id_encoded = self.doc_ids[self.doc_ids.len() - 1];
try!(self.postings_write.write_all(block_encoded));
self.written_bytes_postings += block_encoded.len();
self.postings_write.write_all(block_encoded)?;
}
if self.text_indexing_options.is_termfreq_enabled() {
if self.termfreq_enabled {
// encode the term_freqs
let block_encoded: &[u8] = self.block_encoder
.compress_block_unsorted(&self.term_freqs);
try!(self.postings_write.write_all(block_encoded));
self.written_bytes_postings += block_encoded.len();
let block_encoded: &[u8] =
self.block_encoder.compress_block_unsorted(&self.term_freqs);
self.postings_write.write_all(block_encoded)?;
self.term_freqs.clear();
}
self.doc_ids.clear();
@@ -247,12 +282,89 @@ impl PostingsSerializer {
Ok(())
}
/// Closes the serializer.
pub fn close(mut self) -> io::Result<()> {
try!(self.close_term());
try!(self.terms_fst_builder.finish());
try!(self.postings_write.flush());
try!(self.positions_write.flush());
/// Writes out whatever is still buffered for the current term.
///
/// Doc ids are buffered until a full block is available. When the number
/// of doc ids of a term is not a perfect multiple of the block size, the
/// remainder is still sitting in `doc_ids` at this point and is encoded
/// using variable int encoding instead. Nothing is written when the buffer
/// is empty.
fn close_term(&mut self) -> io::Result<()> {
    if self.doc_ids.is_empty() {
        return Ok(());
    }
    {
        // Remaining doc ids, relative to the last doc id of the previous block.
        let doc_ids_encoded = self.block_encoder
            .compress_vint_sorted(&self.doc_ids, self.last_doc_id_encoded);
        self.postings_write.write_all(doc_ids_encoded)?;
    }
    self.doc_ids.clear();
    if self.termfreq_enabled {
        {
            // ... and the matching term frequencies.
            let term_freqs_encoded = self.block_encoder
                .compress_vint_unsorted(&self.term_freqs[..]);
            self.postings_write.write_all(term_freqs_encoded)?;
        }
        self.term_freqs.clear();
    }
    Ok(())
}
/// Consumes the serializer and flushes the underlying writer.
///
/// This only flushes: doc ids still buffered are not written here, so
/// `close_term` must have been called beforehand.
fn close(mut self) -> io::Result<()> {
self.postings_write.flush()
}
/// Returns the current address in the postings output, as reported by the
/// counting writer (presumably the number of bytes written through it so
/// far; confirm against `CountingWriter::written_bytes`).
fn addr(&self) -> u64 {
self.postings_write.written_bytes() as u64
}
/// Resets the per-term state: the delta base goes back to 0 and both
/// pending buffers are emptied. The writer itself is left untouched.
fn clear(&mut self) {
    self.last_doc_id_encoded = 0;
    self.term_freqs.clear();
    self.doc_ids.clear();
}
}
/// Serializes position values as a sequence of fixed-size compressed blocks.
///
/// Incoming values are accumulated in `buffer`. Every time the buffer holds
/// exactly `COMPRESSION_BLOCK_SIZE` values, it is compressed, written out
/// and emptied.
struct PositionSerializer<W: Write> {
    buffer: Vec<u32>,
    write: CountingWriter<W>, // See if we can offset the original counting writer.
    block_encoder: BlockEncoder,
}

impl<W: Write> PositionSerializer<W> {
    /// Creates a serializer writing into `write`, with an empty buffer
    /// pre-allocated for one full block.
    fn new(write: W) -> PositionSerializer<W> {
        let buffer = Vec::with_capacity(COMPRESSION_BLOCK_SIZE);
        let write = CountingWriter::wrap(write);
        let block_encoder = BlockEncoder::new();
        PositionSerializer {
            buffer,
            write,
            block_encoder,
        }
    }

    /// Returns the address of the next value to be written, as a pair:
    /// the writer's `written_bytes()` count, and the number of values
    /// currently pending in the buffer (the offset within the block).
    fn addr(&self) -> (u64, u8) {
        let file_offset = self.write.written_bytes() as u64;
        let inner_offset = self.buffer.len() as u8;
        (file_offset, inner_offset)
    }

    /// Compresses and writes the buffer, then empties it.
    ///
    /// # Panics
    /// Panics if the buffer does not hold exactly one full block.
    fn write_block(&mut self) -> io::Result<()> {
        assert_eq!(self.buffer.len(), COMPRESSION_BLOCK_SIZE);
        {
            let compressed: &[u8] = self.block_encoder.compress_block_unsorted(&self.buffer);
            self.write.write_all(compressed)?;
        }
        self.buffer.clear();
        Ok(())
    }

    /// Appends `vals`, emitting a block each time the buffer fills up.
    /// Values that do not complete a block stay in the buffer.
    fn write(&mut self, mut vals: &[u32]) -> io::Result<()> {
        while self.buffer.len() + vals.len() >= COMPRESSION_BLOCK_SIZE {
            // Take exactly what is missing to complete the current block.
            let missing = COMPRESSION_BLOCK_SIZE - self.buffer.len();
            let (head, tail) = vals.split_at(missing);
            self.buffer.extend_from_slice(head);
            self.write_block()?;
            vals = tail;
        }
        self.buffer.extend_from_slice(vals);
        Ok(())
    }

    /// Pads the buffer with zeros up to a full block, writes that final
    /// block (even when nothing was pending) and flushes the writer.
    fn close(mut self) -> io::Result<()> {
        self.buffer.resize(COMPRESSION_BLOCK_SIZE, 0u32);
        self.write_block()?;
        self.write.flush()
    }
}

View File

@@ -1,44 +1,46 @@
use common::BinarySerializable;
use std::io;
/// `TermInfo` contains all of the information
/// `TermInfo` contains all of the information
/// associated to terms in the `.term` file.
///
///
/// It consists of
/// * `doc_freq` : the number of document in the segment
/// containing this term. It is also the length of the
/// posting list associated to this term
/// * `postings_offset` : an offset in the `.idx` file
/// * `postings_offset` : an offset in the `.idx` file
/// addressing the start of the posting list associated
/// to this term.
#[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)]
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
pub struct TermInfo {
/// Number of documents in the segment containing the term
pub doc_freq: u32,
/// Offset within the postings (`.idx`) file.
pub postings_offset: u32,
/// Offset within the postings (`.idx`) file.
pub postings_offset: u64,
/// Offset within the position (`.pos`) file.
pub positions_offset: u32,
pub positions_offset: u64,
/// Offset within the position block.
pub positions_inner_offset: u8,
}
impl BinarySerializable for TermInfo {
fn serialize(&self, writer: &mut io::Write) -> io::Result<usize> {
Ok(
try!(self.doc_freq.serialize(writer)) +
try!(self.postings_offset.serialize(writer)) +
try!(self.positions_offset.serialize(writer))
)
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
self.doc_freq.serialize(writer)?;
self.postings_offset.serialize(writer)?;
self.positions_offset.serialize(writer)?;
self.positions_inner_offset.serialize(writer)
}
fn deserialize(reader: &mut io::Read) -> io::Result<Self> {
let doc_freq = try!(u32::deserialize(reader));
let postings_offset = try!(u32::deserialize(reader));
let positions_offset = try!(u32::deserialize(reader));
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let doc_freq = u32::deserialize(reader)?;
let postings_offset = u64::deserialize(reader)?;
let positions_offset = u64::deserialize(reader)?;
let positions_inner_offset = u8::deserialize(reader)?;
Ok(TermInfo {
doc_freq: doc_freq,
postings_offset: postings_offset,
positions_offset: positions_offset,
doc_freq,
postings_offset,
positions_offset,
positions_inner_offset,
})
}
}

View File

@@ -1,7 +1,7 @@
#![allow(dead_code)]
use DocId;
use postings::{Postings, DocSet, HasLen};
use postings::{DocSet, HasLen, Postings};
use std::num::Wrapping;
const EMPTY_ARRAY: [u32; 0] = [];
@@ -19,7 +19,7 @@ pub struct VecPostings {
impl From<Vec<DocId>> for VecPostings {
fn from(doc_ids: Vec<DocId>) -> VecPostings {
VecPostings {
doc_ids: doc_ids,
doc_ids,
cursor: Wrapping(usize::max_value()),
}
}
@@ -34,6 +34,10 @@ impl DocSet for VecPostings {
// Returns the doc id under the cursor.
//
// `doc_ids` is indexed directly, so this panics when the cursor is out of
// bounds. The cursor is initialised to `usize::max_value()` in `from`, so
// that is the case until the cursor has been moved onto a valid entry.
fn doc(&self) -> DocId {
self.doc_ids[self.cursor.0]
}
// Size hint for this doc set: simply forwards to `len()`.
fn size_hint(&self) -> usize {
self.len()
}
}
impl HasLen for VecPostings {
@@ -57,8 +61,7 @@ pub mod tests {
use super::*;
use DocId;
use postings::{Postings, SkipResult, DocSet};
use postings::{DocSet, Postings, SkipResult};
#[test]
pub fn test_vec_postings() {

View File

@@ -6,7 +6,7 @@ use Searcher;
use query::Query;
use schema::Term;
use query::TermQuery;
use postings::SegmentPostingsOption;
use schema::IndexRecordOption;
use query::Occur;
use query::OccurFilter;
@@ -27,7 +27,7 @@ pub struct BooleanQuery {
impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
fn from(subqueries: Vec<(Occur, Box<Query>)>) -> BooleanQuery {
BooleanQuery { subqueries: subqueries }
BooleanQuery { subqueries }
}
}
@@ -37,10 +37,10 @@ impl Query for BooleanQuery {
}
fn weight(&self, searcher: &Searcher) -> Result<Box<Weight>> {
let sub_weights = try!(self.subqueries
let sub_weights = self.subqueries
.iter()
.map(|&(ref _occur, ref subquery)| subquery.weight(searcher))
.collect());
.collect::<Result<_>>()?;
let occurs: Vec<Occur> = self.subqueries
.iter()
.map(|&(ref occur, ref _subquery)| *occur)
@@ -54,12 +54,13 @@ impl BooleanQuery {
/// Helper method to create a boolean query matching a given list of terms.
/// The resulting query is a disjunction of the terms.
pub fn new_multiterms_query(terms: Vec<Term>) -> BooleanQuery {
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms.into_iter()
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms
.into_iter()
.map(|term| {
let term_query: Box<Query> = box TermQuery::new(term, SegmentPostingsOption::Freq);
let term_query: Box<Query> = box TermQuery::new(term, IndexRecordOption::WithFreqs);
(Occur::Should, term_query)
})
.collect();
BooleanQuery::from(occur_term_queries)
}
}
}

View File

@@ -6,11 +6,10 @@ use postings::DocSet;
use query::OccurFilter;
use query::boolean_query::ScoreCombiner;
/// Each `HeapItem` represents the head of
/// one of scorer being merged.
///
/// * `doc` - is the current doc id for the given segment postings
/// * `doc` - is the current doc id for the given segment postings
/// * `ord` - is the ordinal used to identify to which segment postings
/// this heap item belong to.
#[derive(Eq, PartialEq)]
@@ -27,8 +26,8 @@ impl PartialOrd for HeapItem {
}
impl Ord for HeapItem {
fn cmp(&self, other:&Self) -> Ordering {
(other.doc).cmp(&self.doc)
fn cmp(&self, other: &Self) -> Ordering {
(other.doc).cmp(&self.doc)
}
}
@@ -41,9 +40,7 @@ pub struct BooleanScorer<TScorer: Scorer> {
}
impl<TScorer: Scorer> BooleanScorer<TScorer> {
pub fn new(scorers: Vec<TScorer>,
occur_filter: OccurFilter) -> BooleanScorer<TScorer> {
pub fn new(scorers: Vec<TScorer>, occur_filter: OccurFilter) -> BooleanScorer<TScorer> {
let score_combiner = ScoreCombiner::default_for_num_scorers(scorers.len());
let mut non_empty_scorers: Vec<TScorer> = Vec::new();
for mut posting in scorers {
@@ -56,33 +53,30 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
.iter()
.map(|posting| posting.doc())
.enumerate()
.map(|(ord, doc)| {
HeapItem {
doc: doc,
ord: ord as u32
}
.map(|(ord, doc)| HeapItem {
doc,
ord: ord as u32,
})
.collect();
BooleanScorer {
scorers: non_empty_scorers,
queue: BinaryHeap::from(heap_items),
doc: 0u32,
score_combiner: score_combiner,
occur_filter: occur_filter,
score_combiner,
occur_filter,
}
}
/// Advances the head of our heap (the segment posting with the lowest doc)
/// It will also update the new current `DocId` as well as the term frequency
/// associated with the segment postings.
///
///
/// After advancing the `SegmentPosting`, the postings is removed from the heap
/// if it has been entirely consumed, or pushed back into the heap.
///
///
/// # Panics
/// This method will panic if the head `SegmentPostings` is not empty.
fn advance_head(&mut self,) {
fn advance_head(&mut self) {
{
let mut mutable_head = self.queue.peek_mut().unwrap();
let cur_scorers = &mut self.scorers[mutable_head.ord as usize];
@@ -96,7 +90,18 @@ impl<TScorer: Scorer> BooleanScorer<TScorer> {
}
impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
fn advance(&mut self,) -> bool {
/// Returns an estimate of the number of documents this scorer may yield.
///
/// Currently this is the largest `size_hint` among the sub-scorers.
/// When there is no sub-scorer at all (the constructor only keeps the
/// non-empty ones), it returns `0` instead of panicking.
fn size_hint(&self) -> usize {
    // TODO fix this. it should be the min
    // of the MUST scorer
    // and the max of the SHOULD scorers.
    self.scorers
        .iter()
        .map(|scorer| scorer.size_hint())
        .max()
        .unwrap_or(0)
}
fn advance(&mut self) -> bool {
loop {
self.score_combiner.clear();
let mut ord_bitset = 0u64;
@@ -106,40 +111,37 @@ impl<TScorer: Scorer> DocSet for BooleanScorer<TScorer> {
self.doc = heap_item.doc;
let score = self.scorers[ord].score();
self.score_combiner.update(score);
ord_bitset |= 1 << ord;
ord_bitset |= 1 << ord;
}
None => {
return false;
}
}
self.advance_head();
while let Some(&HeapItem {doc, ord}) = self.queue.peek() {
while let Some(&HeapItem { doc, ord }) = self.queue.peek() {
if doc == self.doc {
let ord = ord as usize;
let score = self.scorers[ord].score();
self.score_combiner.update(score);
ord_bitset |= 1 << ord;
}
else {
} else {
break;
}
self.advance_head();
}
}
if self.occur_filter.accept(ord_bitset) {
return true;
}
}
}
fn doc(&self,) -> DocId {
}
fn doc(&self) -> DocId {
self.doc
}
}
impl<TScorer: Scorer> Scorer for BooleanScorer<TScorer> {
fn score(&self,) -> f32 {
fn score(&self) -> f32 {
self.score_combiner.score()
}
}

View File

@@ -13,19 +13,18 @@ pub struct BooleanWeight {
impl BooleanWeight {
pub fn new(weights: Vec<Box<Weight>>, occur_filter: OccurFilter) -> BooleanWeight {
BooleanWeight {
weights: weights,
occur_filter: occur_filter,
weights,
occur_filter,
}
}
}
impl Weight for BooleanWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let sub_scorers: Vec<Box<Scorer + 'a>> = try!(self.weights
let sub_scorers: Vec<Box<Scorer + 'a>> = self.weights
.iter()
.map(|weight| weight.scorer(reader))
.collect());
.collect::<Result<_>>()?;
let boolean_scorer = BooleanScorer::new(sub_scorers, self.occur_filter);
Ok(box boolean_scorer)
}

View File

@@ -7,11 +7,9 @@ pub use self::boolean_query::BooleanQuery;
pub use self::boolean_scorer::BooleanScorer;
pub use self::score_combiner::ScoreCombiner;
#[cfg(test)]
mod tests {
use super::*;
use postings::{DocSet, VecPostings};
use query::Scorer;
@@ -23,13 +21,12 @@ mod tests {
use collector::tests::TestCollector;
use Index;
use schema::*;
use fastfield::{U64FastFieldReader};
use postings::SegmentPostingsOption;
use fastfield::U64FastFieldReader;
use schema::IndexRecordOption;
fn abs_diff(left: f32, right: f32) -> f32 {
(right - left).abs()
}
}
#[test]
pub fn test_boolean_query() {
@@ -64,7 +61,10 @@ mod tests {
}
let make_term_query = |text: &str| {
let term_query = TermQuery::new(Term::from_field_text(text_field, text), SegmentPostingsOption::NoFreq);
let term_query = TermQuery::new(
Term::from_field_text(text_field, text),
IndexRecordOption::Basic,
);
let query: Box<Query> = box term_query;
query
};
@@ -78,68 +78,75 @@ mod tests {
test_collector.docs()
};
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")) ]);
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3));
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a"))]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")) ]);
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3));
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a"))]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Should, make_term_query("a")), (Occur::Should, make_term_query("b"))]);
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 2, 3));
let boolean_query = BooleanQuery::from(vec![
(Occur::Should, make_term_query("a")),
(Occur::Should, make_term_query("b")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 2, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), (Occur::Should, make_term_query("b"))]);
assert_eq!(matching_docs(&boolean_query), vec!(0, 1, 3));
let boolean_query = BooleanQuery::from(vec![
(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1, 3]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
(Occur::MustNot, make_term_query("d")),
]);
assert_eq!(matching_docs(&boolean_query), vec!(0, 1));
let boolean_query = BooleanQuery::from(vec![
(Occur::Must, make_term_query("a")),
(Occur::Should, make_term_query("b")),
(Occur::MustNot, make_term_query("d")),
]);
assert_eq!(matching_docs(&boolean_query), vec![0, 1]);
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d")),]);
let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d"))]);
assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new());
}
}
#[test]
pub fn test_boolean_scorer() {
let occurs = vec!(Occur::Should, Occur::Should);
let occurs = vec![Occur::Should, Occur::Should];
let occur_filter = OccurFilter::new(&occurs);
let left_fieldnorms = U64FastFieldReader::from(vec!(100,200,300));
let left = VecPostings::from(vec!(1, 2, 3));
let left_fieldnorms =
U64FastFieldReader::from((0u64..9u64).map(|doc| doc * 3).collect::<Vec<u64>>());
let left = VecPostings::from(vec![1, 2, 3]);
let left_scorer = TermScorer {
idf: 1f32,
fieldnorm_reader_opt: Some(left_fieldnorms),
postings: left,
};
let right_fieldnorms = U64FastFieldReader::from(vec!(15,25,35));
let right = VecPostings::from(vec!(1, 3, 8));
let right_fieldnorms =
U64FastFieldReader::from((0u64..9u64).map(|doc| doc * 5).collect::<Vec<u64>>());
let right = VecPostings::from(vec![1, 3, 8]);
let right_scorer = TermScorer {
idf: 4f32,
fieldnorm_reader_opt: Some(right_fieldnorms),
postings: right,
};
let mut boolean_scorer = BooleanScorer::new(vec!(left_scorer, right_scorer), occur_filter);
let mut boolean_scorer = BooleanScorer::new(vec![left_scorer, right_scorer], occur_filter);
assert_eq!(boolean_scorer.next(), Some(1u32));
assert!(abs_diff(boolean_scorer.score(), 0.8707107) < 0.001);
assert!(abs_diff(boolean_scorer.score(), 2.3662047) < 0.001);
assert_eq!(boolean_scorer.next(), Some(2u32));
assert!(abs_diff(boolean_scorer.score(), 0.028867513) < 0.001f32);
assert!(abs_diff(boolean_scorer.score(), 0.20412415) < 0.001f32);
assert_eq!(boolean_scorer.next(), Some(3u32));
assert_eq!(boolean_scorer.next(), Some(8u32));
assert!(abs_diff(boolean_scorer.score(), 0.5163978) < 0.001f32);
assert!(abs_diff(boolean_scorer.score(), 0.31622776) < 0.001f32);
assert!(!boolean_scorer.advance());
}
}

Some files were not shown because too many files have changed in this diff Show More