Compare commits


558 Commits
0.2.0 ... wasm

Author SHA1 Message Date
Paul Masurel
232ca5c06c Added convert to static [u8] 2018-04-10 21:18:32 +09:00
Paul Masurel
743ae102f1 Using bitpacker@3 2018-04-10 10:05:42 +09:00
Paul Masurel
e78af20375 remove comment 2018-04-09 21:51:17 +09:00
Paul Masurel
30637f7a7f Ok on wasm 2018-03-31 17:42:26 +09:00
Paul Masurel
0107fe886b Removed timer 2018-03-31 15:40:16 +09:00
Paul Masurel
1d9566e73c Making mmap a feature 2018-03-31 13:23:43 +09:00
Paul Masurel
8006f1df11 Added comments 2018-03-28 08:28:49 +09:00
Paul Masurel
ffa03bad71 TermScorer does not handle deletes 2018-03-27 17:35:20 +09:00
Paul Masurel
98cf4ba63a Small refactor of postings's skip method 2018-03-27 16:14:28 +09:00
Paul Masurel
4d65771e04 field norm reader is not an option anymore. 2018-03-26 13:25:29 +09:00
Paul Masurel
9712a75399 Added unit test for intersection score 2018-03-25 12:58:24 +09:00
Paul Masurel
3ae03b91ae PhraseScorer's score aligned with that of Lucene. 2018-03-25 12:44:16 +09:00
Paul Masurel
238b02ce7d Bugfixed 2018-03-23 18:50:57 +09:00
Paul Masurel
3091459777 Fixed main bug. Unit test still not passing because of altered scoring 2018-03-23 13:52:10 +09:00
Paul Masurel
b7f8884246 Closes #245 = BM25. (#260)
* Closes #245 = BM25.

Scores are the same as Lucene.

* Fixing travis conf
2018-03-22 15:06:56 +09:00
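For reference, the BM25 scoring function this commit aligns tantivy with, in its standard Lucene formulation (stated here for context; not taken from the diff itself):

$$\text{score}(t,d) = \ln\!\Big(1 + \frac{N - n_t + 0.5}{n_t + 0.5}\Big) \cdot \frac{f_{t,d}\,(k_1+1)}{f_{t,d} + k_1\big(1 - b + b\,\frac{|d|}{\text{avgdl}}\big)}$$

where $f_{t,d}$ is the term frequency in the document, $|d|$ the field length, avgdl the average field length, $N$ the document count, and $n_t$ the document frequency of term $t$; Lucene's default parameters are $k_1 = 1.2$ and $b = 0.75$.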
Paul Masurel
e22f767fda Backmerge 2018-03-21 21:18:46 +09:00
Paul Masurel
3ecfc36e53 Total field norm fixed. 2018-03-21 20:43:02 +09:00
Paul Masurel
1c9450174e Fieldnorm reader working except merge 2018-03-21 17:36:16 +09:00
Paul Masurel
cde4c391cd Added fieldnorm module 2018-03-21 15:41:46 +09:00
Paul Masurel
6d47634616 Added unit tests 2018-03-20 12:11:28 +09:00
Paul Masurel
39b182c24b Simplified phrase queries. Reading several time is ok. 2018-03-20 11:47:48 +09:00
Paul Masurel
baaae3f4ec Making it possible to read positions twice 2018-03-20 11:36:22 +09:00
Paul Masurel
63064601a7 Readded test for reading positions twice 2018-03-20 10:04:36 +09:00
Paul Masurel
07a8023a3a Added 2018-03-19 14:36:43 +09:00
Paul Masurel
59639cd311 In sync with master. Fixed merging 2018-03-19 12:58:42 +09:00
Paul Masurel
b0e5e1f61d Back merged master 2018-03-19 12:19:08 +09:00
Paul Masurel
234a902470 Removed cc from Cargo.toml 2018-03-19 12:09:25 +09:00
Paul Masurel
75d130f1ce Edited CHANGELOG 2018-03-19 12:01:48 +09:00
Paul Masurel
410187dd24 Removed .vimrc 2018-03-19 11:54:10 +09:00
Paul Masurel
88303d4833 Removed script directory 2018-03-19 11:53:15 +09:00
Paul Masurel
a26b0ff4a2 Removed exclude cpp from travis configuration 2018-03-19 11:51:41 +09:00
Paul Masurel
d4ed86f13a Issue/255 (#256)
* Remove cpp compression.

* Pointing to publish bitpacking

* Edited README
2018-03-19 11:48:40 +09:00
Paul Masurel
fc8902353c fieldnorm encoding. test broken 2018-03-10 18:35:16 +09:00
Paul Masurel
a2ee988304 Small change in pop_lowest. 2018-03-10 15:32:30 +09:00
Paul Masurel
97b7984200 Updated CHANGELOG 2018-03-10 14:08:11 +09:00
Paul Masurel
8683718159 Version bump 2018-03-10 14:01:30 +09:00
Paul Masurel
0cf274135b Clippy 2018-03-10 13:07:18 +09:00
Paul Masurel
a3b44773bb Bugfix and rustfmt 2018-03-10 12:21:50 +09:00
Paul Masurel
ec7c582109 NOBUG no-simd compression fix 2018-03-09 14:19:58 +09:00
Ewan Higgs
ee7ab72fb1 Support trailing commas using ',+ ,' trick from Blandy 2017. (#250) 2018-02-27 10:33:39 +09:00
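The `,+ ,` trick referenced here (from Blandy & Orendorff's Programming Rust) adds a second macro rule that matches a trailing comma and forwards to the main rule. A minimal sketch with a hypothetical `my_vec!` macro, not the actual `doc!` macro touched by this commit:

```rust
macro_rules! my_vec {
    // main rule: one or more comma-separated expressions, no trailing comma
    ( $( $x:expr ),+ ) => {
        vec![ $( $x ),+ ]
    };
    // `,+ ,` trick: accept a trailing comma by forwarding to the rule above
    ( $( $x:expr ),+ , ) => {
        my_vec![ $( $x ),+ ]
    };
}

fn main() {
    assert_eq!(my_vec![1, 2, 3], my_vec![1, 2, 3,]); // both forms accepted
}
```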
Paul Masurel
2c20759829 removed unsafecell for position computer 2018-02-24 12:07:55 +09:00
Paul Masurel
23387b0ed0 Positions writes to an external Vec 2018-02-24 11:14:45 +09:00
Dylan DPC
e82859f2e6 Update Cargo.toml (#249) 2018-02-24 09:17:33 +09:00
Paul Masurel
be830b03c5 Bugfix in intersection.advance and impl skip_next 2018-02-23 11:55:23 +09:00
Paul Masurel
1b94a3e382 Phrase query optimisation 2018-02-23 00:00:22 +09:00
Paul Masurel
c3fbc4c8fa Simplified a notch TinySet::pop_lowest() 2018-02-22 10:43:06 +09:00
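tantivy's TinySet is a small bitset packed into a u64; a sketch of what `pop_lowest()` does (illustrative, not the exact code this commit simplified):

```rust
// Assumed shape: TinySet wraps a u64 whose set bits are the members.
struct TinySet(u64);

impl TinySet {
    /// Removes and returns the lowest element, or None if the set is empty.
    fn pop_lowest(&mut self) -> Option<u32> {
        if self.0 == 0 {
            None
        } else {
            let lowest = self.0.trailing_zeros(); // index of the lowest set bit
            self.0 &= self.0 - 1;                 // clear that bit
            Some(lowest)
        }
    }
}

fn main() {
    let mut s = TinySet(0b1010);
    assert_eq!(s.pop_lowest(), Some(1));
    assert_eq!(s.pop_lowest(), Some(3));
    assert_eq!(s.pop_lowest(), None);
}
```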
Paul Masurel
4ee2db25a0 Generic on Postings rather than deletes in TermScorer 2018-02-22 08:26:45 +09:00
Paul Masurel
e423784fd0 Added specialized SegmentPostings when there are no DeleteSet 2018-02-21 23:49:20 +09:00
Paul Masurel
fdb9c3c516 Tantivy version 0.5.0 2018-02-21 11:38:26 +09:00
Paul Masurel
6fb114224a Added unit test 2018-02-21 00:13:04 +09:00
Paul Masurel
2c3e33895a Added unit tests 2018-02-21 00:03:41 +09:00
Paul Masurel
d512b53688 Added handling of parenthesis in query parser 2018-02-20 23:18:02 +09:00
Paul Masurel
c8afd2b55d Added unit tests 2018-02-20 17:05:33 +09:00
Paul Masurel
3fd6d7125b Added unit test 2018-02-20 13:12:05 +09:00
Paul Masurel
de6a3987a9 Ignoring functional test 2018-02-20 12:58:06 +09:00
Paul Masurel
3dedc465fa Merge branch 'feature/multivalued-i64-u64' 2018-02-20 12:54:18 +09:00
Paul Masurel
f16cc6367e Refactoring of fastfields 2018-02-20 12:52:30 +09:00
Paul Masurel
4026fc5fb1 Removed redundant compressed_block_size function 2018-02-20 08:28:28 +09:00
Paul Masurel
43742a93ef Multivalue u64 field / i64 field. 2018-02-20 00:16:20 +09:00
Paul Masurel
2a843d86cb Code cleaning 2018-02-19 21:51:39 +09:00
Paul Masurel
9a706c296a Larger union horizon 2018-02-19 21:50:33 +09:00
Paul Masurel
5ff8123b7a Code cleaning 2018-02-19 15:41:19 +09:00
Paul Masurel
6061158506 Added long running test to travis conf 2018-02-19 13:23:04 +09:00
Paul Masurel
4e8b0e89d9 Added unit test 2018-02-19 13:19:18 +09:00
Paul Masurel
0540ebb49e Cargo clippy 2018-02-19 12:36:24 +09:00
Paul Masurel
ef94582203 Rustfmt 2018-02-19 12:12:10 +09:00
Paul Masurel
2f242d5f52 Moving docset around 2018-02-19 12:07:05 +09:00
Paul Masurel
da3d372e6e Faster union counts 2018-02-19 10:17:16 +09:00
Paul Masurel
42fd3fe5c7 Bugfix on TermWeight::count() 2018-02-18 10:59:18 +09:00
Paul Masurel
5dae6e6bbc Downcast TermScorer for intersection when all legs are TermScorers 2018-02-18 10:28:43 +09:00
Paul Masurel
e608e0a1df Removed half baked usage of Any 2018-02-18 10:01:14 +09:00
Paul Masurel
6c8c90d348 Removed lifetime from scorer 2018-02-18 09:12:40 +09:00
Paul Masurel
eb50e92ec4 Removed specialized postings on SegmentPostings 2018-02-18 00:09:15 +09:00
Paul Masurel
20bede9462 Bugfix when requesting no termfreq. 2018-02-17 22:41:12 +09:00
Paul Masurel
4640ab4e65 Merge branch 'master' into issue/query-perf 2018-02-17 17:31:51 +09:00
Paul Masurel
cd51ed0f9f Added comments 2018-02-17 16:59:28 +09:00
Paul Masurel
6676fe5717 Added a count method 2018-02-17 15:02:51 +09:00
Paul Masurel
292bb17346 Disable scoring
- Disabling scoring is an argument of the `.weight()` method
- Collectors declare whether they need scoring
2018-02-17 12:43:16 +09:00
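A sketch of the API shape this commit describes — scoring disabled via an argument to `weight()`, and collectors declaring their needs. The signatures below are illustrative, not the exact tantivy trait definitions at this revision:

```rust
trait Weight {}

trait Query {
    /// Scoring can be switched off when building the weight, letting
    /// scorers skip loading term frequencies / fieldnorms entirely.
    fn weight(&self, scoring_enabled: bool) -> Box<dyn Weight>;
}

trait Collector {
    /// Collectors declare whether they need scores at all.
    fn requires_scoring(&self) -> bool;
}

fn main() {}
```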
Paul Masurel
0300e7272b Scoring for union. 2018-02-17 11:56:21 +09:00
Paul Masurel
8760899fa2 Stupid implementation of Box<Scorer>::collect 2018-02-16 19:30:50 +09:00
Paul Masurel
c89d570a79 rustfmt 2018-02-16 17:50:05 +09:00
Paul Masurel
1da06d867b Using the same logic when score is enabled. 2018-02-16 17:36:33 +09:00
Paul Masurel
76e8db6ed3 blop 2018-02-16 14:57:08 +09:00
Paul Masurel
31e5580bfa Renaming intersection / exclude 2018-02-16 11:55:56 +09:00
Paul Masurel
930d3db2f7 Integrated reqopt_scorer 2018-02-16 11:43:27 +09:00
Paul Masurel
1593e1dc6f Added reqopt 2018-02-16 11:22:39 +09:00
Paul Masurel
e0189fc9e6 Added exclude query 2018-02-14 18:06:51 +09:00
Paul Masurel
ffdb4ef0a7 Added unit test 2018-02-14 11:58:40 +09:00
Paul Masurel
58845344c2 Unit test + bugfix in union 2018-02-13 14:54:20 +09:00
Paul Masurel
548ec9ecca Added ok unit test 2018-02-12 17:48:41 +09:00
Paul Masurel
86b700fa93 Updated travis.yml 2018-02-12 12:13:36 +09:00
Paul Masurel
e95c49e749 Added unit test to show bug in intersection 2018-02-12 12:06:19 +09:00
Paul Masurel
f3033a8469 Added sudo required to travis conf because of https://github.com/travis-ci/travis-ci/issues/9061 2018-02-12 11:19:12 +09:00
Paul Masurel
c4125bda59 Backmerging master 2018-02-12 11:08:57 +09:00
Paul Masurel
a7ffc0e610 Rustfmt 2018-02-12 10:31:29 +09:00
Paul Masurel
9370427ae2 Terminfo blocks (#244)
* Using u64 key in the store
* Using Option<> for the next element, as opposed to u64
* Code simplification.
* Added TermInfoStoreWriter.
* Added a TermInfoStore
* Added FixedSized for BinarySerialized.
2018-02-12 10:24:58 +09:00
Paul Masurel
1fc7afa90a Issue/range query (#242)
BitSet and RangeQuery
2018-02-05 09:33:25 +09:00
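The approach named in the commit title — materializing a range query by unioning the postings of every term in the range into one bitset — in a minimal sketch; `postings_for_term` and the `Vec<bool>` bitset are hypothetical stand-ins for the real term-dictionary lookup and bitset types:

```rust
fn range_query_docs(
    terms_in_range: impl Iterator<Item = String>,
    postings_for_term: impl Fn(&str) -> Vec<u32>,
    max_doc: u32,
) -> Vec<bool> {
    let mut bitset = vec![false; max_doc as usize];
    for term in terms_in_range {
        // union the postings of every term falling in the range
        for doc in postings_for_term(&term) {
            bitset[doc as usize] = true;
        }
    }
    bitset
}

fn main() {
    let terms = vec!["2017".to_string(), "2018".to_string()].into_iter();
    let postings = |t: &str| if t == "2017" { vec![0, 2] } else { vec![1] };
    assert_eq!(range_query_docs(terms, postings, 4), vec![true, true, true, false]);
}
```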
Paul Masurel
6a104e4f69 Cargo fmt 2018-02-03 11:59:34 +09:00
Paul Masurel
920f086e1d Clippy 2018-02-03 11:46:01 +09:00
Paul Masurel
13aaca7e11 Merge branch 'master' into merge-facets 2018-02-03 11:13:02 +09:00
Paul Masurel
df53dc4ceb Format 2018-02-03 00:21:05 +09:00
Paul Masurel
dd028841e8 Added documentation / test and change the contract of .add_facet() 2018-02-03 00:17:51 +09:00
Paul Masurel
eb84b8a60d bugfix 2018-02-02 18:52:07 +09:00
Paul Masurel
c05f46ad0e skip for intersection 2018-02-02 17:22:58 +09:00
Paul Masurel
435ff9d524 Make constructor of RangeQuery public 2018-02-02 16:50:22 +09:00
Paul Masurel
fdd5dd8496 Merge branch 'master' into issue/query-perf 2018-02-02 16:39:28 +09:00
Paul Masurel
fb5476d5de Query optimization: phrase query + union 2018-02-02 16:39:17 +09:00
Paul Masurel
dd8332c327 Added disabling scoring 2018-02-02 12:11:56 +09:00
Paul Masurel
63d201150b issue/range-query Added range query 2018-02-02 00:41:12 +09:00
Paul Masurel
b78efdc59f NOBUG Use the skipping logic of segment postings in 2018-02-01 18:36:55 +09:00
Paul Masurel
5cb08f7996 Method to create bitset from DocSet directly. 2018-02-01 18:25:43 +09:00
Paul Masurel
1947a19700 Added bitse 2018-01-31 23:56:54 +09:00
Paul Masurel
271b019420 added cargo doc 2018-01-30 15:18:19 +09:00
Paul Masurel
340693184f Added comment 2018-01-30 15:15:55 +09:00
Paul Masurel
97782a9511 updated travis-cargo 2018-01-30 13:18:51 +09:00
Paul Masurel
930010aa88 Unit test passing 2018-01-28 00:03:51 +09:00
Paul Masurel
7f5b07d4e7 Fixing unit tests 2018-01-25 14:55:29 +09:00
Paul Masurel
3edb3dce6a Test not passing 2018-01-25 12:46:32 +09:00
Paul Masurel
1edaf7a312 Closes #236. Removes dependency to version. 2018-01-20 12:12:43 +09:00
Paul Masurel
137906ff29 Fixing PhraseQuery, broken due to the reordering of the intersection clauses.
Closes #234
2018-01-12 21:01:28 +09:00
Paul Masurel
143a143cde issue/232 added unit test. (#233) 2018-01-11 23:37:45 +09:00
Paul Masurel
4f5ce12a77 NOBUG removed cpp from patterns 2018-01-05 12:09:42 +09:00
Paul Masurel
813efa4ab3 NOBUG coveralls 2018-01-05 11:03:27 +09:00
Paul Masurel
c3b6c1dc0b NOBUG coveralls 2018-01-05 00:31:57 +09:00
Paul Masurel
6f5e0ef6f4 NOBUG Simplify travis 2018-01-04 20:51:00 +09:00
Paul Masurel
7224f58895 Merge branch 'issue/218'
Conflicts:
	src/directory/mmap_directory.rs
	src/lib.rs
2018-01-04 18:47:10 +09:00
Paul Masurel
49519c3f61 added comments 2018-01-04 12:53:20 +09:00
Paul Masurel
cb11b92505 Added comments 2018-01-04 12:27:14 +09:00
Paul Masurel
7b2dcfbd91 Merge branch 'issue/227' 2018-01-04 12:12:00 +09:00
Paul Masurel
d2e30e6681 Merge branch 'master' of github.com:tantivy-search/tantivy 2018-01-04 12:09:44 +09:00
Paul Masurel
ef109927b3 rustfmt 2018-01-04 12:08:34 +09:00
Paul Masurel
44e5c4dfd3 Added alphanum only token filter 2017-12-31 13:43:10 +09:00
Paul Masurel
6f223253ea Made load_metas public 2017-12-31 08:57:19 +09:00
Paul Masurel
f7b0392bd5 issue/230 Add an optional commit message. (#231)
Closes #230
2017-12-27 12:27:02 +09:00
Paul Masurel
442bc9a1b8 Fixes the computation of the memory size of a hashtable with a key of n bits. (#229)
Closes #228
2017-12-25 13:04:10 +09:00
Paul Masurel
db7d784573 Issue 227 Faster merge when there are no deletes 2017-12-21 22:04:05 +09:00
Paul Masurel
79132e803a NOBUG Switched to 64 bits addr 2017-12-21 11:06:46 +09:00
Paul Masurel
9e132b7dde NOBUG QueryParser does not need to be mut. Code cleanup 2017-12-16 15:43:35 +09:00
Paul Masurel
1e55189db1 NOBUG rustfmt 2017-12-14 19:30:31 +09:00
Paul Masurel
8b1b389a76 NOBUG Clippy 2017-12-14 19:25:12 +09:00
Paul Masurel
46f3ec87a5 Removed packed memory layout. 2017-12-14 18:37:04 +09:00
Paul Masurel
f24e5f405e NOBUG intellij misc lint 2017-12-14 18:23:35 +09:00
Paul Masurel
2589be3984 BUGFIX Serialization of schema got broken after serde's update 2017-12-14 17:37:20 +09:00
Paul Masurel
a02a9294e4 removed doc in travis 2017-11-27 13:53:58 +09:00
Paul Masurel
8023445b63 docs 2017-11-26 11:52:03 +09:00
Paul Masurel
05ce093f97 doc 2017-11-26 11:43:11 +09:00
Paul Masurel
6937e23a56 fixing doctest 2017-11-26 11:06:34 +09:00
Paul Masurel
974c321153 cargo fmt 2017-11-26 11:02:02 +09:00
Paul Masurel
f30ec9b36b Merge branch 'master' of github.com:tantivy-search/tantivy
Conflicts:
	src/analyzer/mod.rs
	src/schema/index_record_option.rs
	src/tokenizer/lower_caser.rs
	src/tokenizer/tokenizer.rs
2017-11-26 10:54:05 +09:00
Paul Masurel
acd7c1ea2d Added comments 2017-11-26 10:44:49 +09:00
Paul Masurel
aaeeda2bc5 Editing rustdoc 2017-11-25 13:23:32 +09:00
Paul Masurel
ac4d433fad Renamed analyzer to tokenizer 2017-11-24 16:50:32 +09:00
Paul Masurel
a298c084e6 Analyzer's Analyzer::token_stream does not need to be &mut self 2017-11-22 20:37:34 +09:00
Paul Masurel
185a72b341 Closes #224. Fixes documentation about STORED in the example. (#225) 2017-11-16 08:22:54 +09:00
Paul Masurel
bb41ae76f9 Closes #224. Fixes documentation about STORED in the example. 2017-11-16 08:16:17 +09:00
Paul Masurel
74d32e522a Stopped using mmap in tantivy. Caching MmapReadOnly.
Closes #218
2017-10-08 17:07:19 +09:00
Jain Jacob
927dd1ee6f Updates crate gcc to cc v1 (#217)
* Bump cc to v1

* Changes gcc::Config to cc::Build. Resolves #216
2017-10-06 16:18:44 +09:00
Paul Masurel
2c9302290f #191 Analyzer 2017-09-20 22:56:55 +09:00
Paul Masurel
426cc436da Test passing 2017-09-10 17:48:41 +09:00
Paul Masurel
68d42c9cf2 Added raw tokenizer, using the right analyzer in query parser. 2017-09-10 16:58:50 +09:00
Paul Masurel
ca49d6130f Test not passing 2017-09-09 17:32:47 +09:00
Paul Masurel
3588ca0561 Integrated with the merge branch 2017-09-09 15:27:19 +09:00
Paul Masurel
7c6cdcd876 Merge branch 'master' of github.com:tantivy-search/tantivy 2017-09-02 16:03:06 +09:00
Paul Masurel
71366b9a56 issue/197 Remove logic that prevents leak from crossbeam MsQueue. (#212)
Closes #197
2017-09-02 15:55:23 +09:00
Paul Masurel
a3247ebcfb issue/197 Remove logic that prevents leak from crossbeam MsQueue. 2017-09-02 15:53:07 +09:00
Paul Masurel
3ec13a8719 Readded fix for non-simd 2017-08-28 23:18:56 +09:00
Paul Masurel
f8593c76d5 Merge branch 'imhotep-new-codec'
Conflicts:
	src/common/bitpacker.rs
	src/compression/pack/compression_pack_nosimd.rs
	src/indexer/log_merge_policy.rs
2017-08-28 19:30:01 +09:00
Paul Masurel
f8710bd4b0 Format 2017-08-28 18:22:41 +09:00
Paul Masurel
8d05b8f7b2 Added comments. Renamed field reader 2017-08-28 17:00:12 +09:00
Paul Masurel
fc25516b7a Added unit test. 2017-08-28 11:15:37 +09:00
Paul Masurel
5b1e71947f Stream working, all test passing 2017-08-27 20:20:38 +09:00
Paul Masurel
69351fb4a5 Toward a new codec 2017-08-27 18:44:37 +09:00
Paul Masurel
3d0082d020 Delta encoded. Range and get are broken 2017-08-26 19:59:51 +09:00
Paul Masurel
8e450c770a Better error handling. Some doc. 2017-08-26 18:40:30 +09:00
Paul Masurel
a757902aed Merge branch 'feature/streamdict-simd' into imhotep 2017-08-22 18:58:57 +09:00
Paul Masurel
b3a8074826 removed println 2017-08-22 18:58:17 +09:00
Paul Masurel
4289625348 Merged with the new codec branch 2017-08-22 18:26:09 +09:00
Paul Masurel
850f10c1fe Exposing Field 2017-08-22 18:21:35 +09:00
raphael claude
d7f9bfdfc5 fix segments sorting in log_merge_policy (#211)
bug: segments were sorted on their indices (first field in the tuples)
fix: sort on the segments size
2017-08-20 08:59:54 +09:00
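A minimal reproduction of the bug and fix described in this message, with `(segment_index, num_docs)` tuples standing in for the real segment metadata:

```rust
fn main() {
    let mut segments: Vec<(usize, u32)> = vec![(0, 500), (1, 10), (2, 200)];

    // Bug: sorting the tuples directly orders by the first field — the index.
    segments.sort();

    // Fix: order by segment size instead (descending here; the ordering the
    // policy actually uses lives in log_merge_policy.rs).
    segments.sort_by_key(|&(_idx, num_docs)| std::cmp::Reverse(num_docs));
    assert_eq!(segments, vec![(0, 500), (2, 200), (1, 10)]);
}
```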
Paul Masurel
d0d5db4515 Streamdict using SIMD instruction. 2017-08-19 12:03:04 +09:00
Paul Masurel
303fc7e820 Better unit test for termdict. Checking the TermInfo 2017-08-17 12:08:39 +09:00
Paul Masurel
744edb2c5c NOBUG Avoid serializing position offset when useless. Test passing 2017-08-16 14:06:00 +09:00
Paul Masurel
2d70efb7b0 Removed trait boundary on termdict 2017-08-15 14:43:05 +09:00
Paul Masurel
eb5b2ffdcc Cleanups 2017-08-15 13:57:22 +09:00
Paul Masurel
38513014d5 Reenable unit test.
Consuming CompositeWrite on Close.
2017-08-14 23:35:09 +09:00
Paul Masurel
9cb7a0f6e6 Unit tests passing 2017-08-13 19:38:25 +09:00
Paul Masurel
8d466b8a76 half way through removing FastFieldsReader 2017-08-13 18:39:45 +09:00
Paul Masurel
413d0e1719 NOBUG test passing 2017-08-13 17:57:11 +09:00
Paul Masurel
0eb3c872fd Using composite file for all of the inverted index component 2017-08-12 19:34:23 +09:00
Paul Masurel
f9203228be Using composite file in fast field. 2017-08-12 18:45:59 +09:00
Paul Masurel
8f377b92d0 introducing a field serializer 2017-08-11 18:11:32 +09:00
Paul Masurel
1e89f86267 blop 2017-08-08 13:55:09 +09:00
Paul Masurel
d1f61a50c1 issue/207 Lazily decompressing positions. 2017-08-06 20:29:21 +09:00
Dru Sellers
2bb85ed575 Minor Doc Changes (#206)
* Various small documentation tweaks

* walking through the docs

* Update lib.rs

* Update lib.rs

* Update mod.rs
2017-08-06 09:22:03 +09:00
Paul Masurel
236fa74767 Positions almost working. 2017-08-05 23:17:35 +09:00
Paul Masurel
63b35dd87b removing freq handler. 2017-08-05 18:09:19 +09:00
Paul Masurel
efb910f4e8 Added CompressedIntStream 2017-08-05 16:44:01 +09:00
Paul Masurel
aff7e64d4e test 2017-08-04 22:07:14 +09:00
Paul Masurel
92a3f3981f issue/204 trying to fix nosimd branch. test not passing 2017-08-04 21:19:18 +09:00
king6cong
447a9361d8 Remove submodule information in README as subtree is now used 2017-08-03 13:52:16 +09:00
Paul Masurel
5f59139484 NOBUG simplified code. 2017-08-02 20:49:47 +09:00
Paul Masurel
27c373d26d NOBUG Updated changelog and bumped version 2017-07-24 18:52:45 +09:00
Paul Masurel
80ae136646 issue/198 Getting living_file after getting the list of managed files. 2017-07-24 18:46:41 +09:00
Paul Masurel
52b1398702 NOBUG version 0.4.0 -> 0.4.1 2017-07-19 19:07:54 +09:00
Paul Masurel
7b9cd09a6e Closes #199. Unindexed fields are indexed as untokenized 2017-07-19 18:41:22 +09:00
Paul Masurel
4c423ad2ca Merge branch 'master' of github.com:tantivy-search/tantivy 2017-07-19 17:01:32 +09:00
Paul Masurel
9f542d5252 NOBUG Fix spelling of "encountered". (as reported by @dazzag24) 2017-07-19 16:59:50 +09:00
Paul Masurel
77d8e81ae4 issue/17 Slightly more explicit error message 2017-07-19 11:08:42 +09:00
Paul Masurel
76e07b9705 NOBUG Small fixes. 2017-07-14 18:09:54 +09:00
Paul Masurel
ea4e9fdaf1 NOBUG updated README 2017-07-14 14:09:13 +09:00
Paul Masurel
e418bee693 NOBUG Garbage collection after end merge. 2017-07-14 12:09:47 +09:00
Paul Masurel
af4f1a86bc Merge remote-tracking branch 'origin/exp/hash_intable' 2017-07-13 20:50:54 +09:00
Paul Masurel
753b639454 NOBUG splitting the per-thread memory between the table and the heap 2017-07-13 17:11:39 +09:00
Paul Masurel
5907a47547 NOBUG Added whitespaces. 2017-07-13 15:14:12 +09:00
Paul Masurel
586a6e62a2 NOBUG Added Changelog for 4.0 2017-07-13 15:06:09 +09:00
Paul Masurel
fdae0eff5a NOBUG Remove range step_by 2017-07-13 14:05:33 +09:00
Paul Masurel
6eea407f20 Removing usage of step_by 2017-06-23 17:46:39 +09:00
Paul Masurel
1ba51d4dc4 NOBUG removed using range.step_by 2017-06-22 22:10:53 +09:00
Paul Masurel
6e742d5145 NOBUG removing batch add docs 2017-06-22 11:35:22 +09:00
Paul Masurel
1843259e91 NOBUG Simplified addr definitions 2017-06-22 11:27:32 +09:00
Paul Masurel
4ebacb7297 BytesRef is now wrapping an addr 2017-06-21 22:32:05 +09:00
Paul Masurel
fb75e60c6e issue/136 Added hashmaps. 2017-06-21 15:47:55 +09:00
Paul Masurel
04b15c6c11 Merge branch 'master' into exp/hash_intable
Conflicts:
	src/datastruct/stacker/hashmap.rs
2017-06-21 11:40:49 +09:00
Paul Masurel
b05b5f5487 issue/191 Added an analyzer manager. 2017-06-20 10:02:26 +09:00
Paul Masurel
4fe96483bc fill_buffer 2017-06-14 23:32:58 +09:00
Paul Masurel
09e27740e2 Added fill_buffer in DocSet 2017-06-14 18:28:30 +09:00
Paul Masurel
e51feea574 Removed cargo fmt from travis. 2017-06-14 13:45:11 +09:00
Paul Masurel
93e7f28cc0 Added unit test 2017-06-14 10:46:06 +09:00
Paul Masurel
8875b9794a Added API to get range from fastfield 2017-06-13 23:16:50 +09:00
Paul Masurel
f26874557e Remove the concept of pipeline. Made a BoableAnalyzer 2017-06-10 20:06:00 +09:00
Paul Masurel
a7d10b65ae Added support for Japanese. 2017-06-09 22:25:03 +09:00
Paul Masurel
e120e3b7aa issue/191 Added proper analyzer 2017-06-07 23:21:36 +09:00
Paul Masurel
90fcfb3f43 issue/188 Using murmurhash 2017-06-07 09:30:34 +09:00
Paul Masurel
e547e8abad Closes #184
Resizing the `Vec` was a bad idea, as for some stacker operation,
we may have a living reference to an object in the current heap.
2017-06-06 23:16:28 +09:00
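A minimal illustration (not the stacker code itself) of why resizing the backing `Vec` is unsound while addresses into it are still held: the resize may reallocate and move the buffer, leaving any raw pointer dangling.

```rust
fn main() {
    let mut heap: Vec<u8> = Vec::with_capacity(4);
    heap.push(42);
    let p: *const u8 = &heap[0]; // "living reference" into the heap

    heap.resize(1_000_000, 0);   // may reallocate and move the buffer...

    // ...after which `p` may dangle: dereferencing it here would be UB.
    // (Comparing the pointer values is fine; we never read through `p`.)
    let moved = p != &heap[0] as *const u8;
    println!("buffer moved: {}", moved);
}
```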
Paul Masurel
5aa4565424 Tiny cleaning 2017-06-05 23:40:08 +09:00
Paul Masurel
3637620187 Merge branch 'master' of github.com:tantivy-search/tantivy 2017-06-02 21:03:37 +09:00
Laurentiu Nicola
a94679d74d Use four terms in the intersection bench 2017-05-31 08:31:33 +09:00
Laurentiu Nicola
a35a8638cc Comment nit 2017-05-31 08:31:33 +09:00
Paul Masurel
97a051996f issue 171. Hopefully bugfix? 2017-05-31 08:31:33 +09:00
Laurentiu Nicola
69525cb3c7 Add extra intersection test 2017-05-31 08:31:33 +09:00
Laurentiu Nicola
63867a7150 Fix document generation for posting benchmarks 2017-05-31 08:31:33 +09:00
Paul Masurel
19c073385a Better intersection and added size_hint 2017-05-31 08:31:33 +09:00
Paul Masurel
0521844e56 Format, small changes in VInt 2017-05-31 08:31:20 +09:00
Paul Masurel
8d4778f94d issue/181 BinarySerializable does not return the len + Generics over Read+Write 2017-05-31 08:31:20 +09:00
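A sketch of the trait shape this commit message describes — serialization generic over any `Read`/`Write`, and no byte count returned. The signatures and the `u32` impl are illustrative, not copied from the tantivy source:

```rust
use std::io::{self, Read, Write};

trait BinarySerializable: Sized {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()>;
    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
}

impl BinarySerializable for u32 {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_all(&self.to_be_bytes())
    }
    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let mut buf = [0u8; 4];
        reader.read_exact(&mut buf)?;
        Ok(u32::from_be_bytes(buf))
    }
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    42u32.serialize(&mut buf)?;
    assert_eq!(u32::deserialize(&mut buf.as_slice())?, 42);
    Ok(())
}
```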
Paul Masurel
1d5464351d generic read 2017-05-31 08:31:20 +09:00
Paul Masurel
522ebdc674 made ResultExt public 2017-05-31 08:31:20 +09:00
Paul Masurel
4a805733db another hash 2017-05-30 15:36:48 +09:00
Paul Masurel
568d149db8 Merge branch 'master' into exp/hash_intable 2017-05-30 08:27:33 +09:00
Paul Masurel
4cfc9806c0 made ResultExt public 2017-05-30 08:22:17 +09:00
Paul Masurel
37042e3ccb Send and Sync impl now useless 2017-05-29 18:53:49 +09:00
Paul Masurel
b316cd337a Optimization in bitpacker 2017-05-29 18:53:49 +09:00
Paul Masurel
c04991e5ad Removed pointer in fastfield 2017-05-29 18:53:49 +09:00
Paul Masurel
c59b712eeb Added hash info in the table 2017-05-29 18:47:20 +09:00
Ashley Mannix
da61baed3b run fmt 2017-05-29 18:29:39 +09:00
Ashley Mannix
b6140d2962 drop some patch bounds 2017-05-29 18:29:39 +09:00
Ashley Mannix
6a9a71bb1b re-export ErrorKind 2017-05-29 18:29:39 +09:00
Ashley Mannix
e8fc4c77e2 fix delete error msg 2017-05-29 18:29:39 +09:00
Ashley Mannix
80837601ea remove error::* imports 2017-05-29 18:29:39 +09:00
Ashley Mannix
2b2703cf51 run cargo fmt 2017-05-29 18:29:39 +09:00
Ashley Mannix
d79018a7f8 fix build warnings 2017-05-29 18:29:39 +09:00
Ashley Mannix
d8a7c428f7 impl std error for directory errors 2017-05-29 18:29:39 +09:00
Ashley Mannix
45595234cc fix error match 2017-05-29 18:29:39 +09:00
Ashley Mannix
1bcebdd29e initial error-chain 2017-05-29 18:29:39 +09:00
Paul Masurel
ed0333a404 Optimized streamer 2017-05-28 19:58:28 +09:00
Paul Masurel
ac0b1a21eb Term as a wrapper
Small changes

Plastic
2017-05-25 23:49:54 +09:00
Paul Masurel
6bbc789d84 Fmt fix 2017-05-25 23:49:54 +09:00
Paul Masurel
87152daef3 issue/174 Added doc, and made field private 2017-05-25 23:49:54 +09:00
Paul Masurel
e0fce4782a Added documentation 2017-05-25 23:49:54 +09:00
Paul Masurel
a633c2a49a Avoid exposing common. Exposes u64 to i64 conversion instead. 2017-05-25 23:49:54 +09:00
Paul Masurel
51623d593e Avoid exposing schema from segment_reader 2017-05-25 23:49:54 +09:00
Paul Masurel
29bf740ddf Exposing the remaining API 2017-05-25 23:49:54 +09:00
Paul Masurel
511bd25a31 trailing whitespace 2017-05-25 18:17:37 +09:00
Paul Masurel
66e14ac1b1 clippy 2017-05-25 18:17:37 +09:00
Paul Masurel
09e94072ba Cargo fmt 2017-05-25 18:17:37 +09:00
Paul Masurel
6c68136d31 Reorganized code 2017-05-25 18:17:37 +09:00
Paul Masurel
aaf1b2c6b6 Reorganized code and added documentation. 2017-05-25 18:17:37 +09:00
Paul Masurel
8a6af2aefa Added unit test and bugfix 2017-05-25 18:17:37 +09:00
Paul Masurel
7a6e62976b Added stream dictionary code, merge unit test 2017-05-25 18:17:37 +09:00
Paul Masurel
2712930bd6 Added the feature 2017-05-25 18:17:37 +09:00
Paul Masurel
cb05f8c098 Prevent execution of the code in the macro doc 2017-05-22 10:55:45 +09:00
Paul Masurel
c0c9d04ca9 Added extra doc 2017-05-22 10:55:45 +09:00
Paul Masurel
7ea5e740e0 Using the $crate thing to make the macro usable in and outside tantivy 2017-05-22 10:55:45 +09:00
Paul Masurel
2afa6c372a issue/168 Make doc! macro usable outside tantivy 2017-05-22 10:55:45 +09:00
Paul Masurel
c7db8866b5 Merge branch 'facets' 2017-05-21 22:57:01 +09:00
Paul Masurel
02d992324a simplified facets. 2017-05-21 22:56:43 +09:00
Paul Masurel
4ab511ffc6 Merging 2017-05-21 22:15:02 +09:00
Paul Masurel
f318172ea4 Merge branch 'issue/162' 2017-05-21 20:04:03 +09:00
Paul Masurel
581449a824 issue/162 Docs and unit tests 2017-05-21 18:58:04 +09:00
Maciej Dziardziel
272589a381 faceting for fast numerical fields 2017-05-21 12:04:29 +03:00
Laurentiu Nicola
73d54c6379 Inline block_len 2017-05-21 10:44:49 +03:00
Paul Masurel
3e4606de5d Simplifying, and reordering the members 2017-05-21 16:31:52 +09:00
Laurentiu Nicola
020779f61b Make things faster 2017-05-20 20:56:37 +03:00
Laurentiu Nicola
835936585f Don't search whole blocks, but only the remaining part 2017-05-20 18:45:41 +03:00
Paul Masurel
bdd05e97d1 Added bench for segment postings 2017-05-20 23:38:53 +09:00
Paul Masurel
2be5f08cd6 issue/162 Added block iteration API 2017-05-20 11:46:40 +09:00
Paul Masurel
3f49d65a87 issue/162 Create block postings 2017-05-20 00:46:23 +09:00
Paul Masurel
f9baf4bcc8 Merge branch 'issue/155'
Conflicts:
	src/indexer/merger.rs
	src/indexer/segment_writer.rs
2017-05-19 20:14:36 +09:00
Paul Masurel
7ee93fbed5 Cleaning 2017-05-19 20:08:04 +09:00
Paul Masurel
57a5547ae8 Comments and cleaning up API 2017-05-19 11:20:27 +09:00
Paul Masurel
c57ab6a335 Renamed fstmap to termdict 2017-05-19 09:26:18 +09:00
Paul Masurel
02bfa9be52 Moving to termdict 2017-05-19 08:43:52 +09:00
Paul Masurel
b3f62b8acc Better API 2017-05-18 23:35:39 +09:00
Paul Masurel
2a08c247af Clippy 2017-05-18 23:20:41 +09:00
Paul Masurel
d2926b6ee0 Format 2017-05-18 23:09:20 +09:00
Paul Masurel
0272167c2e Code cleaning 2017-05-18 23:06:02 +09:00
Laurentiu Nicola
a9cf0bde16 Format code 2017-05-18 22:07:49 +09:00
Laurentiu Nicola
5a457df45d VInt encode values in IntFastFieldWriter
Closes #131
2017-05-18 22:07:49 +09:00
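For reference, the kind of variable-length integer encoding "VInt" refers to: 7 payload bits per byte, with a continuation bit marking all bytes but the last. A sketch — tantivy's own implementation may differ in details, such as where the stop bit lives:

```rust
fn vint_encode(mut v: u64, out: &mut Vec<u8>) {
    while v >= 0x80 {
        out.push((v as u8 & 0x7F) | 0x80); // high bit set: more bytes follow
        v >>= 7;
    }
    out.push(v as u8); // final byte, high bit clear
}

fn main() {
    let mut buf = Vec::new();
    vint_encode(300, &mut buf);
    assert_eq!(buf, vec![0xAC, 0x02]); // 300 = 0b10_0101100 -> two bytes
}
```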
Paul Masurel
ca76fd5ba0 Uncommenting unit test 2017-05-18 20:41:56 +09:00
Paul Masurel
e79a316e41 Issue 155 - Trying to avoid term lookup when merging terms
+ Adds a proper Streamer interface
2017-05-18 20:12:00 +09:00
Paul Masurel
733f54d80e Making clippy happy. 2017-05-17 19:07:39 +09:00
Paul Masurel
7b2b181652 Merge branch 'master' into issue/136
Conflicts:
	src/datastruct/stacker/hashmap.rs
	src/datastruct/stacker/heap.rs
	src/datastruct/stacker/mod.rs
	src/indexer/index_writer.rs
	src/indexer/merger.rs
	src/indexer/segment_updater.rs
	src/indexer/segment_writer.rs
	src/postings/postings_writer.rs
	src/postings/recorder.rs
	src/schema/term.rs
2017-05-17 18:40:09 +09:00
Laurentiu Nicola
b3f39f2343 Remove unneeded suppressions, make clippy lints explicit 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
a13122d392 use explicit drop instead of suppression 2017-05-17 15:50:07 +09:00
Paul Masurel
113917c521 Making clippy happy.
+ Simplifying bitpacking by adding a 7 byte padding.
+ Bugfix in a unit test.
2017-05-17 15:50:07 +09:00
Laurentiu Nicola
1352b95b07 clippy: fix never_loop warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
c0538dbe9a clippy: fix mut_from_ref warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
0d5ea98132 clippy: fix inline_always warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
0404df3fd5 Fix typo in docstring 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
a67caee141 clippy: fix len_zero warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
f5fb29422a clippy: fix while_let_loop warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
4e48bbf0ea clippy: fix needless_lifetimes warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
6fea510869 clippy: fix redundant_closure warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
39958ec476 clippy: fix single_match warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
36f51e289e clippy: fix match_same_arms warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
5c83153035 clippy: fix or_fun_call warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
8e407bb314 clippy: fix needless_borrow warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
103ba6ba35 clippy: fix match_ref_pats warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
3965b26cd2 clippy: fix useless_let_if_seq warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
1cd0b378fb clippy: fix map_clone warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
92f383fa51 clippy: fix let_unit_value warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
6ae34d2a77 clippy: fix toplevel_ref_arg warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
1af1f7e0d1 clippy: fix if_let_redundant_pattern_matching warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
feec2e2620 clippy: fix needless_bool warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
3e2ad7542d clippy: fix needless_return warnings 2017-05-17 15:50:07 +09:00
Laurentiu Nicola
ac02c76b1e clippy: fix doc_markdown warnings 2017-05-17 15:50:07 +09:00
Paul Masurel
e5c7c0b8b9 Update CHANGELOG.md 2017-05-16 21:13:33 +09:00
Laurentiu Nicola
49dbe4722f Add a test for SegmentPostings::skip_len 2017-05-16 21:12:43 +09:00
Laurentiu Nicola
f64ff77424 Use an exponential search 2017-05-16 21:12:43 +09:00
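A sketch of exponential (galloping) search, the technique named in this commit: grow the probe bound until it overshoots the target, then binary-search the final window — a good fit when skips usually land close to the current position:

```rust
// Returns the index of the first element >= target in a sorted block.
fn exponential_search(block: &[u32], target: u32) -> usize {
    let mut bound = 1;
    while bound < block.len() && block[bound] < target {
        bound *= 2; // gallop: double the window until we overshoot
    }
    let lo = bound / 2;
    let hi = (bound + 1).min(block.len());
    lo + block[lo..hi].partition_point(|&v| v < target) // binary search
}

fn main() {
    let block = [2u32, 3, 5, 8, 13, 21, 34, 55];
    assert_eq!(exponential_search(&block, 13), 4);
    assert_eq!(exponential_search(&block, 14), 5);
}
```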
Laurentiu Nicola
2bf93e9e51 Avoid rebuilding simdcomp when running tests 2017-05-16 08:37:43 +09:00
Laurentiu Nicola
3dde748b25 Make rustfmt happy 2017-05-16 00:49:05 +03:00
Laurentiu Nicola
1dabe26395 Add comment about block_len 2017-05-15 21:26:28 +03:00
Laurentiu Nicola
5590537739 Disable early exit 2017-05-15 21:18:06 +03:00
Laurentiu Nicola
ccf0f9cb2f Merge branch 'master' of github.com:tantivy-search/tantivy into issue/130 2017-05-15 18:54:16 +03:00
Laurentiu Nicola
e21913ecdc Use binary search for SegmentPostings::skip_next 2017-05-15 18:33:43 +03:00
Laurentiu Nicola
2cc826adc7 Add a bench for SegmentPostings::SkipNext 2017-05-15 18:33:43 +03:00
Laurentiu Nicola
4d90d8fc1d Move the random sampling helpers to the tests module 2017-05-15 18:33:43 +03:00
Paul Masurel
0606a8ae73 Bugfix in travis yml 2017-05-16 00:22:11 +09:00
Paul Masurel
03564214e7 Added check for rustfmt in travis 2017-05-15 22:46:43 +09:00
Paul Masurel
4c8f9742f8 format 2017-05-15 22:30:18 +09:00
Paul Masurel
a23b7a1815 Test the size of complete 0..128 block 2017-05-15 19:09:52 +09:00
Paul Masurel
6f89a86b14 Added simple search in travis CI 2017-05-15 12:10:23 +09:00
Laurentiu Nicola
b2beac1203 Check the result of wait_merging_threads 2017-05-15 08:00:25 +09:00
Paul Masurel
8cd5a2d81d Fixed logging deleted files twice 2017-05-15 00:25:49 +09:00
Paul Masurel
b26c22ada0 Merge branch 'issue/148' 2017-05-15 00:02:51 +09:00
Laurentiu Nicola
8a35259300 Avoid clone() call 2017-05-14 23:28:17 +09:00
Paul Masurel
db56167a5d Display backtrace 2017-05-14 23:28:17 +09:00
Paul Masurel
ab66ffed4e Closes #147 2017-05-14 23:28:17 +09:00
Laurentiu Nicola
e04f2f0b08 issue/148 Wait for the index writer threads to shut down in simple_search 2017-05-14 16:35:24 +03:00
Paul Masurel
7a5df33c85 issue/148 Wrapping MsQueue to drop all of its content on Drop 2017-05-14 16:25:33 +03:00
Laurentiu Nicola
ee0873dd07 Avoid clone() call 2017-05-13 16:11:58 +03:00
Paul Masurel
695c8828b8 Display backtrace 2017-05-13 18:51:38 +09:00
Paul Masurel
4ff7dc7a4f Closes #147 2017-05-13 18:46:50 +09:00
Paul Masurel
69832bfd03 NOBUG Disabling running examples in CI as it is not working. 2017-05-12 14:35:50 +09:00
Paul Masurel
ecbdd70c37 Removed the clunky linked list logic of the heap. 2017-05-12 14:01:52 +09:00
Paul Masurel
fb1b2be782 issue/136 Fix following CR 2017-05-12 13:51:09 +09:00
Paul Masurel
9cd7458978 NOBUG Hiding methods making it possible to build a incorrect Term. 2017-05-11 21:12:59 +09:00
Paul Masurel
4c4c28e2c4 Fix broke compile 2017-05-11 20:57:32 +09:00
Paul Masurel
9f9e588905 Merge branch 'master' into issue/136
Conflicts:
	src/postings/postings_writer.rs
2017-05-11 20:50:24 +09:00
Paul Masurel
6fd17e0ead Code cleaning 2017-05-11 20:47:30 +09:00
Paul Masurel
65dc5b0d83 Closes #145 2017-05-11 19:48:06 +09:00
Paul Masurel
15d15c01f8 Running examples in CI
Closes #143
2017-05-11 19:43:36 +09:00
Paul Masurel
106832a66a Make Term::with_capacity crate-public 2017-05-11 19:37:15 +09:00
Paul Masurel
477b9136b9 FIXED inconsistent Term's field serialization.
Also:

Cleaned up the code to make sure that the logic
is only in one place.
Removed allocate_vec

Closes #141
Closes #139
Closes #142
Closes #138
2017-05-11 19:37:15 +09:00
Paul Masurel
7852d097b8 CHANGELOG 0.3.1 did not included the fix of the Field(u32) 2017-05-11 09:48:37 +09:00
Ashley Mannix
0bd56241bb pretty print meta.json 2017-05-10 20:13:53 +09:00
Paul Masurel
54ab897755 Added comment 2017-05-10 19:30:24 +09:00
Paul Masurel
1369d2d144 Quadratic probing. 2017-05-10 10:38:47 +09:00
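Quadratic probing in one line: on the i-th collision, an open-addressing hash table steps by a quadratically growing offset instead of 1, which breaks up the primary clustering that linear probing suffers from. A sketch of the triangular-number variant, which visits every slot when the capacity is a power of two:

```rust
fn probe_slot(hash: usize, attempt: usize, capacity_pow2: usize) -> usize {
    // i-th probe offset is i*(i+1)/2; mask keeps the slot in range
    (hash + attempt * (attempt + 1) / 2) & (capacity_pow2 - 1)
}

fn main() {
    let (hash, cap) = (7usize, 16usize);
    let probes: Vec<usize> = (0..5).map(|i| probe_slot(hash, i, cap)).collect();
    assert_eq!(probes, vec![7, 8, 10, 13, 1]); // 7, +1, +2, +3, +4 (mod 16)
}
```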
Paul Masurel
d3f829dc8a Bugfix 2017-05-10 00:29:37 +09:00
Paul Masurel
e82ccf9627 Merge branch 'master' into issue/indexing-refactoring 2017-05-09 16:43:33 +09:00
Paul Masurel
d3d29f7f54 NOBUG Updated CHANGELOG with the serde change for 0.4.0 2017-05-09 16:42:25 +09:00
Paul Masurel
3566717979 Merge pull request #134 from tantivy-search/chore/serde-rebase
Replace rustc_serialize with serde (updated)
2017-05-09 16:38:42 +09:00
Paul Masurel
90bc3e3773 Added limitation on term dictionary saturation 2017-05-09 14:10:33 +09:00
Paul Masurel
ffb62b6835 working 2017-05-09 10:17:05 +09:00
Ashley Mannix
4f9ce91d6a update underflow test 2017-05-08 14:40:58 +10:00
Laurentiu Nicola
3c3a2fbfe8 Remove old serialization code 2017-05-08 07:36:15 +03:00
Laurentiu Nicola
0508571d1a Use the proper error type on u64 overflow 2017-05-08 07:35:33 +03:00
Laurentiu Nicola
7b733dd34f Fix i64 overflow check and merge NotJSON with NotJSONObject 2017-05-08 07:09:54 +03:00
Ashley Mannix
2c798e3147 Replace rustc_serialize with serde 2017-05-07 20:21:22 +03:00
Paul Masurel
2c13f210bc Bugfix on merging i64 fast fields 2017-05-07 15:57:29 +09:00
Paul Masurel
0dad02791c issues/65 Added comments
Closes #65
Closes #132
2017-05-06 23:09:45 +09:00
Paul Masurel
2947364ae1 issues/65 Phrase query for untokenized fields are not tokenized. 2017-05-06 22:14:26 +09:00
Paul Masurel
05111599b3 Removed several TODOs 2017-05-05 16:08:09 +08:00
Paul Masurel
83263eabbb issues/65 Updated changelog added some doc. 2017-05-04 17:13:14 +08:00
Paul Masurel
5cb5c9a8f2 issues/65 Added i64 fast fields 2017-05-04 16:46:14 +08:00
Paul Masurel
9ab92b7739 i64 fast field working 2017-05-04 16:46:14 +08:00
Paul Masurel
962bddfbbf Merge with panics. 2017-05-04 16:46:14 +08:00
Paul Masurel
26cfe2909f FastField with different types 2017-05-04 16:46:13 +08:00
Paul Masurel
afdfb1a69b Compiling... fastfield not implemented yet 2017-05-04 16:46:13 +08:00
Paul Masurel
b26ad1d57a Added int options 2017-05-04 16:46:13 +08:00
Paul Masurel
1dbd54edbb Renamed u64options 2017-05-04 16:46:13 +08:00
Paul Masurel
deb04eb090 issue/65 Switching to u64. 2017-05-04 16:46:13 +08:00
Paul Masurel
bed34bf502 Merge branch 'issues/122' 2017-04-23 16:14:40 +08:00
Paul Masurel
80f1e26c3b Tantivy 0.3.1 2017-04-23 15:52:07 +08:00
Paul Masurel
3e68b61d8f issue/122 Adds a garbage collect method 2017-04-23 15:51:06 +08:00
Paul Masurel
95bfb71901 NOBUG Remove 256 num fields limit 2017-04-19 22:37:34 +09:00
Paul Masurel
74e10843a7 issue/120 Disabled SIMD vbyte compression for msvc 2017-04-17 22:36:32 +09:00
Paul Masurel
1b922e6d23 issue 120. Using streamvbyte codec for the vbyte part of the encoding 2017-04-16 18:49:53 +09:00
Paul Masurel
a7c6c31538 Merge commit '9d071c8d4610aa61f4b1f7dd489210415a05cfc0' as 'cpp/streamvbyte' 2017-04-16 15:22:43 +09:00
Paul Masurel
9d071c8d46 Squashed 'cpp/streamvbyte/' content from commit f38aa6b
git-subtree-dir: cpp/streamvbyte
git-subtree-split: f38aa6b6ec4c5cee9d72c94ef305e6a79a108252
2017-04-16 15:22:43 +09:00
Paul Masurel
04074f7bcb Merge pull request #119 from tantivy-search/issue/118
Using u32 for field ids
2017-04-15 13:11:22 +09:00
Paul Masurel
8a28d1643d Using u32 for field ids 2017-04-15 13:04:33 +09:00
Paul Masurel
44c684af5c NOBUG Fixes winapi version 2017-04-08 19:01:31 +09:00
Paul Masurel
60279a03b6 RELEASE Tantivy 0.3. See Changelog 2017-04-08 18:53:40 +09:00
Paul Masurel
dc43135fe0 NOBUG Remove .info 2017-04-08 18:49:37 +09:00
Paul Masurel
ce022e5f06 issue/54 Clone segment reader rather than reload.
Closes #54.
2017-04-08 17:52:33 +09:00
Paul Masurel
0be977d9eb Merge pull request #114 from tantivy-search/issue/96
Closes Issue/96
2017-04-08 17:49:48 +09:00
Paul Masurel
a4ba20eea3 issue/96 code clean up, adding comments. 2017-04-08 17:30:25 +09:00
Paul Masurel
4bef6c99ee issue/96 Cleaning up some lock management 2017-04-05 10:12:39 +09:00
Paul Masurel
a84871468b issue/96 Rename FileError -> OpenReadError 2017-04-05 10:01:49 +09:00
Paul Masurel
e0a39fb273 issue/96 Added unit test, documentation and various tiny improvements. 2017-04-04 22:43:35 +09:00
Paul Masurel
35203378ef Considering merge options after calling end_merge 2017-04-03 17:26:21 +09:00
Paul Masurel
b5bf9bb13c issue/96 Looping over wait_merging_thread. 2017-04-03 08:39:18 +09:00
Paul Masurel
ea3349644c issue/96 Fixed unit test condition to something reasonable 2017-04-02 21:58:38 +09:00
Paul Masurel
d4f2e475ff issue/96 removed faulty assert 2017-04-02 19:21:20 +09:00
Paul Masurel
17631ed866 issue/96 Added functionality to protect files from deletion
Hopefully fixed the race condition happening when merging files.
2017-04-02 18:48:20 +09:00
Paul Masurel
9eb2d3e8c5 issue/96 avoid removing the bitset from segment_entry. 2017-04-02 16:26:28 +09:00
Paul Masurel
afd08a7bbc issue/96 Changed datastruct for the delete queue. 2017-04-01 21:01:10 +09:00
Paul Masurel
4fc7bc5f09 Added helper to create Vec with a given size 2017-03-31 18:54:23 +09:00
Paul Masurel
602b9d235f Merge pull request #113 from kaedroho/patch-1
Mark "cpp" folder as linguist-vendored in .gitattributes
2017-03-31 09:05:57 +09:00
Karl Hobley
b22c6b86c7 Mark "cpp" folder as linguist-vendored in .gitattributes
This repo is currently being detected as a C project because of some vendored libraries in the "cpp" folder.

According to https://github.com/github/linguist#using-gitattributes you can use ``.gitattributes`` tell GitHub to not count this folder when detecting the language.
2017-03-30 13:43:03 +01:00
Paul Masurel
f0dc0de4b7 Added helper to create Vec with a given size 2017-03-29 11:26:24 +09:00
Paul Masurel
456dd3a60d issue/96 merge 2017-03-28 16:49:48 +09:00
Paul Masurel
d768a10077 master merged in feature branch 2017-03-27 09:27:23 +09:00
Paul Masurel
ddb2b8d807 test passing.
SegmentWriter create SegmentEntry which contain a delete_bitset
2017-03-26 18:32:53 +09:00
Paul Masurel
45806951b1 added quotation mark 2017-03-25 22:48:07 +09:00
Paul Masurel
84a060552d issue/109 trying to get proper logging in appveyor 2017-03-25 22:34:40 +09:00
Paul Masurel
68a956c6e7 issue/109 Showing debug! if test fails 2017-03-25 21:54:17 +09:00
Paul Masurel
f50f557cfc issue/109 Remove futures from most of segment_updater API. 2017-03-25 19:36:03 +09:00
Paul Masurel
daa19b770a (hopefully) bugfix race condition on wait merging thread. 2017-03-24 18:20:58 +09:00
Paul Masurel
e75402be80 Merge pull request #108 from KodrAus/ci/appveyor
Add appveyor config
2017-03-24 15:49:50 +09:00
Ashley Mannix
51cab39186 drop to vs2015 image 2017-03-24 16:37:30 +10:00
Ashley Mannix
c8e12b6847 try set mingw path 2017-03-24 16:22:32 +10:00
Ashley Mannix
b44a9cb89d add appveyor config 2017-03-24 16:11:51 +10:00
Paul Masurel
e650fab927 Merge pull request #106 from tantivy-search/wip/delay-test-deletes
Fix delete tests on Windows
2017-03-22 09:26:36 +09:00
Paul Masurel
b12a97abe4 Add unit test for when deleting fails
Test that when delete fails, we still keep
the file as managed.

Remove the error log for windows, as failing
to delete is expected.
2017-03-22 08:57:09 +09:00
Laurentiu Nicola
2b5a4bbde2 Don't delete twice on not(windows) 2017-03-21 07:48:58 +02:00
Laurentiu Nicola
2d169c4454 Delay deleting the files in the test suite to make it work on Windows 2017-03-21 07:37:28 +02:00
Paul Masurel
66d6e4e246 Merge pull request #103 from tantivy-search/lnicola-fix-sync-directory
Make directory syncing work on Windows (resubmit)
2017-03-21 10:55:03 +09:00
Paul Masurel
a061ba091d Merge pull request #105 from tantivy-search/wip/simdcomp-build
Avoid using make for building simdcomp
2017-03-21 10:00:49 +09:00
Laurentiu Nicola
92ce9b906b Avoid using make for building simdcomp 2017-03-21 00:25:04 +02:00
Laurentiu Nicola
1e0ac31e11 Clarify comment and use qualified import for the flag 2017-03-20 23:12:48 +02:00
Paul Masurel
ebcea0128c Getting the FLAG from the winapi module. 2017-03-19 11:09:15 +09:00
Paul Masurel
30075176cb blop 2017-03-19 10:52:54 +09:00
Laurentiu Nicola
7c114b602d Make directory syncing work on Windows 2017-03-19 02:17:13 +02:00
Paul Masurel
50659147d1 NOBUG updated simple_search.html 2017-03-14 12:04:21 +09:00
Paul Masurel
da10fe3b4d Various fixes. 2017-03-13 22:01:55 +09:00
Paul Masurel
4db56c6bd8 Merge pull request #101 from tantivy-search/issue/99
Improvements to simple_search.rs: fixes #100 and improves #99
2017-03-13 13:26:39 +09:00
Claus Matzinger
292dd6dcb6 fixup 2017-03-13 00:24:54 -04:00
Claus Matzinger
37e71f7c63 fixes #100 and improves #99 2017-03-12 22:59:38 -04:00
Paul Masurel
5932278e00 test passing 2017-03-13 10:00:19 +09:00
Paul Masurel
202dda98ba baby step 3 2017-03-12 19:00:57 +09:00
Paul Masurel
7c971b5d3b baby step 2 2017-03-11 16:14:20 +09:00
Paul Masurel
77c61ddab2 Baby step1 2017-03-11 14:20:46 +09:00
Paul Masurel
b7f026bab9 Merger returns a SegmentMeta 2017-03-10 09:05:51 +09:00
Paul Masurel
cc2f78184b Added unit test for #96 2017-03-10 09:05:51 +09:00
Paul Masurel
673423f762 Merge pull request #98 from KodrAus/feat/no-cpp
Convert simd wrapper to C
2017-03-09 13:11:08 +09:00
Paul Masurel
7532c4a440 Removed double ; 2017-03-09 10:57:30 +09:00
Ashley Mannix
324b56a60c fix warnings 2017-03-09 06:54:48 +10:00
Paul Masurel
ac3890f93c NOBUG Marked the functional test as ignore 2017-03-08 19:08:29 +09:00
Ashley Mannix
69b3de43f6 convert simd wrapper to c 2017-03-08 14:02:48 +10:00
Paul Masurel
3d1196d53e NOBUG added doc link. 2017-03-07 10:14:00 +09:00
Paul Masurel
a397537ed8 NOBUG added rustdoc 2017-03-07 10:10:43 +09:00
Paul Masurel
ebca904767 NOBUG added rustdoc 2017-03-07 09:58:51 +09:00
Paul Masurel
3a472914ce Fix .write -> .write_all 2017-03-06 16:28:30 +09:00
Paul Masurel
c59507444f issue/77 ManagedDirectory working
Closes #77
2017-03-06 12:18:36 +09:00
Paul Masurel
4b7afa2ae7 issue/77 Added managed directory 2017-03-03 22:41:37 +09:00
Paul Masurel
590a8582c9 The reference doc should not point to the schema page. 2017-02-28 21:17:19 +09:00
Paul Masurel
ab3440f925 NOBUG Bypass github cache for coveralls badge 2017-02-27 12:39:59 +09:00
Paul Masurel
ec5fb2eaa9 NOBUG cleanup 2017-02-27 09:52:28 +09:00
Paul Masurel
15b60d72cc NOBUG add_document does not return result 2017-02-27 09:36:41 +09:00
Paul Masurel
7a07144c68 Bugfix related with deletes, rollback and the index opstamp. 2017-02-27 01:42:25 +09:00
Paul Masurel
8bcfdb8e80 NOBUG misc ... 2017-02-26 21:35:18 +09:00
Paul Masurel
a7f10f055d NOBUG hiding doc, filling doc 2017-02-26 00:11:32 +09:00
Paul Masurel
597dac9cb6 NOBUG Adding doc. 2017-02-25 23:39:02 +09:00
Paul Masurel
6a002bcc76 NOBUG 2017-02-25 21:20:55 +09:00
Paul Masurel
3a86fc00a2 Closes #64 - Improve Index creation API / documentation 2017-02-25 20:40:39 +09:00
Paul Masurel
ca1617d3cd Fixes #91 2017-02-25 20:32:26 +09:00
Paul Masurel
e4a102d859 Merge branch 'issue/43'
Conflicts:
	src/directory/mmap_directory.rs
2017-02-25 19:36:21 +09:00
Paul Masurel
1d9924ee90 Closes #43. 2017-02-25 19:32:36 +09:00
Paul Masurel
f326a2dafe TODO hunt 2017-02-25 15:28:56 +09:00
Paul Masurel
78228ece73 Closes #92. ByteOrder of u32 terms. 2017-02-24 23:41:46 +09:00
Paul Masurel
503d0295cb issue/43 TODO hunt 2017-02-23 09:54:54 +09:00
Paul Masurel
eb39db44fc issue/43 Avoid keeping segments with 0 documents. 2017-02-23 09:20:30 +09:00
Paul Masurel
7f78d1f4ca Fixes #82 Renamed and commented the function to create Term from &[u8] 2017-02-23 08:33:59 +09:00
Paul Masurel
df9090cb0b NOBUG TODO hunt, and cleanups 2017-02-22 22:18:33 +09:00
Paul Masurel
4a8eb3cb05 issue/43 Added unit test for deletes including merging. 2017-02-22 21:38:37 +09:00
Paul Masurel
a74b41d7ed NOBUG run benchmark over exactly 100 K elements 2017-02-21 11:43:55 +09:00
Paul Masurel
06017bd422 NOBUG made the cleanup limit adaptive in MmapCache 2017-02-21 00:37:45 +09:00
Paul Masurel
17beaab8bf Merge branch 'issue/72' 2017-02-21 00:25:24 +09:00
Paul Masurel
062e38a2ab Fixes #72 - Cache directory uses weak ref. Introduced CacheInfo object. 2017-02-21 00:24:33 +09:00
Paul Masurel
8c2b20c496 NOBUG Trying to fix coverall conf. 2017-02-20 17:47:16 +09:00
Paul Masurel
c677eb9f13 issue/43 Removed notify 2017-02-19 22:41:45 +09:00
Paul Masurel
0f332d1fd3 issue/43 Removed doc freq from recorders. 2017-02-19 22:39:31 +09:00
Paul Masurel
1b45539f32 issue/43 Added support for delete in merged index 2017-02-19 22:39:31 +09:00
Paul Masurel
7315000fd4 issue/43 Merging ok for postings / fastfields. 2017-02-19 22:39:31 +09:00
Paul Masurel
e3d2fca844 issue/43 Isolated segment_entry / doc_opstamp_mapping 2017-02-19 22:39:31 +09:00
Paul Masurel
1c03d98a11 issue/43 added delete_queue right in the segment updater 2017-02-19 22:39:31 +09:00
Paul Masurel
8b68f22be1 issue/43 made the delete queue shareable 2017-02-19 22:39:31 +09:00
Paul Masurel
d007cf3435 issue/43 simplification. removed the notion of delete cursor. 2017-02-19 22:39:04 +09:00
Paul Masurel
72afbb28c7 issue/43 test passing 2017-02-19 22:39:04 +09:00
Paul Masurel
2fc3a505bc issue/43 refactoring segment meta 2017-02-19 22:39:04 +09:00
Paul Masurel
e337c35721 issue/43 SegmentMeta refactoring 2017-02-19 22:39:04 +09:00
Paul Masurel
0c318339b0 issue/43 Path logic in segment. 2017-02-19 22:39:04 +09:00
Paul Masurel
64fee11bc0 issue/43 Clean up 2017-02-19 22:39:04 +09:00
Paul Masurel
e12fc4bb09 issue/43 deletes
merge not working
only updating uncommitted
2017-02-19 22:39:04 +09:00
Paul Masurel
0820992141 issue/43 docstamp -> opstamp 2017-02-19 22:38:39 +09:00
Paul Masurel
09782858da issue/43 Segment have a commit opstamp 2017-02-19 22:38:39 +09:00
Paul Masurel
ca977fb17b issue/43 Refactoring of SegmentUpdater 2017-02-19 22:38:39 +09:00
Paul Masurel
e8ecb68f00 issue/43 switching for futures 2017-02-19 22:38:39 +09:00
Paul Masurel
0ec492dcf2 issue/43 refactoring in order to remove the segment updater non sense for simpler futures 2017-02-19 22:38:39 +09:00
Paul Masurel
20eb586660 issue/43 Rename SegmentUpdater 2017-02-19 22:38:39 +09:00
Paul Masurel
6530d43d6a issue/43 Small fixes. 2017-02-19 22:38:39 +09:00
Paul Masurel
926e71a573 issue/43 unit test running. segment updater uses futures. 2017-02-19 22:38:38 +09:00
Paul Masurel
bacaabf857 issue/43 fixed on unit test. need big refactoring of segment updater 2017-02-19 22:38:38 +09:00
Paul Masurel
d6e7157173 issue/43 Test broken... moved segment manager to the segment updater / segment writer 2017-02-19 22:38:15 +09:00
Paul Masurel
093dcbd253 issue/43 Isolated SegmentMeta 2017-02-19 22:38:15 +09:00
Paul Masurel
fba44b78b6 issue/43 Added delete doc file 2017-02-19 22:38:15 +09:00
Paul Masurel
01cf303dec issue/43 segment writer 2017-02-19 22:38:14 +09:00
Paul Masurel
d5c161e196 issue/43 Computing deleted doc bitset 2017-02-19 22:38:14 +09:00
Paul Masurel
183d5221b5 issue/43 DeleteQueue. 2017-02-19 22:38:14 +09:00
Paul Masurel
5a06f45403 issue/43 small progress 2017-02-19 22:36:57 +09:00
Paul Masurel
395cbf3913 issue/43 Change the delete queue datastruct for something cleaner/functional 2017-02-19 22:36:57 +09:00
Paul Masurel
fe2ddb8844 issue43 Added DeleteQueue. 2017-02-19 22:36:57 +09:00
Paul Masurel
3129701e92 issue/71 Added list of supported OSes 2017-02-19 14:14:15 +09:00
Paul Masurel
56ba698def Merge pull request #76 from Ameobea/master
Updated dependency versions and implementations
2017-02-17 18:20:44 +09:00
Casey Primozic
e0ba699c16 Updated dependency versions and implementations
- Updated `byteorder` error usage (now returns straight `Error`s)
 - Updated `Uuid` implementation (`to_simple_string` now `.simple().to_string()`)
2017-02-17 01:26:13 -06:00
Paul Masurel
b6423f9a76 Merge pull request #73 from manuel-woelker/pr-subtree
Use git subtree mechanism for simdcomp to simplify build (cf. #24)
2017-01-27 14:52:36 +09:00
Manuel Woelker
a667394a49 update README and build after simdcomp subtree refactor 2017-01-26 21:14:05 +01:00
Manuel Woelker
9f02b090dd Merge commit 'f07ccd6e4fbc5bbfeb94d40e0f14bc527a7d5439' as 'cpp/simdcomp' 2017-01-26 20:28:23 +01:00
Manuel Woelker
f07ccd6e4f Squashed 'cpp/simdcomp/' content from commit 0dca286
git-subtree-dir: cpp/simdcomp
git-subtree-split: 0dca28668f1fb6d343dc3c62fa7750a00f1d7201
2017-01-26 20:28:23 +01:00
Manuel Woelker
f19f8757de remove git submodule to replace via git subtree 2017-01-26 20:17:44 +01:00
Paul Masurel
f729edb529 NOBUG added badges / categories for crates.io 2017-01-21 09:35:44 +09:00
Paul Masurel
73ef201c44 Merge branch 'master' of github.com:tantivy-search/tantivy 2017-01-11 21:09:05 +09:00
Paul Masurel
3b69e790e9 NOBUG expose a version public api. Handy to check if the compilation was made with simd or not. 2017-01-11 21:06:41 +09:00
Paul Masurel
1b0b3051c2 NOBUG Pinned some version, removed import warning. 2017-01-09 15:30:50 +09:00
Paul Masurel
43c1da1a92 Merge branch 'issue/67' 2016-12-20 16:52:33 +01:00
Paul Masurel
e1cb5e299d NOBUG split field_type into 2 2016-12-20 16:51:34 +01:00
Paul Masurel
14ebed392b Merge pull request #68 from tantivy-search/issue/67
Issue/67
2016-12-20 11:27:05 +01:00
Paul Masurel
d3d34be167 issue/67 Added a advance interface to the term iterator 2016-12-20 11:25:52 +01:00
Paul Masurel
98cdc83428 Issue #67 Removing afterwards. 2016-12-18 11:57:28 +01:00
Paul Masurel
4d7d201f21 Issue #67 - Removed segment ord array from term iteration.
This was probably an early optimization.
2016-12-17 09:44:51 +01:00
Paul Masurel
ca5f3e1d46 issue/67 First stab. Iterator working. 2016-12-17 00:58:12 +01:00
Paul Masurel
1559733b03 Merge pull request #63 from vandenoever/readme
fix for build instructions
2016-12-12 10:17:27 +09:00
Paul Masurel
44b5f1868c Merge branch 'master' into readme 2016-12-12 10:17:19 +09:00
Paul Masurel
4cedfd903d NOBUG Added ga beacon to README 2016-12-12 10:07:30 +09:00
Paul Masurel
c0049e8487 NOBUG fixed doc urls. 2016-12-11 21:43:14 +09:00
Paul Masurel
e88adbff5c Bumped tantivy's version in Cargo.toml 2016-12-11 17:51:45 +09:00
Jos van den Oever
e497e04f70 fix for build instructions
And clarification that nighty is required.
2016-12-10 18:08:15 +01:00
194 changed files with 21548 additions and 7477 deletions

.gitattributes (new file)

@@ -0,0 +1 @@
cpp/* linguist-vendored

.gitignore

@@ -1,3 +1,4 @@
*.swp
target
target/debug
.vscode
@@ -5,3 +6,7 @@ target/release
Cargo.lock
benchmark
.DS_Store
cpp/simdcomp/bitpackingbenchmark
*.bk
.idea
trace.dat

.gitmodules (deleted)

@@ -1,3 +0,0 @@
[submodule "cpp/simdcomp"]
path = cpp/simdcomp
url = git@github.com:lemire/simdcomp.git

.travis.yml

@@ -1,11 +1,8 @@
language: rust
sudo: required
cache: cargo
rust:
- nightly
git:
submodules: false
before_install:
- sed -i 's/git@github.com:/https:\/\/github.com\//' .gitmodules
- git submodule update --init --recursive
env:
global:
- CC=gcc-4.8
@@ -16,6 +13,7 @@ addons:
apt:
sources:
- ubuntu-toolchain-r-test
- kalakris-cmake
packages:
- gcc-4.8
- g++-4.8
@@ -23,18 +21,17 @@ addons:
- libelf-dev
- libdw-dev
- binutils-dev
- cmake
before_script:
- |
pip install 'travis-cargo<0.2' --user &&
export PATH=$HOME/.local/bin:$PATH
- export PATH=$HOME/.cargo/bin:$PATH
- cargo install cargo-update || echo "cargo-update already installed"
- cargo install cargo-travis || echo "cargo-travis already installed"
script:
- |
travis-cargo build &&
travis-cargo test &&
travis-cargo bench &&
travis-cargo doc
- cargo build
- cargo test
- cargo test -- --ignored
- cargo run --example simple_search
- cargo doc
after_success:
- bash ./script/build-doc.sh
- travis-cargo doc-upload
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then travis-cargo coveralls --no-sudo --verify; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./kcov/build/src/kcov --verify --coveralls-id=$TRAVIS_JOB_ID --exclude-pattern=/.cargo target/kcov target/debug/tantivy-*; fi
- cargo coveralls --exclude-pattern src/functional_test.rs
- cargo doc-upload

CHANGELOG.md (new file)

@@ -0,0 +1,124 @@
Tantivy 0.5.2
==========================
- Removed C code. Tantivy is now pure Rust.
- BM25
- Approximate field norms encoded over 1 byte.
Tantivy 0.5.1
==========================
- bugfix #254 : tantivy failed if no documents in a segment contained a specific field.
Tantivy 0.5
==========================
- Faceting
- RangeQuery
- Configurable tokenization pipeline
- Bugfix in PhraseQuery
- Various query optimisation
- Allowing very large indexes
- 64 bits file address
- Smarter encoding of the `TermInfo` objects
Tantivy 0.4.3
==========================
- Bugfix race condition when deleting files. (#198)
Tantivy 0.4.2
==========================
- Prevent usage of AVX2 instructions (#201)
Tantivy 0.4.1
==========================
- Bugfix for non-indexed fields. (#199)
Tantivy 0.4.0
==========================
- Raise the limit of number of fields (previously 256 fields) (@fulmicoton)
- Removed u32 fields. They are replaced by u64 and i64 fields (#65) (@fulmicoton)
- Optimized skip in SegmentPostings (#130) (@lnicola)
- Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
- Using error-chain (@KodrAus)
- QueryParser: (@fulmicoton)
- Explicit error returned when searched for a term that is not indexed
- Searching for a int term via the query parser was broken `(age:1)`
- Searching for a non-indexed field returns an explicit Error
- Phrase query for non-tokenized field are not tokenized by the query parser.
- Faster/Better indexing (@fulmicoton)
- using murmurhash2
- faster merging
- more memory efficient fast field writer (@lnicola )
- better handling of collisions
- lesser memory usage
- Added API, most notably to iterate over ranges of terms (@fulmicoton)
- Bugfix that was preventing to unmap segment files, on index drop (@fulmicoton)
- Made the doc! macro public (@fulmicoton)
- Added an alternative implementation of the streaming dictionary (@fulmicoton)
Tantivy 0.3.1
==========================
- Expose a method to trigger files garbage collection
Tantivy 0.3
==========================
Special thanks to @Kodraus @lnicola @Ameobea @manuel-woelker @celaus
for their contribution to this release.
Thanks also to everyone in tantivy gitter chat
for their advice and company :)
https://gitter.im/tantivy-search/tantivy
Warning:
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
code and index format.
You should not expect backward compatibility before
tantivy 1.0.
New Features
------------
- Delete. You can now delete documents from an index.
- Support for windows (Thanks to @lnicola)
Various Bugfixes & small improvements
----------------------------------------
- Added CI for Windows (https://ci.appveyor.com/project/fulmicoton/tantivy)
Thanks to @KodrAus ! (#108)
- Various dependency version updates (Thanks to @Ameobea) #76
- Fixed several race conditions in `Index.wait_merge_threads`
- Fixed #72. Mmap were never released.
- Fixed #80. Fast field used to take an amplitude of 32 bits after a merge. (Ouch!)
- Fixed #92. u32 values are now encoded using big endian in the fst
in order to make their enumeration consistent with
the natural ordering (a short sketch follows this changelog).
- Building binary targets for tantivy-cli (Thanks to @KodrAus)
- Misc invisible bug fixes, and code cleanup.
- Use
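The #92 changelog entry above relies on a property worth spelling out: an fst, like any byte-oriented term dictionary, orders keys lexicographically by their bytes, and only big-endian encoding makes that byte order agree with numeric order. A minimal check:

```rust
// Big-endian bytes sort in numeric order; little-endian bytes do not.
// (Illustration of the #92 changelog entry above.)
fn main() {
    let (a, b) = (1u32, 256u32);
    assert!(a.to_be_bytes() < b.to_be_bytes()); // [0,0,0,1] < [0,0,1,0]
    assert!(a.to_le_bytes() > b.to_le_bytes()); // [1,0,0,0] > [0,1,0,0]
}
```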

Cargo.toml

@@ -1,46 +1,53 @@
[package]
name = "tantivy"
version = "0.1.1"
version = "0.5.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Tantivy is a search engine library."""
documentation = "http://fulmicoton.com/tantivy/tantivy/index.html"
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/tantivy-search/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
[dependencies]
byteorder = "0.4"
memmap = "0.2"
lazy_static = "0.1"
regex = "0.1"
fst = "0.1"
atomicwrites = "0.0.14"
tempfile = "2.0"
rustc-serialize = "0.3"
log = "0.3"
combine = "2.0.*"
byteorder = "1.0"
lazy_static = "0.2.1"
tinysegmenter = "0.1.0"
regex = "0.2"
fst = {version="0.2", default-features=false}
atomicwrites = {version="0.1", optional=true}
log = "0.3.6"
combine = "2.2"
tempdir = "0.3"
bincode = "0.4"
libc = {version = "0.2.6", optional=true}
num_cpus = "0.2"
itertools = "0.4"
lz4 = "1.13"
time = "0.1"
uuid = "0.1"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.5.9"
bit-set = "0.4.0"
uuid = { version = "0.6", features = ["v4", "serde"] }
chan = "0.1"
crossbeam = "0.2"
crossbeam = "0.3"
futures = "0.1"
futures-cpupool = "0.1"
error-chain = "0.8"
owning_ref = "0.3"
stable_deref_trait = "1.0.0"
rust-stemmers = "0.1.0"
downcast = { version="0.9", features = ["nightly"]}
matches = "0.1"
snap = "0.2"
bitpacking = {path = "../bitpacking"}
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
[dev-dependencies]
rand = "0.3"
[build-dependencies]
gcc = {version = "0.3", optional=true}
tempfile = "2.1"
env_logger = "0.4"
[profile.release]
opt-level = 3
@@ -50,5 +57,23 @@ debug-assertions = false
[features]
default = ["simdcompression"]
simdcompression = ["libc", "gcc"]
default = ["mmap"]
streamdict = []
mmap = ["fst/mmap", "atomicwrites"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }
[[example]]
name = "simple_search"
required-features = ["mmap"]
[[bin]]
name = "convert_to_static"
path = "./bin/convert_to_static.rs"
[[bin]]
name = "test_static_dir"
path = "./bin/test_static_dir.rs"

README.md

@@ -1,10 +1,10 @@
![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)
[![Build Status](https://travis-ci.org/tantivy-search/tantivy.svg?branch=master)](https://travis-ci.org/tantivy-search/tantivy)
[![Coverage Status](https://coveralls.io/repos/github/tantivy-search/tantivy/badge.svg?branch=master)](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Coverage Status](https://coveralls.io/repos/github/tantivy-search/tantivy/badge.svg?branch=master&refresh1)](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy)
**Tantivy** is a **full-text search engine library** written in Rust.
@@ -12,44 +12,74 @@ It is strongly inspired by Lucene's design.
# Features
- configurable indexing (optional term frequency and position indexing)
- Tiny startup time (<10ms), perfect for command line tools
- tf-idf scoring
- Basic query language
- Phrase queries
- Incremental indexing
- Multithreaded indexing (indexing English Wikipedia takes 4 minutes on my desktop)
- mmap based
- SIMD integer compression
- u32 fast fields (equivalent of doc values in Lucene)
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- Mmap directory
- optional SIMD integer compression
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene); see the sketch below
- LZ4 compressed document store
- Range queries
- Faceting
- configurable indexing (optional term frequency and position indexing)
- Cheesy logo with a horse
Tantivy supports Linux, MacOS and Windows.
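For instance, a fast field is declared when building the schema. The following minimal sketch assumes the `FAST` schema option and invents its field names:

    extern crate tantivy;

    use tantivy::schema::{SchemaBuilder, FAST, TEXT};

    fn main() {
        let mut schema_builder = SchemaBuilder::default();
        schema_builder.add_text_field("title", TEXT);
        // FAST stores the values in a column-oriented fast field,
        // the rough equivalent of doc values in Lucene.
        schema_builder.add_u64_field("popularity", FAST);
        let _schema = schema_builder.build();
    }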
# Getting started
- [tantivy's usage example](http://fulmicoton.com/tantivy-examples/simple_search.html)
- [tantivy-cli and its tutorial](https://github.com/fulmicoton/tantivy-cli).
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
It will walk you through getting a wikipedia search engine up and running in a few minutes.
- [reference doc](http://fulmicoton.com/tantivy/tantivy/index.html).
- [reference doc]
- [For the latest released version](https://docs.rs/tantivy/)
- [For the current master branch](https://tantivy-search.github.io/tantivy/tantivy/index.html)
# Compiling
## Development
Tantivy requires Rust Nightly because it uses the features [`box_syntax`](https://doc.rust-lang.org/stable/unstable-book/language-features/box-syntax.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md),
and [simd](https://github.com/rust-lang/rust/issues/27731).
# Compiling
To check out and run the tests, you can simply run:
By default, `tantivy` uses a git submodule called `simdcomp`.
After cloning the repository, you will need to initialize and update
the submodules. The project can then be built using `cargo`.
git clone git@github.com:tantivy-search/tantivy.git
cd tantivy
cargo +nightly build
git clone git@github.com:fulmicoton/tantivy.git
git submodule init
git submodule update
cargo build
## Note on release build and performance
If your project depends on `tantivy`, for better performance, make sure to enable
`sse3` instructions using RUSTFLAGS. (This instruction set is likely to
be available on most `x86_64` CPUs you will encounter).
For instance,
RUSTFLAGS='-C target-feature=+sse3'
Or, if you are targeting a specific CPU:
RUSTFLAGS='-C target-cpu=native' cargo build --release
Regardless of the flags you pass, by default `tantivy` will contain `SSE3` instructions.
If you want to disable those, you can run the following command:
cargo build --no-default-features
Alternatively, if you are trying to compile `tantivy` without simd compression,
you can disable this functionality. In this case, this submodule is not required
and you can compile tantivy by using the `--no-default-features` flag.
cargo build --no-default-features
cargo build --no-default-features
# Contribute
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

appveyor.yml (new file)

@@ -0,0 +1,25 @@
# Appveyor configuration template for Rust using rustup for Rust installation
# https://github.com/starkat99/appveyor-rust
os: Visual Studio 2015
environment:
matrix:
- channel: nightly
target: x86_64-pc-windows-msvc
- channel: nightly
target: x86_64-pc-windows-gnu
msys_bits: 64
install:
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
- rustup-init -yv --default-toolchain %channel% --default-host %target%
- set PATH=%PATH%;%USERPROFILE%\.cargo\bin
- if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin
- rustc -vV
- cargo -vV
build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
- REM SET RUST_BACKTRACE=1 & cargo run --example simple_search

bin/convert_to_static.rs (new file)

@@ -0,0 +1,20 @@
extern crate tantivy;
use std::env;
use std::fs::File;
use std::io::Write;
use std::path::PathBuf;
use tantivy::directory::write_static_from_directory;
fn main() {
// Skip the binary name and read the two expected arguments.
let mut args = env::args();
args.next().unwrap();
let directory_path = args.next().expect("Expected 2 args: <directory_path> <output_file>");
let output_path = args.next().expect("Expected 2 args: <directory_path> <output_file>");
println!("{} => {}", directory_path, output_path);
// Serialize the whole index directory into a single contiguous buffer.
let buffer = write_static_from_directory(&PathBuf::from(directory_path)).unwrap();
println!("Read all");
// Write the buffer out so it can later be embedded via `include_bytes!`.
let mut output = File::create(output_path).unwrap();
output.write_all(&buffer[..]).unwrap();
output.flush().unwrap();
}

bin/test_static_dir.rs (new file)

@@ -0,0 +1,51 @@
extern crate tantivy;
use tantivy::directory::StaticDirectory;
use tantivy::Index;
use tantivy::query::QueryParser;
use tantivy::collector::TopCollector;
// The index data is embedded into the binary at compile time.
static DATA: &'static [u8] = include_bytes!("output.bin");
fn run() -> tantivy::Result<()> {
// Wrap the embedded bytes in a read-only directory and open the index.
let directory = StaticDirectory::open(DATA).unwrap();
let index = Index::open_directory(directory).unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let schema = index.schema();
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let query_parser = QueryParser::for_index(&index, vec![title, body]);
let query = query_parser.parse_query("sea whale")?;
let mut top_collector = TopCollector::with_limit(10);
searcher.search(&*query, &mut top_collector)?;
let doc_addresses = top_collector.docs();
// The actual documents still need to be
// retrieved from Tantivy's store.
//
// Since the body field was not configured as stored,
// the document returned will only contain
// a title.
for doc_address in doc_addresses {
let retrieved_doc = searcher.doc(&doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())
}
fn main() {
run().unwrap();
}

build.rs (deleted)

@@ -1,40 +0,0 @@
#[cfg(feature= "simdcompression")]
mod build {
extern crate gcc;
use std::process::Command;
pub fn build() {
Command::new("make")
.current_dir("cpp/simdcomp")
.output()
.unwrap_or_else(|e| { panic!("Failed to make simdcomp: {}", e) });
gcc::Config::new()
.cpp(true)
.flag("-std=c++11")
.flag("-O3")
.flag("-mssse3")
.include("./cpp/simdcomp/include")
.object("cpp/simdcomp/avxbitpacking.o")
.object("cpp/simdcomp/simdintegratedbitpacking.o")
.object("cpp/simdcomp/simdbitpacking.o")
.object("cpp/simdcomp/simdpackedsearch.o")
.object("cpp/simdcomp/simdcomputil.o")
.object("cpp/simdcomp/simdpackedselect.o")
.object("cpp/simdcomp/simdfor.o")
.file("cpp/simdcomp_wrapper.cpp")
.compile("libsimdcomp.a");
println!("cargo:rustc-flags=-l dylib=stdc++");
}
}
#[cfg(not(feature= "simdcompression"))]
mod build {
pub fn build() {
}
}
fn main() {
build::build();
}


Submodule cpp/simdcomp deleted from 0dca28668f

cpp/simdcomp_wrapper.cpp (deleted)

@@ -1,48 +0,0 @@
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include "simdcomp.h"
#include "simdcomputil.h"
extern "C" {
// assumes datain has a size of 128 uint32
// and that buffer is large enough to host the data.
size_t compress_sorted_cpp(
const uint32_t* datain,
uint8_t* output,
const uint32_t offset) {
const uint32_t b = simdmaxbitsd1(offset, datain);
*output++ = b;
simdpackwithoutmaskd1(offset, datain, (__m128i *) output, b);
return 1 + b * sizeof(__m128i);
}
// assumes datain has a size of 128 uint32
// and that buffer is large enough to host the data.
size_t uncompress_sorted_cpp(
const uint8_t* compressed_data,
uint32_t* output,
uint32_t offset) {
const uint32_t b = *compressed_data++;
simdunpackd1(offset, (__m128i *)compressed_data, output, b);
return 1 + b * sizeof(__m128i);
}
size_t compress_unsorted_cpp(
const uint32_t* datain,
uint8_t* output) {
const uint32_t b = maxbits(datain);
*output++ = b;
simdpackwithoutmask(datain, (__m128i *) output, b);
return 1 + b * sizeof(__m128i);
}
size_t uncompress_unsorted_cpp(
const uint8_t* compressed_data,
uint32_t* output) {
const uint32_t b = *compressed_data++;
simdunpack((__m128i *)compressed_data, output, b);
return 1 + b * sizeof(__m128i);
}
}

examples/simple_search.rs

@@ -1,7 +1,9 @@
extern crate rustc_serialize;
extern crate tantivy;
extern crate tempdir;
#[macro_use]
extern crate serde_json;
use std::path::Path;
use tempdir::TempDir;
use tantivy::Index;
@@ -10,149 +12,161 @@ use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
fn main() {
// Let's create a temporary directory for the
// Let's create a temporary directory for the
// sake of this example
if let Ok(dir) = TempDir::new("tantivy_example_dir") {
run_example(dir.path()).unwrap();
dir.close().unwrap();
}
}
}
fn run_example(index_path: &Path) -> tantivy::Result<()> {
// # Defining the schema
//
// The Tantivy index requires a very strict schema.
// The schema declares which fields are in the index,
// and for each field, its type and "the way it should
// and for each field, its type and "the way it should
// be indexed".
// first we need to define a schema ...
let mut schema_builder = SchemaBuilder::default();
// Our first field is title.
// We want full-text search for it, and we want to be able
// to retrieve the document after the search.
// We want full-text search for it, and we also want
// to be able to retrieve the document after the search.
//
// TEXT | STORED is some syntactic sugar to describe
// that.
//
// that.
//
// `TEXT` means the field should be tokenized and indexed,
// along with its term frequency and term positions.
//
// `STORED` means that the field will also be saved
// in a compressed, row-oriented key-value store.
// This store is useful to reconstruct the
// This store is useful to reconstruct the
// documents that were selected during the search phase.
schema_builder.add_text_field("title", TEXT | STORED);
// Our first field is body.
// We want full-text search for it, and we want to be able
// to retrieve the body after the search.
// Our second field is body.
// We want full-text search for it, but we do not
// need to be able to retrieve it
// for our application.
//
// We can make our index lighter
// by omitting the `STORED` flag.
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let schema = schema_builder.build();
// # Indexing documents
//
// Let's create a brand new index.
//
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = try!(Index::create(index_path, schema.clone()));
let index = Index::create(index_path, schema.clone())?;
// To insert document we need an index writer.
// There must be only one writer at a time.
// This single `IndexWriter` is already
// multithreaded.
//
// Here we use a buffer of 1 GB. Using a bigger
// Here we use a buffer of 50MB per thread. Using a bigger
// heap for the indexer can increase its throughput.
// This buffer will be split between the indexing
// threads.
let mut index_writer = try!(index.writer(1_000_000_000));
let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
// ### Create a document "manually".
//
// We can create a document manually, by setting the fields
// one by one in a Document object.
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(body, "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.");
old_man_doc.add_text(
body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.",
);
// ... and add it to the `IndexWriter`.
try!(index_writer.add_document(old_man_doc));
index_writer.add_document(old_man_doc);
// ### Create a document directly from json.
//
// Alternatively, we can use our schema to parse
// a document object directly from json.
let mice_and_men_doc = try!(schema.parse_document(r#"{
// Alternatively, we can use our schema to parse a
// document object directly from json.
// The document is a string, but we use the `json` macro
// from `serde_json` for the convenience of multi-line support.
let json = json!({
"title": "Of Mice and Men",
"body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winters flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool"
}"#));
try!(index_writer.add_document(mice_and_men_doc));
"body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winter's flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
});
let mice_and_men_doc = schema.parse_document(&json.to_string())?;
index_writer.add_document(mice_and_men_doc);
// Multi-valued fields are allowed; they are
// expressed in JSON by an array.
// The following document has two titles.
let frankenstein_doc = try!(schema.parse_document(r#"{
"title": ["Frankenstein", "The Modern Promotheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
}"#));
try!(index_writer.add_document(frankenstein_doc));
let json = json!({
"title": ["Frankenstein", "The Modern Prometheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
});
let frankenstein_doc = schema.parse_document(&json.to_string())?;
index_writer.add_document(frankenstein_doc);
// This is an example, so we will only index 3 documents
// here. You can check out tantivy's tutorial to index
// the English wikipedia. Tantivy's indexing is rather fast.
// the English wikipedia. Tantivy's indexing is rather fast.
// Indexing 5 million articles of the English wikipedia takes
// around 4 minutes on my computer!
// ### Committing
//
//
// At this point our documents are not searchable.
//
//
//
// We need to call .commit() explicitly to force the
// index_writer to finish processing the documents in the queue,
// flush the current index to the disk, and advertise
// the existence of new documents.
//
// This call is blocking.
try!(index_writer.commit());
index_writer.commit()?;
// If `.commit()` returns correctly, then all of the
// documents that have been added are guaranteed to be
// persistently indexed.
//
//
// In the scenario of a crash or a power failure,
// tantivy behaves as if it had rolled back to its last
// commit.
// # Searching
//
// Let's search our index. We start
// by creating a searcher. There can be more
// than one searcher at a time.
//
// Let's search our index. Start by reloading
// searchers in the index. This should be done
// after every commit().
index.load_searchers()?;
// Afterwards create one (or more) searchers.
//
// You should create a searcher
// every time you start a "search query".
let searcher = index.searcher();
@@ -161,47 +175,51 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// Here, if the user does not specify which
// field they want to search, tantivy will search
// in both title and body.
let query_parser = QueryParser::new(index.schema(), vec!(title, body));
let query_parser = QueryParser::for_index(&index, vec![title, body]);
// QueryParser may fail if the query is not in the right
// format. For user facing applications, this can be a problem.
// A ticket has been opened regarding this problem.
let query = try!(query_parser.parse_query("sea whale"));
let query = query_parser.parse_query("sea whale")?;
// A query defines a set of documents, as
// well as the way they should be scored.
//
//
// A query created by the query parser is scored according
// to a metric called Tf-Idf, and will consider
// any document matching at least one of our terms.
// ### Collectors
// ### Collectors
//
// We are not interested in all of the documents but
// We are not interested in all of the documents but
// only in the top 10. Keeping track of our top 10 best documents
// is the role of the TopCollector.
let mut top_collector = TopCollector::with_limit(10);
// We can now perform our query.
try!(searcher.search(&*query, &mut top_collector));
// Our top collector now contains the 10
// We can now perform our query.
searcher.search(&*query, &mut top_collector)?;
// Our top collector now contains the 10
// most relevant doc ids...
let doc_addresses = top_collector.docs();
// The actual documents still need to be
// The actual documents still need to be
// retrieved from Tantivy's store.
//
//
// Since the body field was not configured as stored,
// the document returned will only contain
// a title.
for doc_address in doc_addresses {
let retrieved_doc = try!(searcher.doc(&doc_address));
println!("{}", schema.to_json(&retrieved_doc));
let retrieved_doc = searcher.doc(&doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
// Wait for indexing and merging threads to shut down.
// Usually this isn't needed, but in `main` we try to
// delete the temporary directory and that fails on
// Windows if the files are still open.
index_writer.wait_merging_threads()?;
Ok(())
}

rustfmt.toml (new file)

@@ -0,0 +1 @@
use_try_shorthand = true


@@ -1,10 +0,0 @@
#!/bin/bash
DEST=target/doc/tantivy/docs/
mkdir -p $DEST
for f in $(ls docs/*.md)
do
rustdoc $f -o $DEST --markdown-css ../../rustdoc.css --markdown-css style.css
done
cp docs/*.css $DEST


@@ -1,5 +0,0 @@
#!/bin/bash
valgrind --tool=cachegrind target/release/tantivy-bench -i /data/wiki-index -q ./queries.txt -n 3
valgrind --tool=callgrind target/release/tantivy-bench -i /data/wiki-index -q ./queries.txt -n 3


@@ -1,86 +0,0 @@
extern crate regex;
use std::str::Chars;
use std::ascii::AsciiExt;
pub struct TokenIter<'a> {
chars: Chars<'a>,
term_buffer: String,
}
fn append_char_lowercase(c: char, term_buffer: &mut String) {
term_buffer.push(c.to_ascii_lowercase());
}
pub trait StreamingIterator<'a, T> {
fn next(&'a mut self) -> Option<T>;
}
impl<'a, 'b> TokenIter<'b> {
fn consume_token(&'a mut self) -> Option<&'a str> {
for c in &mut self.chars {
if c.is_alphanumeric() {
append_char_lowercase(c, &mut self.term_buffer);
}
else {
break;
}
}
Some(&self.term_buffer)
}
}
impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
#[inline]
fn next(&'a mut self,) -> Option<&'a str> {
self.term_buffer.clear();
// skipping non-letter characters.
loop {
match self.chars.next() {
Some(c) => {
if c.is_alphanumeric() {
append_char_lowercase(c, &mut self.term_buffer);
return self.consume_token();
}
}
None => { return None; }
}
}
}
}
pub struct SimpleTokenizer;
impl SimpleTokenizer {
pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
TokenIter {
term_buffer: String::new(),
chars: text.chars(),
}
}
}
#[test]
fn test_tokenizer() {
let simple_tokenizer = SimpleTokenizer;
let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!");
assert_eq!(term_reader.next().unwrap(), "hello");
assert_eq!(term_reader.next().unwrap(), "happy");
assert_eq!(term_reader.next().unwrap(), "tax");
assert_eq!(term_reader.next().unwrap(), "payer");
assert_eq!(term_reader.next(), None);
}
#[test]
fn test_tokenizer_empty() {
let simple_tokenizer = SimpleTokenizer;
let mut term_reader = simple_tokenizer.tokenize("");
assert_eq!(term_reader.next(), None);
}

src/collector/chained_collector.rs

@@ -1,22 +1,25 @@
use Result;
use collector::Collector;
use SegmentLocalId;
use SegmentReader;
use std::io;
use DocId;
use Score;
/// Collector that does nothing.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
pub struct DoNothingCollector;
impl Collector for DoNothingCollector {
#[inline]
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
#[inline]
fn collect(&mut self, _doc: DocId, _score: Score) {}
#[inline]
fn requires_scoring(&self) -> bool {
false
}
}
/// Zero-cost abstraction used to collect on multiple collectors.
@@ -24,10 +27,10 @@ impl Collector for DoNothingCollector {
/// are known at compile time.
pub struct ChainedCollector<Left: Collector, Right: Collector> {
left: Left,
right: Right
right: Right,
}
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
/// Adds a collector
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
ChainedCollector {
@@ -38,9 +41,13 @@ impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
}
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
try!(self.left.set_segment(segment_local_id, segment));
try!(self.right.set_segment(segment_local_id, segment));
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
self.left.set_segment(segment_local_id, segment)?;
self.right.set_segment(segment_local_id, segment)?;
Ok(())
}
@@ -48,6 +55,10 @@ impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Rig
self.left.collect(doc, score);
self.right.collect(doc, score);
}
fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
}
}
/// Creates a `ChainedCollector`
@@ -58,7 +69,6 @@ pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
}
}
#[cfg(test)]
mod tests {
@@ -70,9 +80,7 @@ mod tests {
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = chain()
.push(&mut top_collector)
.push(&mut count_collector);
let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);
@@ -80,4 +88,4 @@ mod tests {
assert_eq!(count_collector.count(), 3);
assert!(top_collector.at_capacity());
}
}
}

src/collector/count_collector.rs

@@ -1,12 +1,13 @@
use std::io;
use super::Collector;
use DocId;
use Score;
use Result;
use SegmentReader;
use SegmentLocalId;
/// `CountCollector` collector only counts how many
/// documents match the query.
/// documents match the query.
#[derive(Default)]
pub struct CountCollector {
count: usize,
}
@@ -14,44 +15,40 @@ pub struct CountCollector {
impl CountCollector {
/// Returns the count of documents that were
/// collected.
pub fn count(&self,) -> usize {
pub fn count(&self) -> usize {
self.count
}
}
impl Default for CountCollector {
fn default() -> CountCollector {
CountCollector {count: 0,
}
}
}
impl Collector for CountCollector {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
}
fn requires_scoring(&self) -> bool {
false
}
}
#[cfg(test)]
mod tests {
use super::*;
use test::Bencher;
use collector::Collector;
use collector::{Collector, CountCollector};
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {
let mut count_collector = CountCollector::default();
for doc in 0..1_000_000 {
count_collector.collect(doc, 1f32);
}
count_collector.count()
});
#[test]
fn test_count_collector() {
let mut count_collector = CountCollector::default();
assert_eq!(count_collector.count(), 0);
count_collector.collect(0u32, 1f32);
assert_eq!(count_collector.count(), 1);
assert_eq!(count_collector.count(), 1);
count_collector.collect(1u32, 1f32);
assert_eq!(count_collector.count(), 2);
assert!(!count_collector.requires_scoring());
}
}

src/collector/facet_collector.rs (new file)

@@ -0,0 +1,637 @@
use std::mem;
use collector::Collector;
use fastfield::FacetReader;
use schema::Field;
use std::cell::UnsafeCell;
use schema::Facet;
use std::collections::BTreeMap;
use std::collections::BinaryHeap;
use std::collections::Bound;
use termdict::TermDictionary;
use termdict::TermStreamer;
use termdict::TermStreamerBuilder;
use std::collections::BTreeSet;
use termdict::TermMerger;
use docset::SkipResult;
use std::{usize, u64};
use std::iter::Peekable;
use DocId;
use Result;
use Score;
use SegmentReader;
use SegmentLocalId;
use std::cmp::Ordering;
struct Hit<'a> {
count: u64,
facet: &'a Facet,
}
impl<'a> Eq for Hit<'a> {}
impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
fn eq(&self, other: &Hit) -> bool {
self.count == other.count
}
}
impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
fn partial_cmp(&self, other: &Hit) -> Option<Ordering> {
Some(self.cmp(other))
}
}
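// `Ord` is deliberately reversed (larger counts compare as smaller) so that
// `BinaryHeap<Hit>` behaves as a min-heap on counts: `peek()` returns the
// hit with the lowest count.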
impl<'a> Ord for Hit<'a> {
fn cmp(&self, other: &Self) -> Ordering {
other.count.cmp(&self.count)
}
}
struct SegmentFacetCounter {
pub facet_reader: FacetReader,
pub facet_ords: Vec<u64>,
pub facet_counts: Vec<u64>,
}
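// Facet paths are encoded with a 0u8 byte separating segments, so the depth
// of a non-empty facet is its number of separator bytes plus one.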
fn facet_depth(facet_bytes: &[u8]) -> usize {
if facet_bytes.is_empty() {
0
} else {
facet_bytes.iter().cloned().filter(|b| *b == 0u8).count() + 1
}
}
/// Collector for faceting
///
/// The collector collects all facets. You need to configure it
/// beforehand with the facet you want to extract.
///
/// This is done by calling `.add_facet(...)` with the root of the
/// facet you want to extract as argument.
///
/// Facet counts will only be computed for the facets that are direct children
/// of such a root facet.
///
/// For instance, if your index represents books, your hierarchy of facets
/// may contain `category`, `language`.
///
/// The category facet may include `subcategories`. For instance, a book
/// could belong to `/category/fiction/fantasy`.
///
/// If you request the facet counts for `/category`, the result will be
/// the breakdown of counts for the direct children of `/category`
/// (e.g. `/category/fiction`, `/category/biography`, `/category/personal_development`).
///
/// Once collection is finished, you can harvest its results in the form
/// of a `FacetCounts` object, and extract your facet counts from it.
///
/// This implementation assumes that the number of facets you are working with
/// is hundreds of times lower than your number of documents.
///
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{Facet, SchemaBuilder, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::FacetCollector;
/// use tantivy::query::AllQuery;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
/// let mut schema_builder = SchemaBuilder::new();
///
/// // Facets have their own specific type.
/// // It is not a bad practice to put all of your
/// // facet information in the same field.
/// let facet = schema_builder.add_facet_field("facet");
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// {
/// let mut index_writer = index.writer(3_000_000)?;
/// // a document can be associated to any number of facets
/// index_writer.add_document(doc!(
/// title => "The Name of the Wind",
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/fiction/fantasy")
/// ));
/// index_writer.add_document(doc!(
/// title => "Dune",
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/fiction/sci-fi")
/// ));
/// index_writer.add_document(doc!(
/// title => "La Vénus d'Ille",
/// facet => Facet::from("/lang/fr"),
/// facet => Facet::from("/category/fiction/fantasy"),
/// facet => Facet::from("/category/fiction/horror")
/// ));
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/biography")
/// ));
/// index_writer.commit().unwrap();
/// }
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/lang");
/// facet_collector.add_facet("/category");
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
///
/// // this object contains count aggregate for all of the facets.
/// let counts = facet_collector.harvest();
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = counts
/// .get("/category")
/// .collect();
/// assert_eq!(facets, vec![
/// (&Facet::from("/category/biography"), 1),
/// (&Facet::from("/category/fiction"), 3)
/// ]);
/// }
///
/// {
/// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/category/fiction");
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
///
/// // this object contains count aggregate for all of the facets.
/// let counts = facet_collector.harvest();
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = counts
/// .get("/category/fiction")
/// .collect();
/// assert_eq!(facets, vec![
/// (&Facet::from("/category/fiction/fantasy"), 2),
/// (&Facet::from("/category/fiction/horror"), 1),
/// (&Facet::from("/category/fiction/sci-fi"), 1)
/// ]);
/// }
///
/// {
/// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/category/fiction");
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
///
/// // this object contains count aggregate for all of the facets.
/// let counts = facet_collector.harvest();
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = counts.top_k("/category/fiction", 1);
/// assert_eq!(facets, vec![
/// (&Facet::from("/category/fiction/fantasy"), 2)
/// ]);
/// }
///
/// Ok(())
/// }
/// ```
pub struct FacetCollector {
facet_ords: Vec<u64>,
field: Field,
ff_reader: Option<UnsafeCell<FacetReader>>,
segment_counters: Vec<SegmentFacetCounter>,
// facet_ord -> collapse facet_id
current_segment_collapse_mapping: Vec<usize>,
// collapse facet_id -> count
current_segment_counts: Vec<u64>,
// collapse facet_id -> facet_ord
current_collapse_facet_ords: Vec<u64>,
facets: BTreeSet<Facet>,
}
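// Advances `collapse_it` until it reaches a facet equal to `target`,
// oversteps it, or is exhausted.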
fn skip<'a, I: Iterator<Item = &'a Facet>>(
target: &[u8],
collapse_it: &mut Peekable<I>,
) -> SkipResult {
loop {
match collapse_it.peek() {
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;
}
Ordering::Equal => {
return SkipResult::Reached;
}
},
None => {
return SkipResult::End;
}
}
collapse_it.next();
}
}
impl FacetCollector {
/// Create a facet collector to collect the facets
/// from a specific facet `Field`.
///
/// This function does not check whether the field
/// is of the proper type.
pub fn for_field(field: Field) -> FacetCollector {
FacetCollector {
facet_ords: Vec::with_capacity(255),
segment_counters: Vec::new(),
field,
ff_reader: None,
facets: BTreeSet::new(),
current_segment_collapse_mapping: Vec::new(),
current_collapse_facet_ords: Vec::new(),
current_segment_counts: Vec::new(),
}
}
/// Adds a facet that we want to record counts
///
/// Adding facet `Facet::from("/country")` for instance,
/// will record the counts of all of the direct children of the facet country
/// (e.g. `/country/FR`, `/country/UK`).
///
/// Adding two facets where one is a prefix of the other is forbidden.
/// If you need the correct number of unique documents for two such facets,
/// just add them to separate `FacetCollector`s.
pub fn add_facet<T>(&mut self, facet_from: T)
where
Facet: From<T>,
{
let facet = Facet::from(facet_from);
for old_facet in &self.facets {
assert!(
!old_facet.is_prefix_of(&facet),
"Tried to add a facet which is a descendant of an already added facet."
);
assert!(
!facet.is_prefix_of(old_facet),
"Tried to add a facet which is an ancestor of an already added facet."
);
}
self.facets.insert(facet);
}
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
self.current_segment_collapse_mapping.clear();
self.current_collapse_facet_ords.clear();
self.current_segment_counts.clear();
let mut collapse_facet_it = self.facets.iter().peekable();
self.current_collapse_facet_ords.push(0);
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if !facet_streamer.advance() {
return;
}
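// Walk the facet dictionary in sorted order. Terms under one of the
// requested facets are collapsed onto the ordinal of their direct child
// of that facet; every other term is mapped to the sentinel ordinal 0.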
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
self.current_segment_collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
continue 'outer;
}
if depth == collapse_depth + 1 {
collapsed_id = self.current_collapse_facet_ords.len();
self.current_collapse_facet_ords
.push(facet_streamer.term_ord());
self.current_segment_collapse_mapping.push(collapsed_id);
} else {
self.current_segment_collapse_mapping.push(collapsed_id);
}
}
break;
}
SkipResult::End | SkipResult::OverStep => {
self.current_segment_collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}
fn finalize_segment(&mut self) {
if self.ff_reader.is_some() {
self.segment_counters.push(SegmentFacetCounter {
facet_reader: self.ff_reader.take().unwrap().into_inner(),
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
});
}
}
/// Returns the results of the collection.
///
/// This method does not just return the counters,
/// it also translates the facet ordinals of the last segment.
pub fn harvest(mut self) -> FacetCounts {
self.finalize_segment();
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_ords[..])
.collect();
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_counts[..])
.collect();
let facet_streams = self.segment_counters
.iter()
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
.collect::<Vec<_>>();
let mut facet_merger = TermMerger::new(facet_streams);
let mut facet_counts = BTreeMap::new();
while facet_merger.advance() {
let count = facet_merger
.current_kvs()
.iter()
.map(|it| {
let seg_ord = it.segment_ord;
let term_ord = it.streamer.term_ord();
collapsed_facet_ords[seg_ord]
.binary_search(&term_ord)
.map(|collapsed_term_id| {
if collapsed_term_id == 0 {
0
} else {
collapsed_facet_counts[seg_ord][collapsed_term_id]
}
})
.unwrap_or(0)
})
.sum();
if count > 0u64 {
let bytes = facet_merger.key().to_owned();
facet_counts.insert(Facet::from_encoded(bytes), count);
}
}
FacetCounts { facet_counts }
}
}
impl Collector for FacetCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.finalize_segment();
let facet_reader = reader.facet_reader(self.field)?;
self.set_collapse_mapping(&facet_reader);
self.current_segment_counts
.resize(self.current_collapse_facet_ords.len(), 0);
self.ff_reader = Some(UnsafeCell::new(facet_reader));
Ok(())
}
fn collect(&mut self, doc: DocId, _: Score) {
let facet_reader: &mut FacetReader = unsafe {
&mut *self.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.get()
};
facet_reader.facet_ords(doc, &mut self.facet_ords);
let mut previous_collapsed_ord: usize = usize::MAX;
for &facet_ord in &self.facet_ords {
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
{
0
} else {
1
};
previous_collapsed_ord = collapsed_ord;
}
}
fn requires_scoring(&self) -> bool {
false
}
}
/// Intermediary result of the `FacetCollector` that stores
/// the facet counts for all the segments.
pub struct FacetCounts {
facet_counts: BTreeMap<Facet, u64>,
}
impl FacetCounts {
#[allow(needless_lifetimes)] //< compiler fails if we remove the lifetime
pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator<Item = (&'a Facet, u64)>
where
Facet: From<T>,
{
let facet = Facet::from(facet_from);
let left_bound = Bound::Excluded(facet.clone());
let right_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = Facet::from_encoded(facet_after_bytes);
Bound::Excluded(facet_after)
};
self.facet_counts
.range((left_bound, right_bound))
.map(|(facet, count)| (facet, *count))
}
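// Note on the bounds above: facet paths are encoded with a `0u8`
// separator between segments, so every descendant of `facet` is encoded
// as `facet.encoded_bytes()` followed by `0u8` and more bytes. Appending
// `1u8` therefore yields the smallest key sorting after all descendants,
// and the range covers exactly the subtree below `facet`.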
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
where
Facet: From<T>,
{
let mut heap = BinaryHeap::with_capacity(k);
let mut it = self.get(facet);
for (facet, count) in (&mut it).take(k) {
heap.push(Hit { count, facet });
}
let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN);
for (facet, count) in it {
if count > lowest_count {
if let Some(mut head) = heap.peek_mut() {
*head = Hit { count, facet };
}
// the heap reorders itself when `peek_mut` is dropped, so re-read the
// lowest count instead of assuming the evicted slot now holds `count`.
lowest_count = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN);
}
}
heap.into_sorted_vec()
.into_iter()
.map(|hit| (hit.facet, hit.count))
.collect::<Vec<_>>()
}
}
#[cfg(test)]
mod tests {
use test::Bencher;
use core::Index;
use schema::{Document, Facet, SchemaBuilder};
use query::AllQuery;
use super::{FacetCollector, FacetCounts};
use std::iter;
use schema::Field;
use rand::{thread_rng, Rng};
#[test]
fn test_facet_collector_drilldown() {
let mut schema_builder = SchemaBuilder::new();
let facet_field = schema_builder.add_facet_field("facet");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(3_000_000).unwrap();
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
let top = n % 3;
n /= 3;
let mid = n % 4;
n /= 4;
let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
})
.collect();
for i in 0..num_facets * 10 {
let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top1"));
searcher.search(&AllQuery, &mut facet_collector).unwrap();
let counts: FacetCounts = facet_collector.harvest();
{
let facets: Vec<(String, u64)> = counts
.get("/top1")
.map(|(facet, count)| (facet.to_string(), count))
.collect();
assert_eq!(
facets,
[
("/top1/mid0", 50),
("/top1/mid1", 50),
("/top1/mid2", 50),
("/top1/mid3", 50),
].iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
}
}
#[test]
#[should_panic(expected = "Tried to add a facet which is a descendant of \
an already added facet.")]
fn test_misused_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
facet_collector.add_facet(Facet::from("/country"));
facet_collector.add_facet(Facet::from("/country/europe"));
}
#[test]
fn test_non_used_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
facet_collector.add_facet(Facet::from("/country"));
facet_collector.add_facet(Facet::from("/countryeurope"));
}
#[test]
fn test_facet_collector_topk() {
let mut schema_builder = SchemaBuilder::new();
let facet_field = schema_builder.add_facet_field("facet");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
.into_iter()
.flat_map(|(c, count)| {
let facet = Facet::from(&format!("/facet_{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.collect();
thread_rng().shuffle(&mut docs[..]);
let mut index_writer = index.writer(3_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/");
searcher.search(&AllQuery, &mut facet_collector).unwrap();
let counts: FacetCounts = facet_collector.harvest();
{
let facets: Vec<(&Facet, u64)> = counts.top_k("/", 3);
assert_eq!(
facets,
vec![
(&Facet::from("/facet_b"), 100),
(&Facet::from("/facet_e"), 21),
(&Facet::from("/facet_d"), 12),
]
);
}
}
#[bench]
fn bench_facet_collector(b: &mut Bencher) {
let mut schema_builder = SchemaBuilder::new();
let facet_field = schema_builder.add_facet_field("facet");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut docs = vec![];
for val in 0..50 {
let facet = Facet::from(&format!("/facet_{}", val));
for _ in 0..val * val {
docs.push(doc!(facet_field=>facet.clone()));
}
}
// 40425 docs
thread_rng().shuffle(&mut docs[..]);
let mut index_writer = index.writer(3_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
index.load_searchers().unwrap();
b.iter(|| {
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
searcher.search(&AllQuery, &mut facet_collector).unwrap();
});
}
}


@@ -0,0 +1,123 @@
use std::cmp::Eq;
use std::collections::HashMap;
use std::hash::Hash;
use collector::Collector;
use fastfield::FastFieldReader;
use schema::Field;
use DocId;
use Result;
use Score;
use SegmentReader;
use SegmentLocalId;
/// Facet collector for i64/u64 fast field
pub struct IntFacetCollector<T>
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
counters: HashMap<T::ValueType, u64>,
field: Field,
ff_reader: Option<T>,
}
impl<T> IntFacetCollector<T>
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
/// Creates a new facet collector for aggregating a given field.
pub fn new(field: Field) -> IntFacetCollector<T> {
IntFacetCollector {
counters: HashMap::new(),
field: field,
ff_reader: None,
}
}
}
impl<T> Collector for IntFacetCollector<T>
where
T: FastFieldReader,
T::ValueType: Eq + Hash,
{
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
Ok(())
}
fn collect(&mut self, doc: DocId, _: Score) {
let val = self.ff_reader
.as_ref()
.expect(
"collect() was called before set_segment. \
This should never happen.",
)
.get(doc);
*(self.counters.entry(val).or_insert(0)) += 1;
}
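// `requires_scoring` is required by the `Collector` trait as extended in
// this change; plain facet counting never needs the scores.
fn requires_scoring(&self) -> bool {
false
}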
}
#[cfg(test)]
mod tests {
use collector::{chain, IntFacetCollector};
use query::QueryParser;
use fastfield::{I64FastFieldReader, U64FastFieldReader};
use schema::{self, FAST, STRING};
use Index;
#[test]
// create 10 documents, set num field value to 0 or 1 for even/odd ones
// make sure we have facet counters correctly filled
fn test_facet_collector_results() {
let mut schema_builder = schema::SchemaBuilder::new();
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
let text_field = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
for i in 0u64..10u64 {
index_writer.add_document(doc!(
num_field_i64 => ((i as i64) % 3i64) as i64,
num_field_u64 => (i % 2u64) as u64,
text_field => "text"
));
}
}
assert_eq!(index_writer.commit().unwrap(), 10u64);
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);
{
// perform the query
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
let mut query_parser = QueryParser::for_index(index, vec![text_field]);
let query = query_parser.parse_query("text:text").unwrap();
query.search(&searcher, &mut facet_collectors).unwrap();
}
assert_eq!(ffvf_u64.counters[&0], 5);
assert_eq!(ffvf_u64.counters[&1], 5);
assert_eq!(ffvf_i64.counters[&0], 4);
assert_eq!(ffvf_i64.counters[&1], 3);
}
}


@@ -1,8 +1,12 @@
/*!
Defines how the documents matching a search query should be processed.
*/
use SegmentReader;
use SegmentLocalId;
use DocId;
use Score;
use std::io;
use Result;
mod count_collector;
pub use self::count_collector::CountCollector;
@@ -13,14 +17,17 @@ pub use self::multi_collector::MultiCollector;
mod top_collector;
pub use self::top_collector::TopCollector;
mod facet_collector;
pub use self::facet_collector::FacetCollector;
mod chained_collector;
pub use self::chained_collector::chain;
/// Collectors are in charge of collecting and retaining relevant
/// information from the documents found and scored by the query.
///
/// For instance,
///
/// - keeping track of the top 10 best documents
/// - computing a breakdown over a fast field
@@ -29,7 +36,7 @@ pub use self::chained_collector::chain;
/// Queries are in charge of pushing the `DocSet` to the collector.
///
/// As they work on multiple segments, they first inform
/// the collector of a change in a segment and then
/// call the `collect` method to push the document to the collector.
///
/// Over the course of a search, our collector will receive calls
@@ -46,25 +53,38 @@ pub use self::chained_collector::chain;
///
/// Segments are not guaranteed to be visited in any specific order.
pub trait Collector {
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>;
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
/// Returns true iff the collector requires the documents' scores to be computed.
fn requires_scoring(&self) -> bool;
}
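// A minimal sketch of a custom collector (`DocCounter` is a hypothetical
// name, not part of the crate):
//
//     struct DocCounter {
//         count: usize,
//     }
//
//     impl Collector for DocCounter {
//         fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
//             Ok(())
//         }
//         fn collect(&mut self, _: DocId, _: Score) {
//             self.count += 1;
//         }
//         fn requires_scoring(&self) -> bool {
//             false
//         }
//     }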
impl<'a, C: Collector> Collector for &'a mut C {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score) {
(*self).collect(doc, score);
C::collect(self, doc, score)
}
fn requires_scoring(&self) -> bool {
C::requires_scoring(self)
}
}
#[cfg(test)]
pub mod tests {
@@ -73,11 +93,10 @@ pub mod tests {
use DocId;
use Score;
use core::SegmentReader;
use std::io;
use SegmentLocalId;
use fastfield::U32FastFieldReader;
use fastfield::FastFieldReader;
use schema::Field;
/// Stores all of the doc ids.
/// This collector is only used for tests.
/// It is unusable in practice, as it does not store
@@ -86,68 +105,75 @@ pub mod tests {
offset: DocId,
segment_max_doc: DocId,
docs: Vec<DocId>,
scores: Vec<Score>,
}
impl TestCollector {
/// Returns the exhaustive list of documents.
pub fn docs(self,) -> Vec<DocId> {
pub fn docs(self) -> Vec<DocId> {
self.docs
}
pub fn scores(self) -> Vec<Score> {
self.scores
}
}
impl Default for TestCollector {
fn default() -> TestCollector {
TestCollector {
docs: Vec::new(),
offset: 0,
segment_max_doc: 0,
docs: Vec::new(),
scores: Vec::new(),
}
}
}
impl Collector for TestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
Ok(())
}
fn collect(&mut self, doc: DocId, _score: Score) {
fn collect(&mut self, doc: DocId, score: Score) {
self.docs.push(doc + self.offset);
self.scores.push(score);
}
fn requires_scoring(&self) -> bool {
true
}
}
/// Collects, in order, the fast field values of all of the
/// docs in the `DocSet`.
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
vals: Vec<u32>,
vals: Vec<u64>,
field: Field,
ff_reader: Option<U32FastFieldReader>,
ff_reader: Option<FastFieldReader<u64>>,
}
impl FastFieldTestCollector {
pub fn for_field(field: Field) -> FastFieldTestCollector {
FastFieldTestCollector {
vals: Vec::new(),
field: field,
field,
ff_reader: None,
}
}
pub fn vals(&self,) -> &Vec<u32> {
&self.vals
pub fn vals(self) -> Vec<u64> {
self.vals
}
}
impl Collector for FastFieldTestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> {
self.ff_reader = Some(try!(reader.get_fast_field_reader(self.field)));
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
Ok(())
}
@@ -155,9 +181,11 @@ pub mod tests {
let val = self.ff_reader.as_ref().unwrap().get(doc);
self.vals.push(val);
}
fn requires_scoring(&self) -> bool {
false
}
}
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {


@@ -1,13 +1,12 @@
use std::io;
use super::Collector;
use DocId;
use Score;
use Result;
use SegmentReader;
use SegmentLocalId;
/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector type is unknown
/// at compile time.
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
pub struct MultiCollector<'a> {
@@ -17,17 +16,18 @@ pub struct MultiCollector<'a> {
impl<'a> MultiCollector<'a> {
/// Constructor
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
MultiCollector {
collectors: collectors,
}
MultiCollector { collectors }
}
}
impl<'a> Collector for MultiCollector<'a> {
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
for collector in &mut self.collectors {
try!(collector.set_segment(segment_local_id, segment));
collector.set_segment(segment_local_id, segment)?;
}
Ok(())
}
@@ -37,10 +37,13 @@ impl<'a> Collector for MultiCollector<'a> {
collector.collect(doc, score);
}
}
fn requires_scoring(&self) -> bool {
self.collectors
.iter()
.any(|collector| collector.requires_scoring())
}
}
#[cfg(test)]
mod tests {
@@ -52,7 +55,8 @@ mod tests {
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = MultiCollector::from(vec!(&mut top_collector, &mut count_collector));
let mut collectors =
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);


@@ -1,8 +1,8 @@
use std::io;
use super::Collector;
use SegmentReader;
use SegmentLocalId;
use DocAddress;
use Result;
use std::collections::BinaryHeap;
use std::cmp::Ordering;
use DocId;
@@ -12,8 +12,7 @@ use Score;
#[derive(Clone, Copy)]
struct GlobalScoredDoc {
score: Score,
doc_address: DocAddress
doc_address: DocAddress,
}
impl PartialOrd for GlobalScoredDoc {
@@ -25,10 +24,10 @@ impl PartialOrd for GlobalScoredDoc {
impl Ord for GlobalScoredDoc {
#[inline]
fn cmp(&self, other: &GlobalScoredDoc) -> Ordering {
other.score.partial_cmp(&self.score)
.unwrap_or(
other.doc_address.cmp(&self.doc_address)
)
other
.score
.partial_cmp(&self.score)
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
}
}
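// Comparing `other.score` against `self.score` reverses the natural order,
// so the `BinaryHeap` used below behaves as a min-heap: `peek()` exposes
// the lowest-scoring entry, i.e. the first candidate for eviction.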
@@ -40,7 +39,6 @@ impl PartialEq for GlobalScoredDoc {
impl Eq for GlobalScoredDoc {}
/// The Top Collector keeps track of the K documents
/// with the best scores.
///
@@ -53,7 +51,6 @@ pub struct TopCollector {
}
impl TopCollector {
/// Creates a top collector, with a number of documents equal to "limit".
///
/// # Panics
@@ -63,14 +60,14 @@ impl TopCollector {
panic!("Limit must be strictly greater than 0.");
}
TopCollector {
limit: limit,
limit,
heap: BinaryHeap::with_capacity(limit),
segment_id: 0,
}
}
/// Returns K best documents sorted in decreasing order.
///
/// Calling this method triggers the sort.
/// The result of the sort is not cached.
pub fn docs(&self) -> Vec<DocAddress> {
@@ -81,31 +78,28 @@ impl TopCollector {
}
/// Returns K best ScoredDocument sorted in decreasing order.
///
/// Calling this method triggers the sort.
/// The result of the sort is not cached.
pub fn score_docs(&self) -> Vec<(Score, DocAddress)> {
let mut scored_docs: Vec<GlobalScoredDoc> = self.heap
.iter()
.cloned()
.collect();
let mut scored_docs: Vec<GlobalScoredDoc> = self.heap.iter().cloned().collect();
scored_docs.sort();
scored_docs.into_iter()
.map(|GlobalScoredDoc {score, doc_address}| (score, doc_address))
scored_docs
.into_iter()
.map(|GlobalScoredDoc { score, doc_address }| (score, doc_address))
.collect()
}
/// Return true iff at least K documents have gone through
/// the collector.
#[inline]
pub fn at_capacity(&self, ) -> bool {
pub fn at_capacity(&self) -> bool {
self.heap.len() >= self.limit
}
}
impl Collector for TopCollector {
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> {
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
self.segment_id = segment_id;
Ok(())
}
@@ -113,25 +107,30 @@ impl Collector for TopCollector {
fn collect(&mut self, doc: DocId, score: Score) {
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.
let limit_doc: GlobalScoredDoc = *self.heap.peek().expect("Top collector with size 0 is forbidden");
let limit_doc: GlobalScoredDoc = *self.heap
.peek()
.expect("Top collector with size 0 is forbidden");
if limit_doc.score < score {
let mut mut_head = self.heap.peek_mut().expect("Top collector with size 0 is forbidden");
let mut mut_head = self.heap
.peek_mut()
.expect("Top collector with size 0 is forbidden");
mut_head.score = score;
mut_head.doc_address = DocAddress(self.segment_id, doc);
}
}
else {
} else {
let wrapped_doc = GlobalScoredDoc {
score: score,
doc_address: DocAddress(self.segment_id, doc)
score,
doc_address: DocAddress(self.segment_id, doc),
};
self.heap.push(wrapped_doc);
}
}
fn requires_scoring(&self) -> bool {
true
}
}
#[cfg(test)]
mod tests {
@@ -147,13 +146,12 @@ mod tests {
top_collector.collect(3, 0.2);
top_collector.collect(5, 0.3);
assert!(!top_collector.at_capacity());
let score_docs: Vec<(Score, DocId)> = top_collector.score_docs()
let score_docs: Vec<(Score, DocId)> = top_collector
.score_docs()
.into_iter()
.map(|(score, doc_address)| (score, doc_address.doc()))
.collect();
assert_eq!(score_docs, vec!(
(0.8, 1), (0.3, 5), (0.2, 3),
));
assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]);
}
#[test]
@@ -171,9 +169,7 @@ mod tests {
.into_iter()
.map(|(score, doc_address)| (score, doc_address.doc()))
.collect();
assert_eq!(score_docs, vec!(
(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)
));
assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]);
}
{
let docs: Vec<DocId> = top_collector
@@ -181,10 +177,8 @@ mod tests {
.into_iter()
.map(|doc_address| doc_address.doc())
.collect();
assert_eq!(docs, vec!(7, 1, 5, 3));
assert_eq!(docs, vec![7, 1, 5, 3]);
}
}
#[test]


@@ -2,153 +2,181 @@ use std::io::Write;
use std::io;
use common::serialize::BinarySerializable;
use std::mem;
use std::ops::Deref;
use std::ptr;
pub fn compute_num_bits(amplitude: u32) -> u8 {
(32u32 - amplitude.leading_zeros()) as u8
}
pub struct BitPacker {
pub(crate) struct BitPacker {
mini_buffer: u64,
mini_buffer_written: usize,
num_bits: usize,
written_size: usize,
}
impl BitPacker {
pub fn new(num_bits: usize) -> BitPacker {
pub fn new() -> BitPacker {
BitPacker {
mini_buffer: 0u64,
mini_buffer_written: 0,
num_bits: num_bits,
written_size: 0,
}
}
pub fn write<TWrite: Write>(&mut self, val: u32, output: &mut TWrite) -> io::Result<()> {
pub fn write<TWrite: Write>(
&mut self,
val: u64,
num_bits: u8,
output: &mut TWrite,
) -> io::Result<()> {
let val_u64 = val as u64;
if self.mini_buffer_written + self.num_bits > 64 {
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
self.written_size += self.mini_buffer.serialize(output)?;
self.mini_buffer.serialize(output)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + (self.num_bits as usize) - 64;
}
else {
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += self.num_bits;
self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 {
self.written_size += self.mini_buffer.serialize(output)?;
self.mini_buffer.serialize(output)?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}
}
}
Ok(())
}
fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()>{
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
output.write_all(&arr[..num_bytes])?;
self.written_size += num_bytes;
self.mini_buffer_written = 0;
}
Ok(())
}
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<usize> {
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?;
Ok(self.written_size)
// Pad the file with 7 zero bytes to simplify reads (see `BitUnpacker::get`).
output.write_all(&[0u8; 7])?;
Ok(())
}
}
pub struct BitUnpacker {
#[derive(Clone)]
pub struct BitUnpacker<Data>
where
Data: Deref<Target = [u8]>,
{
num_bits: usize,
mask: u32,
data_ptr: *const u8,
data_len: usize,
mask: u64,
data: Data,
}
impl BitUnpacker {
pub fn new(data: &[u8], num_bits: usize) -> BitUnpacker {
impl<Data> BitUnpacker<Data>
where
Data: Deref<Target = [u8]>,
{
pub fn new(data: Data, num_bits: u8) -> BitUnpacker<Data> {
let mask: u64 = if num_bits == 64 {
!0u64
} else {
(1u64 << num_bits) - 1u64
};
BitUnpacker {
num_bits: num_bits,
mask: (1u32 << num_bits) - 1u32,
data_ptr: data.as_ptr(),
data_len: data.len()
num_bits: num_bits as usize,
mask,
data,
}
}
pub fn get(&self, idx: usize) -> u32 {
pub fn get(&self, idx: usize) -> u64 {
if self.num_bits == 0 {
return 0;
return 0u64;
}
let addr = (idx * self.num_bits) / 8;
let bit_shift = idx * self.num_bits - addr * 8;
let val_unshifted_unmasked: u64;
if addr + 8 <= self.data_len {
val_unshifted_unmasked = unsafe { * (self.data_ptr.offset(addr as isize) as *const u64) };
}
else {
let mut arr = [0u8; 8];
if addr < self.data_len {
for i in 0..self.data_len - addr {
arr[i] = unsafe { *self.data_ptr.offset( (addr + i) as isize) };
let data: &[u8] = &*self.data;
let num_bits = self.num_bits;
let mask = self.mask;
let addr_in_bits = idx * num_bits;
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
if cfg!(feature = "simdcompression") {
// for simdcompression,
// the bitpacker is only used for fastfields,
// and we expect them to be always padded.
debug_assert!(
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
} else {
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }
} else {
let mut buffer = [0u8; 8];
for i in addr..data.len() {
buffer[i - addr] = data[i];
}
}
val_unshifted_unmasked = unsafe { mem::transmute::<[u8; 8], u64>(arr) };
unsafe { ptr::read_unaligned(buffer[..].as_ptr() as *const u64) }
};
let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
val_shifted & mask
}
}
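// Worked example of the address arithmetic above: with `num_bits = 5` and
// `idx = 3`, `addr_in_bits = 15`, so `addr = 15 >> 3 = 1` and
// `bit_shift = 15 & 7 = 7`: the value starts 7 bits into byte 1.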
/// Reads a range of values from the fast field.
///
/// The range of values read is from
/// `[start..start + output.len()[`
pub fn get_range(&self, start: u32, output: &mut [u64]) {
if self.num_bits == 0 {
for val in output.iter_mut() {
*val = 0u64;
}
} else {
let data: &[u8] = &*self.data;
let num_bits = self.num_bits;
let mask = self.mask;
let mut addr_in_bits = (start as usize) * num_bits;
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;
}
}
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32;
(val_shifted & self.mask)
}
}
#[cfg(test)]
mod test {
use super::{BitPacker, BitUnpacker, compute_num_bits};
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
assert_eq!(compute_num_bits(0), 0u8);
assert_eq!(compute_num_bits(2), 2u8);
assert_eq!(compute_num_bits(3), 2u8);
assert_eq!(compute_num_bits(4), 3u8);
assert_eq!(compute_num_bits(255), 8u8);
assert_eq!(compute_num_bits(256), 9u8);
}
fn test_bitpacker_util(len: usize, num_bits: usize) {
use super::{BitPacker, BitUnpacker};
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker<Vec<u8>>, Vec<u64>) {
let mut data = Vec::new();
let mut bitpacker = BitPacker::new(num_bits);
let max_val: u32 = (1 << num_bits) - 1;
let vals: Vec<u32> = (0u32..len as u32).map(|i| {
if max_val == 0 {
0
}
else {
i % max_val
}
}).collect();
let mut bitpacker = BitPacker::new();
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
let vals: Vec<u64> = (0u64..len as u64)
.map(|i| if max_val == 0 { 0 } else { i % max_val })
.collect();
for &val in &vals {
bitpacker.write(val, &mut data).unwrap();
bitpacker.write(val, num_bits, &mut data).unwrap();
}
let num_bytes = bitpacker.close(&mut data).unwrap();
assert_eq!(num_bytes, (num_bits * len + 7) / 8);
assert_eq!(data.len(), num_bytes);
let bitunpacker = BitUnpacker::new(&data, num_bits);
bitpacker.close(&mut data).unwrap();
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
let bitunpacker = BitUnpacker::new(data, num_bits);
(bitunpacker, vals)
}
fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i), *val);
}
}
#[test]
fn test_bitpacker() {
test_bitpacker_util(10, 3);
@@ -157,4 +185,17 @@ mod test {
test_bitpacker_util(6, 14);
test_bitpacker_util(1000, 14);
}
}
#[test]
fn test_bitpacker_range() {
let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
let buffer_len = 100;
let mut buffer = vec![0u64; buffer_len];
for start in vec![0, 10, 20, 100, 1_000] {
bitunpacker.get_range(start as u32, &mut buffer[..]);
for i in 0..buffer_len {
assert_eq!(buffer[i], vals[start + i]);
}
}
}
}

src/common/bitset.rs (new file, 389 lines)

@@ -0,0 +1,389 @@
use std::fmt;
use std::u64;
#[derive(Clone, Copy, Eq, PartialEq)]
pub(crate) struct TinySet(u64);
impl fmt::Debug for TinySet {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.into_iter().collect::<Vec<u32>>().fmt(f)
}
}
pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator {
type Item = u32;
fn next(&mut self) -> Option<Self::Item> {
self.0.pop_lowest()
}
}
impl IntoIterator for TinySet {
type Item = u32;
type IntoIter = TinySetIterator;
fn into_iter(self) -> Self::IntoIter {
TinySetIterator(self)
}
}
impl TinySet {
/// Returns an empty `TinySet`.
pub fn empty() -> TinySet {
TinySet(0u64)
}
/// Returns the complement of the set in `[0, 64[`.
fn complement(&self) -> TinySet {
TinySet(!self.0)
}
/// Returns true iff the `TinySet` contains the element `el`.
pub fn contains(&self, el: u32) -> bool {
!self.intersect(TinySet::singleton(el)).is_empty()
}
/// Returns the intersection of `self` and `other`
pub fn intersect(&self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0)
}
/// Creates a new `TinySet` containing only one element
/// within `[0; 64[`
#[inline(always)]
pub fn singleton(el: u32) -> TinySet {
TinySet(1u64 << u64::from(el))
}
/// Insert a new element within [0..64[
#[inline(always)]
pub fn insert(self, el: u32) -> TinySet {
self.union(TinySet::singleton(el))
}
/// Insert a new element within [0..64[
#[inline(always)]
pub fn insert_mut(&mut self, el: u32) -> bool {
let old = *self;
*self = old.insert(el);
old != *self
}
/// Returns the union of two tinysets
#[inline(always)]
pub fn union(self, other: TinySet) -> TinySet {
TinySet(self.0 | other.0)
}
/// Returns true iff the `TinySet` is empty.
#[inline(always)]
pub fn is_empty(&self) -> bool {
self.0 == 0u64
}
/// Returns the lowest element in the `TinySet`
/// and removes it.
#[inline(always)]
pub fn pop_lowest(&mut self) -> Option<u32> {
if self.is_empty() {
None
} else {
let lowest = self.0.trailing_zeros() as u32;
self.0 ^= TinySet::singleton(lowest).0;
Some(lowest)
}
}
/// Returns a `TinySet` that contains all values up
/// to `upper_bound`, excluded.
///
/// The limit is assumed to be strictly lower than 64.
pub fn range_lower(upper_bound: u32) -> TinySet {
TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
}
/// Returns a `TinySet` that contains all values greater
/// than or equal to `from_included` (and up to 63).
///
/// The limit is assumed to be strictly lower than 64.
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
TinySet::range_lower(from_included).complement()
}
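// For example, `TinySet::range_lower(3)` contains {0, 1, 2}, and
// `TinySet::range_greater_or_equal(62)` contains {62, 63}.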
pub fn clear(&mut self) {
self.0 = 0u64;
}
pub fn len(&self) -> u32 {
self.0.count_ones()
}
}
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
len: usize, //< Technically it should be u32, but we
// count multiple inserts.
// `usize` guards us from overflow.
max_value: u32,
}
fn num_buckets(max_val: u32) -> u32 {
(max_val + 63u32) / 64u32
}
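// e.g. `num_buckets(64) == 1` and `num_buckets(65) == 2`:
// each `TinySet` bucket covers 64 values.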
impl BitSet {
/// Create a new `BitSet` that may contain elements
/// within `[0, max_val[`.
pub fn with_max_value(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let tinysets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
BitSet {
tinysets,
len: 0,
max_value,
}
}
/// Removes all elements from the `BitSet`.
pub fn clear(&mut self) {
for tinyset in self.tinysets.iter_mut() {
*tinyset = TinySet::empty();
}
// also reset the cached cardinality, so that `len()` stays correct.
self.len = 0;
}
/// Returns the number of elements in the `BitSet`.
pub fn len(&self) -> usize {
self.len
}
/// Inserts an element in the `BitSet`
pub fn insert(&mut self, el: u32) {
// we do not check that `el` is lower than `max_value`.
let higher = el / 64u32;
let lower = el % 64u32;
self.len += if self.tinysets[higher as usize].insert_mut(lower) {
1
} else {
0
};
}
/// Returns true iff the elements is in the `BitSet`.
pub fn contains(&self, el: u32) -> bool {
self.tinyset(el / 64u32).contains(el % 64)
}
/// Returns the first non-empty `TinySet` associated with a bucket
/// equal to or greater than `bucket`.
///
/// Reminder: the tiny set with the bucket `bucket`, represents the
/// elements from `bucket * 64` to `(bucket+1) * 64`.
pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
self.tinysets[bucket as usize..]
.iter()
.cloned()
.position(|tinyset| !tinyset.is_empty())
.map(|delta_bucket| bucket + delta_bucket as u32)
}
pub fn max_value(&self) -> u32 {
self.max_value
}
/// Returns the tiny bitset representing the
/// set restricted to the number range from
/// `bucket * 64` to `(bucket + 1) * 64`.
pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
self.tinysets[bucket as usize]
}
}
#[cfg(test)]
mod tests {
extern crate test;
use tests;
use std::collections::HashSet;
use super::BitSet;
use super::TinySet;
use tests::generate_nonunique_unsorted;
use std::collections::BTreeSet;
use query::BitSetDocSet;
use docset::DocSet;
#[test]
fn test_tiny_set() {
assert!(TinySet::empty().is_empty());
{
let mut u = TinySet::empty().insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
{
let mut u = TinySet::empty().insert(1u32).insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
{
let mut u = TinySet::empty().insert(2u32);
assert_eq!(u.pop_lowest(), Some(2u32));
u.insert_mut(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(63u32);
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
}
}
#[test]
fn test_bitset() {
let test_against_hashset = |els: &[u32], max_value: u32| {
let mut hashset: HashSet<u32> = HashSet::new();
let mut bitset = BitSet::with_max_value(max_value);
for &el in els {
assert!(el < max_value);
hashset.insert(el);
bitset.insert(el);
}
for el in 0..max_value {
assert_eq!(hashset.contains(&el), bitset.contains(el));
}
assert_eq!(bitset.max_value(), max_value);
};
test_against_hashset(&[], 0);
test_against_hashset(&[], 1);
test_against_hashset(&[0u32], 1);
test_against_hashset(&[0u32], 100);
test_against_hashset(&[1u32, 2u32], 4);
test_against_hashset(&[99u32], 100);
test_against_hashset(&[63u32], 64);
test_against_hashset(&[62u32, 63u32], 64);
}
#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(1_000_000, 50_000);
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
let mut bitset = BitSet::with_max_value(1_000_000);
for el in arr {
btreeset.insert(el);
bitset.insert(el);
}
for i in 0..1_000_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset);
for el in btreeset.into_iter() {
bitset_docset.advance();
assert_eq!(bitset_docset.doc(), el);
}
assert!(!bitset_docset.advance());
}
#[test]
fn test_bitset_num_buckets() {
use super::num_buckets;
assert_eq!(num_buckets(0u32), 0);
assert_eq!(num_buckets(1u32), 1);
assert_eq!(num_buckets(64u32), 1);
assert_eq!(num_buckets(65u32), 2);
assert_eq!(num_buckets(128u32), 2);
assert_eq!(num_buckets(129u32), 3);
}
#[test]
fn test_tinyset_range() {
assert_eq!(
TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
[0, 1, 2]
);
assert!(TinySet::range_lower(0).is_empty());
assert_eq!(
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
(0u32..63u32).collect::<Vec<_>>()
);
assert_eq!(
TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
[0]
);
assert_eq!(
TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
[0, 1]
);
assert_eq!(
TinySet::range_greater_or_equal(3)
.into_iter()
.collect::<Vec<u32>>(),
(3u32..64u32).collect::<Vec<_>>()
);
}
#[test]
fn test_bitset_len() {
let mut bitset = BitSet::with_max_value(1_000);
assert_eq!(bitset.len(), 0);
bitset.insert(3u32);
assert_eq!(bitset.len(), 1);
bitset.insert(103u32);
assert_eq!(bitset.len(), 2);
bitset.insert(3u32);
assert_eq!(bitset.len(), 2);
bitset.insert(103u32);
assert_eq!(bitset.len(), 2);
bitset.insert(104u32);
assert_eq!(bitset.len(), 3);
}
#[test]
fn test_bitset_clear() {
let mut bitset = BitSet::with_max_value(1_000);
let els = tests::sample(1_000, 0.01f32);
for &el in &els {
bitset.insert(el);
}
assert!(els.iter().all(|el| bitset.contains(*el)));
bitset.clear();
for el in 0u32..1000u32 {
assert!(!bitset.contains(el));
}
}
#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
b.iter(|| {
let mut tinyset = TinySet::singleton(test::black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
});
}
#[bench]
fn bench_tinyset_sum(b: &mut test::Bencher) {
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
b.iter(|| {
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
}
#[bench]
fn bench_tinyarr_sum(b: &mut test::Bencher) {
let v = [10u32, 14u32, 21u32];
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
}
#[bench]
fn bench_bitset_initialize(b: &mut test::Bencher) {
b.iter(|| BitSet::with_max_value(1_000_000));
}
}


@@ -0,0 +1,225 @@
use std::io::Write;
use common::CountingWriter;
use std::collections::HashMap;
use schema::Field;
use common::VInt;
use directory::WritePtr;
use std::io::{self, Read};
use directory::ReadOnlySource;
use common::BinarySerializable;
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
pub struct FileAddr {
field: Field,
idx: usize,
}
impl FileAddr {
fn new(field: Field, idx: usize) -> FileAddr {
FileAddr { field, idx }
}
}
impl BinarySerializable for FileAddr {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.field.serialize(writer)?;
VInt(self.idx as u64).serialize(writer)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let field = Field::deserialize(reader)?;
let idx = VInt::deserialize(reader)?.0 as usize;
Ok(FileAddr {
field,
idx,
})
}
}
/// A `CompositeWrite` is used to write a `CompositeFile`.
pub struct CompositeWrite<W = WritePtr> {
write: CountingWriter<W>,
offsets: HashMap<FileAddr, usize>,
}
impl<W: Write> CompositeWrite<W> {
/// Creates a new writer that writes a composite file
/// into a given write.
pub fn wrap(w: W) -> CompositeWrite<W> {
CompositeWrite {
write: CountingWriter::wrap(w),
offsets: HashMap::new(),
}
}
/// Start writing a new field.
pub fn for_field(&mut self, field: Field) -> &mut CountingWriter<W> {
self.for_field_with_idx(field, 0)
}
/// Start writing a new field.
pub fn for_field_with_idx(&mut self, field: Field, idx: usize) -> &mut CountingWriter<W> {
let offset = self.write.written_bytes();
let file_addr = FileAddr::new(field, idx);
assert!(!self.offsets.contains_key(&file_addr));
self.offsets.insert(file_addr, offset);
&mut self.write
}
/// Close the composite file.
///
/// An index of the different field offsets
/// will be written as a footer.
pub fn close(mut self) -> io::Result<()> {
let footer_offset = self.write.written_bytes();
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
let mut offset_fields: Vec<_> = self.offsets
.iter()
.map(|(file_addr, offset)| (*offset, *file_addr))
.collect();
offset_fields.sort();
let mut prev_offset = 0;
for (offset, file_addr) in offset_fields {
VInt((offset - prev_offset) as u64).serialize(&mut self.write)?;
file_addr.serialize(&mut self.write)?;
prev_offset = offset;
}
let footer_len = (self.write.written_bytes() - footer_offset) as u32;
footer_len.serialize(&mut self.write)?;
self.write.flush()?;
Ok(())
}
}
/// A composite file is an abstraction to store a
/// file partitioned by field.
///
/// The file needs to be written field by field.
/// A footer describes the start and stop offsets
/// for each field.
#[derive(Clone)]
pub struct CompositeFile {
data: ReadOnlySource,
offsets_index: HashMap<FileAddr, (usize, usize)>,
}
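// On-disk layout, as implied by `CompositeWrite::close` above:
//
//     [field 0 data][field 1 data]...[footer][footer_len: u32]
//
// where the footer holds `VInt(num_fields)` followed, for each field in
// offset order, by a `VInt`-encoded offset delta and the serialized
// `FileAddr`.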
impl CompositeFile {
/// Opens a composite file stored in a given
/// `ReadOnlySource`.
pub fn open(data: &ReadOnlySource) -> io::Result<CompositeFile> {
let end = data.len();
let footer_len_data = data.slice_from(end - 4);
let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
let footer_start = end - 4 - footer_len;
let footer_data = data.slice(footer_start, footer_start + footer_len);
let mut footer_buffer = footer_data.as_slice();
let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;
let mut file_addrs = vec![];
let mut offsets = vec![];
let mut field_index = HashMap::new();
let mut offset = 0;
for _ in 0..num_fields {
offset += VInt::deserialize(&mut footer_buffer)?.0 as usize;
let file_addr = FileAddr::deserialize(&mut footer_buffer)?;
offsets.push(offset);
file_addrs.push(file_addr);
}
offsets.push(footer_start);
for i in 0..num_fields {
let file_addr = file_addrs[i];
let start_offset = offsets[i];
let end_offset = offsets[i + 1];
field_index.insert(file_addr, (start_offset, end_offset));
}
Ok(CompositeFile {
data: data.slice_to(footer_start),
offsets_index: field_index,
})
}
/// Returns a composite file that stores
/// no fields.
pub fn empty() -> CompositeFile {
CompositeFile {
offsets_index: HashMap::new(),
data: ReadOnlySource::empty(),
}
}
/// Returns the `ReadOnlySource` associated
/// to a given `Field` and stored in a `CompositeFile`.
pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
self.open_read_with_idx(field, 0)
}
/// Returns the `ReadOnlySource` associated
/// to a given `Field` and stored in a `CompositeFile`.
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
self.offsets_index
.get(&FileAddr { field, idx, })
.map(|&(from, to)| self.data.slice(from, to))
}
}
#[cfg(test)]
mod test {
use std::io::Write;
use super::{CompositeFile, CompositeWrite};
use directory::{Directory, RAMDirectory};
use schema::Field;
use common::VInt;
use common::BinarySerializable;
use std::path::Path;
#[test]
fn test_composite_file() {
let path = Path::new("test_path");
let mut directory = RAMDirectory::create();
{
let w = directory.open_write(path).unwrap();
let mut composite_write = CompositeWrite::wrap(w);
{
let mut write_0 = composite_write.for_field(Field(0u32));
VInt(32431123u64).serialize(&mut write_0).unwrap();
write_0.flush().unwrap();
}
{
let mut write_4 = composite_write.for_field(Field(4u32));
VInt(2).serialize(&mut write_4).unwrap();
write_4.flush().unwrap();
}
composite_write.close().unwrap();
}
{
let r = directory.open_read(path).unwrap();
let composite_file = CompositeFile::open(&r).unwrap();
{
let file0 = composite_file.open_read(Field(0u32)).unwrap();
let mut file0_buf = file0.as_slice();
let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0;
assert_eq!(file0_buf.len(), 0);
assert_eq!(payload_0, 32431123u64);
}
{
let file4 = composite_file.open_read(Field(4u32)).unwrap();
let mut file4_buf = file4.as_slice();
let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0;
assert_eq!(file4_buf.len(), 0);
assert_eq!(payload_4, 2u64);
}
}
}
}


@@ -0,0 +1,55 @@
use std::io::Write;
use std::io;
pub struct CountingWriter<W> {
underlying: W,
written_bytes: usize,
}
impl<W: Write> CountingWriter<W> {
pub fn wrap(underlying: W) -> CountingWriter<W> {
CountingWriter {
underlying,
written_bytes: 0,
}
}
pub fn written_bytes(&self) -> usize {
self.written_bytes
}
pub fn finish(mut self) -> io::Result<(W, usize)> {
self.flush()?;
Ok((self.underlying, self.written_bytes))
}
}
impl<W: Write> Write for CountingWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
let written_size = self.underlying.write(buf)?;
self.written_bytes += written_size;
Ok(written_size)
}
fn flush(&mut self) -> io::Result<()> {
self.underlying.flush()
}
}
#[cfg(test)]
mod test {
use super::CountingWriter;
use std::io::Write;
#[test]
fn test_counting_writer() {
let buffer: Vec<u8> = vec![];
let mut counting_writer = CountingWriter::wrap(buffer);
let bytes = (0u8..10u8).collect::<Vec<u8>>();
counting_writer.write_all(&bytes).unwrap();
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
assert_eq!(len, 10);
assert_eq!(w.len(), 10);
}
}


@@ -1,32 +1,137 @@
mod serialize;
mod timer;
mod vint;
mod counting_writer;
mod composite_file;
pub mod bitpacker;
mod bitset;
pub use self::serialize::BinarySerializable;
pub use self::timer::Timing;
pub use self::timer::TimerTree;
pub use self::timer::OpenTimer;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::VInt;
pub use self::counting_writer::CountingWriter;
pub use self::bitset::BitSet;
pub(crate) use self::bitset::TinySet;
pub use byteorder::LittleEndian as Endianness;
use std::io;
pub fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
/// Computes the number of bits that will be used for bitpacking.
///
/// In general the target is the minimum number of bits
/// required to express the amplitude given in argument.
///
/// e.g. If the amplitude is 10, we can store all ints on just 4 bits.
///
/// The logic is slightly more convoluted here because, for optimization
/// reasons, we want to ensure that a value spans at most 8 aligned bytes.
///
/// Spanning over 9 bytes is possible, for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int will start on bit
/// 63 (which belongs to byte 7) and end at byte 15;
/// hence 9 bytes (from byte 7 to byte 15 included).
///
/// To avoid this, we force the number of bits to 64
/// when the result is greater than `64 - 8 = 56` bits.
///
/// Note that this only affects rare use cases spanning
/// a very large range of values. Even in this case, it results
/// in an extra cost of at most 12% compared to the optimal
/// number of bits.
pub(crate) fn compute_num_bits(n: u64) -> u8 {
let amplitude = (64u32 - n.leading_zeros()) as u8;
if amplitude <= 64 - 8 {
amplitude
} else {
64
}
}
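// For example, `compute_num_bits(10)` returns 4 (10 fits in `0b1010`),
// while any amplitude needing 57 bits or more is rounded up to 64 so that
// no value ever straddles more than 8 bytes.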
pub(crate) fn is_power_of_2(n: usize) -> bool {
(n > 0) && (n & (n - 1) == 0)
}
/// Create a default io error given a string.
pub(crate) fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
/// Has length trait
pub trait HasLen {
/// Return length
fn len(&self,) -> usize;
fn len(&self) -> usize;
/// Returns true iff empty.
fn is_empty(&self,) -> bool {
fn is_empty(&self) -> bool {
self.len() == 0
}
}
const HIGHEST_BIT: u64 = 1 << 63;
/// Maps a `i64` to `u64`
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline(always)]
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
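// Worked examples: `i64_to_u64(i64::min_value()) == 0`,
// `i64_to_u64(-1) == 2^63 - 1` and `i64_to_u64(0) == 2^63`,
// so the `i64` ordering is preserved in the `u64` image.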
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline(always)]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
#[cfg(test)]
pub(crate) mod test {
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
pub use super::serialize::test::fixed_size_test;
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
}
#[test]
fn test_i64_converter() {
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
test_i64_converter_helper(0i64);
test_i64_converter_helper(i64::min_value());
test_i64_converter_helper(i64::max_value());
for i in -1000i64..1000i64 {
test_i64_converter_helper(i);
}
}
#[test]
fn test_compute_num_bits() {
assert_eq!(compute_num_bits(1), 1u8);
assert_eq!(compute_num_bits(0), 0u8);
assert_eq!(compute_num_bits(2), 2u8);
assert_eq!(compute_num_bits(3), 2u8);
assert_eq!(compute_num_bits(4), 3u8);
assert_eq!(compute_num_bits(255), 8u8);
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
}


@@ -1,180 +1,210 @@
use byteorder::{ReadBytesExt, WriteBytesExt};
use byteorder::LittleEndian as Endianness;
use common::Endianness;
use std::fmt;
use std::io::Write;
use std::io::Read;
use std::io;
use common::VInt;
use byteorder;
pub trait BinarySerializable : fmt::Debug + Sized {
fn serialize(&self, writer: &mut Write) -> io::Result<usize>;
fn deserialize(reader: &mut Read) -> io::Result<Self>;
/// Trait for a simple binary serialization.
pub trait BinarySerializable: fmt::Debug + Sized {
/// Serialize
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()>;
/// Deserialize
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
}
fn convert_byte_order_error(byteorder_error: byteorder::Error) -> io::Error {
match byteorder_error {
byteorder::Error::UnexpectedEOF => io::Error::new(io::ErrorKind::InvalidData, "Reached EOF unexpectedly"),
byteorder::Error::Io(e) => e,
}
/// `FixedSize` marks a `BinarySerializable` as
/// always serializing to the same size.
pub trait FixedSize: BinarySerializable {
const SIZE_IN_BYTES: usize;
}
impl BinarySerializable for () {
fn serialize(&self, _: &mut Write) -> io::Result<usize> {
Ok(0)
fn serialize<W: Write>(&self, _: &mut W) -> io::Result<()> {
Ok(())
}
fn deserialize(_: &mut Read) -> io::Result<Self> {
fn deserialize<R: Read>(_: &mut R) -> io::Result<Self> {
Ok(())
}
}
impl FixedSize for () {
const SIZE_IN_BYTES: usize = 0;
}
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let mut total_size = try!(VInt(self.len() as u64).serialize(writer));
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
for it in self {
total_size += try!(it.serialize(writer));
it.serialize(writer)?;
}
Ok(total_size)
Ok(())
}
fn deserialize(reader: &mut Read) -> io::Result<Vec<T>> {
let num_items = try!(VInt::deserialize(reader)).val();
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = try!(T::deserialize(reader));
let item = T::deserialize(reader)?;
items.push(item);
}
Ok(items)
}
}
impl<Left: BinarySerializable, Right: BinarySerializable> BinarySerializable for (Left, Right) {
fn serialize(&self, write: &mut Write) -> io::Result<usize> {
Ok(try!(self.0.serialize(write)) + try!(self.1.serialize(write)))
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
self.0.serialize(write)?;
self.1.serialize(write)
}
fn deserialize(reader: &mut Read) -> io::Result<Self> {
Ok( (try!(Left::deserialize(reader)), try!(Right::deserialize(reader))) )
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
Ok((Left::deserialize(reader)?, Right::deserialize(reader)?))
}
}
impl BinarySerializable for u32 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u32::<Endianness>(*self)
.map(|_| 4)
.map_err(convert_byte_order_error)
}
fn deserialize(reader: &mut Read) -> io::Result<u32> {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u32> {
reader.read_u32::<Endianness>()
.map_err(convert_byte_order_error)
}
}
impl FixedSize for u32 {
const SIZE_IN_BYTES: usize = 4;
}
impl BinarySerializable for u64 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u64::<Endianness>(*self)
.map(|_| 8)
.map_err(convert_byte_order_error)
}
fn deserialize(reader: &mut Read) -> io::Result<u64> {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
reader.read_u64::<Endianness>()
.map_err(convert_byte_order_error)
}
}
impl FixedSize for u64 {
const SIZE_IN_BYTES: usize = 8;
}
impl BinarySerializable for i64 {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_i64::<Endianness>(*self)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
reader.read_i64::<Endianness>()
}
}
impl FixedSize for i64 {
const SIZE_IN_BYTES: usize = 8;
}
impl BinarySerializable for u8 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
// TODO error
try!(writer.write_u8(*self).map_err(convert_byte_order_error));
Ok(1)
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
writer.write_u8(*self)
}
fn deserialize(reader: &mut Read) -> io::Result<u8> {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u8> {
reader.read_u8()
.map_err(convert_byte_order_error)
}
}
impl FixedSize for u8 {
const SIZE_IN_BYTES: usize = 1;
}
impl BinarySerializable for String {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
let mut size = try!(VInt(data.len() as u64).serialize(writer));
size += data.len();
try!(writer.write_all(data));
Ok(size)
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize(reader: &mut Read) -> io::Result<String> {
let string_length = try!(VInt::deserialize(reader)).val() as usize;
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
try!(reader.take(string_length as u64).read_to_string(&mut result));
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(result)
}
}
#[cfg(test)]
mod test {
pub mod test {
use common::VInt;
use super::*;
fn serialize_test<T: BinarySerializable + Eq>(v: T, num_bytes: usize) {
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
fn serialize_test<T: BinarySerializable + Eq>(v: T) -> usize {
let mut buffer: Vec<u8> = Vec::new();
if num_bytes != 0 {
assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
assert_eq!(buffer.len(), num_bytes);
}
else {
v.serialize(&mut buffer).unwrap();
}
v.serialize(&mut buffer).unwrap();
let num_bytes = buffer.len();
let mut cursor = &buffer[..];
let deser = T::deserialize(&mut cursor).unwrap();
assert_eq!(deser, v);
num_bytes
}
#[test]
fn test_serialize_u8() {
serialize_test(3u8, 1);
serialize_test(5u8, 1);
fixed_size_test::<u8>();
}
#[test]
fn test_serialize_u32() {
serialize_test(3u32, 4);
serialize_test(5u32, 4);
serialize_test(u32::max_value(), 4);
fixed_size_test::<u32>();
assert_eq!(4, serialize_test(3u32));
assert_eq!(4, serialize_test(5u32));
assert_eq!(4, serialize_test(u32::max_value()));
}
#[test]
fn test_serialize_i64() {
fixed_size_test::<i64>();
}
#[test]
fn test_serialize_u64() {
fixed_size_test::<u64>();
}
#[test]
fn test_serialize_string() {
serialize_test(String::from(""), 1);
serialize_test(String::from("ぽよぽよ"), 1 + 3*4);
serialize_test(String::from("富士さん見える。"), 1 + 3*8);
assert_eq!(serialize_test(String::from("")), 1);
assert_eq!(serialize_test(String::from("ぽよぽよ")), 1 + 3 * 4);
assert_eq!(
serialize_test(String::from("富士さん見える。")),
1 + 3 * 8
);
}
#[test]
fn test_serialize_vec() {
let v: Vec<u8> = Vec::new();
serialize_test(v, 1);
serialize_test(vec!(1u32, 3u32), 1 + 4*2);
assert_eq!(serialize_test(Vec::<u8>::new()), 1);
assert_eq!(serialize_test(vec![1u32, 3u32]), 1 + 4 * 2);
}
#[test]
fn test_serialize_vint() {
for i in 0..10_000 {
serialize_test(VInt(i as u64), 0);
serialize_test(VInt(i as u64));
}
assert_eq!(serialize_test(VInt(7u64)), 1);
assert_eq!(serialize_test(VInt(127u64)), 1);
assert_eq!(serialize_test(VInt(128u64)), 2);
assert_eq!(serialize_test(VInt(129u64)), 2);
assert_eq!(serialize_test(VInt(1234u64)), 2);
assert_eq!(serialize_test(VInt(16_383u64)), 2);
assert_eq!(serialize_test(VInt(16_384u64)), 3);
assert_eq!(serialize_test(VInt(u64::max_value())), 10);
}
}


@@ -1,98 +0,0 @@
use time::PreciseTime;
pub struct OpenTimer<'a> {
name: &'static str,
timer_tree: &'a mut TimerTree,
start: PreciseTime,
depth: u32,
}
impl<'a> OpenTimer<'a> {
/// Starts timing a new named subtask
///
/// The timer is stopped automatically
/// when the `OpenTimer` is dropped.
pub fn open(&mut self, name: &'static str) -> OpenTimer {
OpenTimer {
name: name,
timer_tree: self.timer_tree,
start: PreciseTime::now(),
depth: self.depth + 1,
}
}
}
impl<'a> Drop for OpenTimer<'a> {
fn drop(&mut self,) {
self.timer_tree.timings.push(Timing {
name: self.name,
duration: self.start.to(PreciseTime::now()).num_microseconds().unwrap(),
depth: self.depth,
});
}
}
/// Timing recording
#[derive(Debug, RustcEncodable)]
pub struct Timing {
name: &'static str,
duration: i64,
depth: u32,
}
/// Timer tree
#[derive(Debug, RustcEncodable)]
pub struct TimerTree {
timings: Vec<Timing>,
}
impl TimerTree {
/// Returns the total time elapsed in microseconds
pub fn total_time(&self,) -> i64 {
self.timings.last().unwrap().duration
}
/// Open a new named subtask
pub fn open(&mut self, name: &'static str) -> OpenTimer {
OpenTimer {
name: name,
timer_tree: self,
start: PreciseTime::now(),
depth: 0,
}
}
}
impl Default for TimerTree {
fn default() -> TimerTree {
TimerTree {
timings: Vec::new(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_timer() {
let mut timer_tree = TimerTree::default();
{
let mut a = timer_tree.open("a");
{
let mut ab = a.open("b");
{
let _abc = ab.open("c");
}
{
let _abd = ab.open("d");
}
}
}
assert_eq!(timer_tree.timings.len(), 4);
}
}


@@ -3,59 +3,59 @@ use std::io;
use std::io::Write;
use std::io::Read;
/// Wrapper over a `u64` that serializes as a variable int.
#[derive(Debug, Eq, PartialEq)]
pub struct VInt(pub u64);
impl VInt {
pub fn val(&self) -> u64 {
self.0
}
pub fn deserialize_u64<R: Read>(reader: &mut R) -> io::Result<u64> {
VInt::deserialize(reader).map(|vint| vint.0)
}
}
impl BinarySerializable for VInt {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let mut remaining = self.0;
let mut buffer = [0u8; 10];
let mut i = 0;
loop {
let next_byte: u8 = (remaining % 128u64) as u8;
remaining /= 128u64;
if remaining == 0u64 {
buffer[i] = next_byte | 128u8;
return writer.write_all(&buffer[0..i + 1]);
} else {
buffer[i] = next_byte;
}
i += 1;
}
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let mut bytes = reader.bytes();
let mut result = 0u64;
let mut shift = 0u64;
loop {
match bytes.next() {
Some(Ok(b)) => {
result += u64::from(b % 128u8) << shift;
if b & 128u8 != 0u8 {
break;
}
shift += 7;
}
_ => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Reach end of buffer",
))
}
}
}
Ok(VInt(result))
}
}
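A self-contained sketch of the byte layout the impl above produces (note the stop bit is set on the *last* byte, the opposite of LEB128, which flags continuation bytes):
fn vint_encode(mut val: u64, buf: &mut Vec<u8>) {
    loop {
        let byte = (val % 128) as u8;
        val /= 128;
        if val == 0 {
            buf.push(byte | 128); // high bit marks the final byte
            return;
        }
        buf.push(byte);
    }
}
fn vint_decode(bytes: &[u8]) -> (u64, usize) {
    let (mut result, mut shift) = (0u64, 0u64);
    for (i, &b) in bytes.iter().enumerate() {
        result += u64::from(b % 128) << shift;
        if b & 128 != 0 {
            return (result, i + 1); // (value, bytes consumed)
        }
        shift += 7;
    }
    panic!("reached end of buffer before the stop bit");
}
fn main() {
    // 1234 = 82 + 9 * 128, so it packs into two bytes: [82, 137].
    let mut buf = Vec::new();
    vint_encode(1234, &mut buf);
    assert_eq!(buf, vec![82, 9 | 128]);
    assert_eq!(vint_decode(&buf), (1234, 2));
}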


@@ -1,159 +0,0 @@
use super::{BlockEncoder, BlockDecoder};
use super::NUM_DOCS_PER_BLOCK;
use compression::{VIntEncoder, VIntDecoder};
pub struct CompositeEncoder {
block_encoder: BlockEncoder,
output: Vec<u8>,
}
impl CompositeEncoder {
pub fn new() -> CompositeEncoder {
CompositeEncoder {
block_encoder: BlockEncoder::new(),
output: Vec::with_capacity(500_000),
}
}
pub fn compress_sorted(&mut self, vals: &[u32]) -> &[u8] {
self.output.clear();
let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
let mut offset = 0u32;
for i in 0..num_blocks {
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK .. (i + 1) * NUM_DOCS_PER_BLOCK];
let block_compressed = self.block_encoder.compress_block_sorted(vals_slice, offset);
offset = vals_slice[NUM_DOCS_PER_BLOCK - 1];
self.output.extend_from_slice(block_compressed);
}
let vint_compressed = self.block_encoder.compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset);
self.output.extend_from_slice(vint_compressed);
&self.output
}
pub fn compress_unsorted(&mut self, vals: &[u32]) -> &[u8] {
self.output.clear();
let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
for i in 0..num_blocks {
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK .. (i + 1) * NUM_DOCS_PER_BLOCK];
let block_compressed = self.block_encoder.compress_block_unsorted(vals_slice);
self.output.extend_from_slice(block_compressed);
}
let vint_compressed = self.block_encoder.compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]);
self.output.extend_from_slice(vint_compressed);
&self.output
}
}
pub struct CompositeDecoder {
block_decoder: BlockDecoder,
vals: Vec<u32>,
}
impl CompositeDecoder {
pub fn new() -> CompositeDecoder {
CompositeDecoder {
block_decoder: BlockDecoder::new(),
vals: Vec::with_capacity(500_000),
}
}
pub fn uncompress_sorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] {
if uncompressed_len > self.vals.capacity() {
let extra_capacity = uncompressed_len - self.vals.capacity();
self.vals.reserve(extra_capacity);
}
let mut offset = 0u32;
self.vals.clear();
let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
for _ in 0..num_blocks {
compressed_data = self.block_decoder.uncompress_block_sorted(compressed_data, offset);
offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
self.vals.extend_from_slice(self.block_decoder.output_array());
}
self.block_decoder.uncompress_vint_sorted(compressed_data, offset, uncompressed_len % NUM_DOCS_PER_BLOCK);
self.vals.extend_from_slice(self.block_decoder.output_array());
&self.vals
}
pub fn uncompress_unsorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] {
self.vals.clear();
let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
for _ in 0..num_blocks {
compressed_data = self.block_decoder.uncompress_block_unsorted(compressed_data);
self.vals.extend_from_slice(self.block_decoder.output_array());
}
self.block_decoder.uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK);
self.vals.extend_from_slice(self.block_decoder.output_array());
&self.vals
}
}
impl Into<Vec<u32>> for CompositeDecoder {
fn into(self) -> Vec<u32> {
self.vals
}
}
#[cfg(test)]
pub mod tests {
use test::Bencher;
use super::*;
use compression::tests::generate_array;
#[test]
fn test_composite_unsorted() {
let data = generate_array(10_000, 0.1);
let mut encoder = CompositeEncoder::new();
let compressed = encoder.compress_unsorted(&data);
assert_eq!(compressed.len(), 19_790);
let mut decoder = CompositeDecoder::new();
let result = decoder.uncompress_unsorted(&compressed, data.len());
for i in 0..data.len() {
assert_eq!(data[i], result[i]);
}
}
#[test]
fn test_composite_sorted() {
let data = generate_array(10_000, 0.1);
let mut encoder = CompositeEncoder::new();
let compressed = encoder.compress_sorted(&data);
assert_eq!(compressed.len(), 7_822);
let mut decoder = CompositeDecoder::new();
let result = decoder.uncompress_sorted(&compressed, data.len());
for i in 0..data.len() {
assert_eq!(data[i], result[i]);
}
}
const BENCH_NUM_INTS: usize = 99_968;
#[bench]
fn bench_compress(b: &mut Bencher) {
let mut encoder = CompositeEncoder::new();
let data = generate_array(BENCH_NUM_INTS, 0.1);
b.iter(|| {
encoder.compress_sorted(&data);
});
}
#[bench]
fn bench_uncompress(b: &mut Bencher) {
let mut encoder = CompositeEncoder::new();
let data = generate_array(BENCH_NUM_INTS, 0.1);
let compressed = encoder.compress_sorted(&data);
let mut decoder = CompositeDecoder::new();
b.iter(|| {
decoder.uncompress_sorted(compressed, BENCH_NUM_INTS);
});
}
}


@@ -1,127 +0,0 @@
use common::bitpacker::compute_num_bits;
use common::bitpacker::{BitPacker, BitUnpacker};
use std::cmp;
use std::io::Write;
use super::NUM_DOCS_PER_BLOCK;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> usize {
let mut max_delta = 0;
{
let mut local_offset = offset;
for i in 0..NUM_DOCS_PER_BLOCK {
let val = vals[i];
let delta = val - local_offset;
max_delta = cmp::max(max_delta, delta);
vals[i] = delta;
local_offset = val;
}
}
let num_bits = compute_num_bits(max_delta);
output.write_all(&[num_bits]).unwrap();
let mut bit_packer = BitPacker::new(num_bits as usize);
for val in vals {
bit_packer.write(*val, &mut output).unwrap();
}
1 + bit_packer.close(&mut output).expect("packing in memory should never fail")
}
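A worked example of the cost model implied above, assuming the same layout (a one-byte num_bits header followed by 128 bit-packed deltas); the local compute_num_bits mirrors the helper imported at the top of this file:
fn compute_num_bits(max: u32) -> u8 {
    // Number of bits needed to represent `max` (0 when max == 0).
    (32 - max.leading_zeros()) as u8
}
fn main() {
    // A sorted block starting [3, 7, 12, ...] with offset 0 delta-encodes
    // to [3, 4, 5, ...]; if the largest delta in the block is 5, every
    // delta fits in compute_num_bits(5) = 3 bits.
    let num_bits = compute_num_bits(5) as usize;
    let block_bytes = 1 + (num_bits * 128 + 7) / 8;
    assert_eq!(block_bytes, 49); // vs. 512 bytes for 128 raw u32s
}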
pub struct BlockEncoder {
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
input_buffer: [u32; NUM_DOCS_PER_BLOCK],
}
impl BlockEncoder {
pub fn new() -> BlockEncoder {
BlockEncoder {
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
input_buffer: [0u32; NUM_DOCS_PER_BLOCK],
}
}
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
self.input_buffer.clone_from_slice(vals);
let compressed_size = compress_sorted(&mut self.input_buffer, &mut self.output, offset);
&self.output[..compressed_size]
}
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
let compressed_size: usize = {
let mut output: &mut [u8] = &mut self.output;
let max = vals.iter().cloned().max().expect("compress unsorted called with an empty array");
let num_bits = compute_num_bits(max);
output.write_all(&[num_bits]).unwrap();
let mut bit_packer = BitPacker::new(num_bits as usize);
for val in vals {
bit_packer.write(*val, &mut output).unwrap();
}
1 + bit_packer.close(&mut output).expect("packing in memory should never fail")
};
&self.output[..compressed_size]
}
}
pub struct BlockDecoder {
pub output: [u32; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
}
impl BlockDecoder {
pub fn new() -> BlockDecoder {
BlockDecoder::with_val(0u32)
}
pub fn with_val(val: u32) -> BlockDecoder {
BlockDecoder {
output: [val; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
}
}
pub fn uncompress_block_sorted<'a>(&mut self, compressed_data: &'a [u8], mut offset: u32) -> &'a[u8] {
let consumed_size = {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
for i in 0..NUM_DOCS_PER_BLOCK {
let delta = bit_unpacker.get(i);
let val = offset + delta;
self.output[i] = val;
offset = val;
}
1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8
};
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a[u8] {
let num_bits = compressed_data[0];
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
for i in 0..NUM_DOCS_PER_BLOCK {
self.output[i] = bit_unpacker.get(i);
}
let consumed_size = 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8;
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
}
#[inline]
pub fn output_array(&self,) -> &[u32] {
&self.output[..self.output_len]
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output[idx]
}
}


@@ -1,112 +0,0 @@
use super::NUM_DOCS_PER_BLOCK;
use libc::size_t;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
extern {
fn compress_sorted_cpp(
data: *const u32,
output: *mut u8,
offset: u32) -> size_t;
fn uncompress_sorted_cpp(
compressed_data: *const u8,
output: *mut u32,
offset: u32) -> size_t;
fn compress_unsorted_cpp(
data: *const u32,
output: *mut u8) -> size_t;
fn uncompress_unsorted_cpp(
compressed_data: *const u8,
output: *mut u32) -> size_t;
}
fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize {
unsafe { compress_sorted_cpp(vals.as_ptr(), output.as_mut_ptr(), offset) }
}
fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize {
unsafe { uncompress_sorted_cpp(compressed_data.as_ptr(), output.as_mut_ptr(), offset) }
}
fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize {
unsafe { compress_unsorted_cpp(vals.as_ptr(), output.as_mut_ptr()) }
}
fn uncompress_unsorted(compressed_data: &[u8], output: &mut [u32]) -> usize {
unsafe { uncompress_unsorted_cpp(compressed_data.as_ptr(), output.as_mut_ptr()) }
}
pub struct BlockEncoder {
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
}
impl BlockEncoder {
pub fn new() -> BlockEncoder {
BlockEncoder {
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
}
}
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
let compressed_size = compress_sorted(vals, &mut self.output, offset);
&self.output[..compressed_size]
}
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
let compressed_size = compress_unsorted(vals, &mut self.output);
&self.output[..compressed_size]
}
}
pub struct BlockDecoder {
pub output: [u32; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
}
impl BlockDecoder {
pub fn new() -> BlockDecoder {
BlockDecoder::with_val(0u32)
}
pub fn with_val(val: u32) -> BlockDecoder {
BlockDecoder {
output: [val; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
}
}
pub fn uncompress_block_sorted<'a>(&mut self, compressed_data: &'a [u8], offset: u32) -> &'a[u8] {
let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset);
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a[u8] {
let consumed_size = uncompress_unsorted(compressed_data, &mut self.output);
self.output_len = NUM_DOCS_PER_BLOCK;
&compressed_data[consumed_size..]
}
#[inline]
pub fn output_array(&self,) -> &[u32] {
&self.output[..self.output_len]
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output[idx]
}
}


@@ -1,164 +1,191 @@
#![allow(dead_code)]
mod stream;
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
pub use self::stream::CompressedIntStream;
use bitpacking::{BitPacker, BitPacker4x};
/// Returns the size in bytes of a compressed block, given `num_bits`.
pub fn compressed_block_size(num_bits: u8) -> usize {
1 + (num_bits as usize) * COMPRESSION_BLOCK_SIZE / 8
}
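// Two boundary cases of the formula above: num_bits = 0 (a block whose
// packed values are all zero) collapses to its single header byte,
// compressed_block_size(0) == 1, while num_bits = 32 gives the worst case
// 1 + 32 * 128 / 8 == 513 bytes, matching COMPRESSED_BLOCK_MAX_SIZE.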
pub struct BlockEncoder {
bitpacker: BitPacker4x,
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
pub output_len: usize,
}
impl BlockEncoder {
pub fn new() -> BlockEncoder {
BlockEncoder {
bitpacker: BitPacker4x::new(),
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
output_len: 0,
}
}
pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> &[u8] {
let num_bits = self.bitpacker.num_bits_sorted(offset, block);
self.output[0] = num_bits;
let written_size = 1 + self.bitpacker.compress_sorted(offset, block, &mut self.output[1..], num_bits);
&self.output[..written_size]
}
pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] {
let num_bits = self.bitpacker.num_bits(block);
self.output[0] = num_bits;
let written_size = 1 + self.bitpacker.compress(block, &mut self.output[1..], num_bits);
&self.output[..written_size]
}
}
pub struct BlockDecoder {
bitpacker: BitPacker4x,
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
pub output_len: usize,
}
impl BlockDecoder {
pub fn new() -> BlockDecoder {
BlockDecoder::with_val(0u32)
}
pub fn with_val(val: u32) -> BlockDecoder {
let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
output[COMPRESSION_BLOCK_SIZE] = 0u32;
BlockDecoder {
bitpacker: BitPacker4x::new(),
output,
output_len: 0,
}
}
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
let num_bits = compressed_data[0];
self.output_len = COMPRESSION_BLOCK_SIZE;
1 + self.bitpacker.decompress_sorted(offset, &compressed_data[1..], &mut self.output, num_bits)
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let num_bits = compressed_data[0];
self.output_len = COMPRESSION_BLOCK_SIZE;
1 + self.bitpacker.decompress(&compressed_data[1..], &mut self.output, num_bits)
}
#[inline]
pub fn output_array(&self) -> &[u32] {
&self.output[..self.output_len]
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output[idx]
}
}
mod vint;
pub trait VIntEncoder {
/// Compresses an array of `u32` integers,
/// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
/// and variable bytes encoding.
///
/// The method takes an array of ints to compress, and returns
/// a `&[u8]` representing the compressed data.
///
/// The method also takes an offset to give the value of the
/// hypothetical previous element in the delta-encoding.
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8];
/// Compresses an array of `u32` integers,
/// using variable bytes encoding.
///
/// The method takes an array of ints to compress, and returns
/// a `&[u8]` representing the compressed data.
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8];
}
pub trait VIntDecoder {
/// Uncompress an array of `u32` integers,
/// that were compressed using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
/// and variable byte encoding.
///
/// The method takes a number of ints to decompress, and returns
/// the number of bytes that were read to decompress them.
///
/// The method also takes an offset to give the value of the
/// hypothetical previous element in the delta-encoding.
///
/// For instance, if the delta-encoded values are `1, 3, 9` and the
/// `offset` is 5, then the output will be:
/// `5 + 1 = 6, 6 + 3 = 9, 9 + 9 = 18`
fn uncompress_vint_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize,
) -> usize;
/// Uncompress an array of `u32`s, compressed using variable
/// byte encoding.
///
/// The method takes a number of ints to decompress, and returns
/// the number of bytes that were read to decompress them.
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
}
impl VIntEncoder for BlockEncoder {
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8] {
vint::compress_sorted(input, &mut self.output, offset)
}
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] {
vint::compress_unsorted(input, &mut self.output)
}
}
impl VIntDecoder for BlockDecoder {
fn uncompress_vint_sorted<'a>(
&mut self,
compressed_data: &'a [u8],
offset: u32,
num_els: usize,
) -> usize {
self.output_len = num_els;
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
self.output_len = num_els;
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
}
#[cfg(test)]
pub mod tests {
use rand::Rng;
use rand::SeedableRng;
use rand::XorShiftRng;
use super::*;
use tests;
use test::Bencher;
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0..u32::max_value())
.filter(|_| rng.next_f32() < ratio)
.take(n)
.collect()
}
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
generate_array_with_seed(n, ratio, 4)
}
#[test]
fn test_encode_sorted_block() {
let vals: Vec<u32> = (0u32..128u32).map(|i| i * 7).collect();
let mut encoder = BlockEncoder::new();
let compressed_data = encoder.compress_block_sorted(&vals, 0);
let mut decoder = BlockDecoder::new();
{
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
assert_eq!(vals[i], decoder.output(i));
@@ -167,33 +194,33 @@ pub mod tests {
#[test]
fn test_encode_sorted_block_with_offset() {
let vals: Vec<u32> = (0u32..128u32).map(|i| 11 + i * 7).collect();
let mut encoder = BlockEncoder::new();
let compressed_data = encoder.compress_block_sorted(&vals, 10);
let mut decoder = BlockDecoder::new();
{
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10);
assert_eq!(consumed_num_bytes, compressed_data.len());
}
for i in 0..128 {
assert_eq!(vals[i], decoder.output(i));
}
}
#[test]
fn test_encode_sorted_block_with_junk() {
let mut compressed: Vec<u8> = Vec::new();
let n = 128;
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32).collect();
let mut encoder = BlockEncoder::new();
let compressed_data = encoder.compress_block_sorted(&vals, 10);
compressed.extend_from_slice(compressed_data);
compressed.push(173u8);
let mut decoder = BlockDecoder::new();
{
let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10);
assert_eq!(consumed_num_bytes, compressed.len() - 1);
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
@@ -204,72 +231,94 @@ pub mod tests {
fn test_encode_unsorted_block_with_junk() {
let mut compressed: Vec<u8> = Vec::new();
let n = 128;
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
let mut encoder = BlockEncoder::new();
let compressed_data = encoder.compress_block_unsorted(&vals);
compressed.extend_from_slice(compressed_data);
compressed.push(173u8);
let mut decoder = BlockDecoder::new();
{
let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed);
assert_eq!(consumed_num_bytes + 1, compressed.len());
assert_eq!(compressed[consumed_num_bytes], 173u8);
}
for i in 0..n {
assert_eq!(vals[i], decoder.output(i));
}
}
#[test]
fn test_encode_vint() {
{
let expected_length = 154;
let mut encoder = BlockEncoder::new();
let input: Vec<u32> = (0u32..123u32).map(|i| 4 + i * 7 / 2).into_iter().collect();
for offset in &[0u32, 1u32, 2u32] {
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
assert!(encoded_data.len() <= expected_length);
let mut decoder = BlockDecoder::new();
let consumed_num_bytes =
decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
assert_eq!(consumed_num_bytes, encoded_data.len());
assert_eq!(input, decoder.output_array());
}
}
{
let mut encoder = BlockEncoder::new();
let input = vec!(3u32, 17u32, 187u32);
let encoded_data = encoder.compress_vint_sorted(&input, 0);
assert_eq!(encoded_data.len(), 4);
assert_eq!(encoded_data[0], 3u8 + 128u8);
assert_eq!(encoded_data[1], (17u8 - 3u8) + 128u8);
assert_eq!(encoded_data[2], (187u8 - 17u8 - 128u8));
assert_eq!(encoded_data[3], (1u8 + 128u8));
}
}
#[bench]
fn bench_compress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
b.iter(|| {
encoder.compress_block_sorted(&data, 0u32);
});
}
#[bench]
fn bench_uncompress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let compressed = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| {
decoder.uncompress_block_sorted(compressed, 0u32);
});
}
#[test]
fn test_all_docs_compression_numbits() {
for num_bits in 0..33 {
let mut data = [0u32; 128];
if num_bits > 0 {
data[0] = 1 << (num_bits - 1);
}
let mut encoder = BlockEncoder::new();
let compressed = encoder.compress_block_unsorted(&data);
assert_eq!(compressed[0] as usize, num_bits);
assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
}
}
const NUM_INTS_BENCH_VINT: usize = 10;
#[bench]
fn bench_compress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
b.iter(|| {
encoder.compress_vint_sorted(&data, 0u32);
});
}
#[bench]
fn bench_uncompress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
});
}
}

src/compression/stream.rs Normal file

@@ -0,0 +1,158 @@
use compression::BlockDecoder;
use compression::COMPRESSION_BLOCK_SIZE;
use compression::compressed_block_size;
use directory::{ReadOnlySource, SourceRead};
/// Reads a stream of compressed ints.
///
/// Tantivy uses `CompressedIntStream` to read
/// the position file.
/// The `.skip(...)` makes it possible to avoid
/// decompressing blocks that are not required.
pub struct CompressedIntStream {
buffer: SourceRead,
block_decoder: BlockDecoder,
cached_addr: usize, // address of the currently decoded block
cached_next_addr: usize, // address following the currently decoded block
addr: usize, // address of the block associated to the current position
inner_offset: usize,
}
impl CompressedIntStream {
/// Opens a compressed int stream.
pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
CompressedIntStream {
buffer: SourceRead::from(source),
block_decoder: BlockDecoder::new(),
cached_addr: usize::max_value(),
cached_next_addr: usize::max_value(),
addr: 0,
inner_offset: 0,
}
}
/// Loads the block at the given address and returns the address of the
/// following block.
pub fn read_block(&mut self, addr: usize) -> usize {
if self.cached_addr == addr {
// we are already on this block.
// no need to read.
self.cached_next_addr
} else {
let next_addr = addr + self.block_decoder.uncompress_block_unsorted(self.buffer.slice_from(addr));
self.cached_addr = addr;
self.cached_next_addr = next_addr;
next_addr
}
}
/// Fills a buffer with the next `output.len()` integers.
/// This does not consume / advance the stream.
pub fn read(&mut self, output: &mut [u32]) {
let mut cursor = self.addr;
let mut inner_offset = self.inner_offset;
let mut num_els: usize = output.len();
let mut start = 0;
loop {
cursor = self.read_block(cursor);
let block = &self.block_decoder.output_array()[inner_offset..];
let block_len = block.len();
if num_els >= block_len {
output[start..start + block_len].clone_from_slice(&block);
start += block_len;
num_els -= block_len;
inner_offset = 0;
} else {
output[start..].clone_from_slice(&block[..num_els]);
break;
}
}
}
/// Skips the next `skip_len` integers.
///
/// If a full block is skipped, calling
/// `.skip(...)` will avoid decompressing it.
///
/// May panic if the end of the stream is reached.
pub fn skip(&mut self, mut skip_len: usize) {
loop {
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
if available >= skip_len {
self.inner_offset += skip_len;
break;
} else {
skip_len -= available;
// entirely skip decompressing some blocks.
let num_bits: u8 = self.buffer.get(self.addr);
let block_len = compressed_block_size(num_bits);
self.addr += block_len;
self.inner_offset = 0;
}
}
}
}
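A hedged usage sketch of the read/skip contract documented above (the stream is assumed to wrap a positions source, as built in the tests below):
fn demo(stream: &mut CompressedIntStream) {
    let mut buf = [0u32; 4];
    stream.read(&mut buf); // peeks: the stream position is unchanged
    stream.read(&mut buf); // so this returns the same four values again
    stream.skip(buf.len()); // only skip() actually advances the cursor
}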
#[cfg(test)]
pub mod tests {
use super::CompressedIntStream;
use compression::compressed_block_size;
use compression::COMPRESSION_BLOCK_SIZE;
use compression::BlockEncoder;
use directory::ReadOnlySource;
fn create_stream_buffer() -> ReadOnlySource {
let mut buffer: Vec<u8> = vec![];
let mut encoder = BlockEncoder::new();
let vals: Vec<u32> = (0u32..1152u32).collect();
for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) {
let compressed_block = encoder.compress_block_unsorted(chunk);
let num_bits = compressed_block[0];
assert_eq!(compressed_block_size(num_bits), compressed_block.len());
buffer.extend_from_slice(compressed_block);
}
if cfg!(simd) {
buffer.extend_from_slice(&[0u8; 7]);
}
ReadOnlySource::from(buffer)
}
#[test]
fn test_compressed_int_stream() {
let buffer = create_stream_buffer();
let mut stream = CompressedIntStream::wrap(buffer);
let mut block: [u32; COMPRESSION_BLOCK_SIZE] = [0u32; COMPRESSION_BLOCK_SIZE];
stream.read(&mut block[0..2]);
assert_eq!(block[0], 0);
assert_eq!(block[1], 1);
// reading does not consume the stream
stream.read(&mut block[0..2]);
assert_eq!(block[0], 0);
assert_eq!(block[1], 1);
stream.skip(2);
stream.skip(5);
stream.read(&mut block[0..3]);
stream.skip(3);
assert_eq!(block[0], 7);
assert_eq!(block[1], 8);
assert_eq!(block[2], 9);
stream.skip(500);
stream.read(&mut block[0..3]);
stream.skip(3);
assert_eq!(block[0], 510);
assert_eq!(block[1], 511);
assert_eq!(block[2], 512);
stream.skip(511);
stream.read(&mut block[..1]);
assert_eq!(block[0], 1024);
}
}

src/compression/vint.rs Normal file

@@ -0,0 +1,92 @@
#[inline(always)]
pub(crate) fn compress_sorted<'a>(
input: &[u32],
output: &'a mut [u8],
mut offset: u32,
) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v - offset;
offset = v;
loop {
let next_byte: u8 = (to_encode % 128u32) as u8;
to_encode /= 128u32;
if to_encode == 0u32 {
output[byte_written] = next_byte | 128u8;
byte_written += 1;
break;
} else {
output[byte_written] = next_byte;
byte_written += 1;
}
}
}
&output[..byte_written]
}
#[inline(always)]
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v;
loop {
let next_byte: u8 = (to_encode % 128u32) as u8;
to_encode /= 128u32;
if to_encode == 0u32 {
output[byte_written] = next_byte | 128u8;
byte_written += 1;
break;
} else {
output[byte_written] = next_byte;
byte_written += 1;
}
}
}
&output[..byte_written]
}
#[inline(always)]
pub(crate) fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32,
) -> usize {
let mut read_byte = 0;
let mut result = offset;
let num_els = output.len();
for i in 0..num_els {
let mut shift = 0u32;
loop {
let cur_byte = compressed_data[read_byte];
read_byte += 1;
result += ((cur_byte % 128u8) as u32) << shift;
if cur_byte & 128u8 != 0u8 {
break;
}
shift += 7;
}
output[i] = result;
}
read_byte
}
#[inline(always)]
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
let mut read_byte = 0;
let num_els = output.len();
for i in 0..num_els {
let mut result = 0u32;
let mut shift = 0u32;
loop {
let cur_byte = compressed_data[read_byte];
read_byte += 1;
result += ((cur_byte % 128u8) as u32) << shift;
if cur_byte & 128u8 != 0u8 {
break;
}
shift += 7;
}
output[i] = result;
}
read_byte
}


@@ -1,51 +1,49 @@
use Result;
use error::{ErrorKind, ResultExt};
use serde_json;
use schema::Schema;
use std::sync::Arc;
use std::borrow::BorrowMut;
use std::fmt;
use core::SegmentId;
#[cfg(feature="mmap")]
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use indexer::index_writer::open_index_writer;
use core::searcher::Searcher;
use std::convert::From;
use num_cpus;
use super::segment::Segment;
use core::SegmentReader;
use super::pool::Pool;
use core::SegmentMeta;
use super::pool::LeasedItem;
use std::path::Path;
use core::IndexMeta;
use indexer::DirectoryLock;
use IndexWriter;
use directory::ManagedDirectory;
use core::META_FILEPATH;
use super::segment::create_segment;
use indexer::segment_updater::save_new_metas;
use tokenizer::TokenizerManager;
const NUM_SEARCHERS: usize = 12;
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string).chain_err(|| ErrorKind::CorruptedFile(META_FILEPATH.clone()))
}
/// Search Index
pub struct Index {
directory: ManagedDirectory,
schema: Schema,
searcher_pool: Arc<Pool<Searcher>>,
tokenizers: TokenizerManager,
}
impl Index {
@@ -54,18 +52,29 @@ impl Index {
/// The index will be allocated in anonymous memory.
/// This should only be used for unit tests.
pub fn create_in_ram(schema: Schema) -> Index {
let ram_directory = RAMDirectory::create();
// unwrap is ok here
let directory = ManagedDirectory::new(ram_directory).expect(
"Creating a managed directory from a brand new RAM directory \
should never fail.",
);
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
}
/// Creates a new index in a given filepath.
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
#[cfg(feature="mmap")]
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
Index::from_directory(directory, schema)
}
/// Accessor for the tokenizer manager.
pub fn tokenizers(&self) -> &TokenizerManager {
&self.tokenizers
}
/// Creates a new index in a temp directory.
@@ -76,65 +85,82 @@ impl Index {
///
/// The temp directory is only used for testing the `MmapDirectory`.
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
#[cfg(feature="mmap")]
#[cfg(test)]
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::create_from_tempdir()?;
let directory = ManagedDirectory::new(mmap_directory)?;
Index::from_directory(directory, schema)
}
/// Creates a new index given a directory and an `IndexMeta`.
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
let schema = metas.schema.clone();
let index = Index {
directory,
schema,
searcher_pool: Arc::new(Pool::new()),
tokenizers: TokenizerManager::default(),
};
index.load_searchers()?;
Ok(index)
}
/// Create a new index from a directory.
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas)
}
/// Opens a new directory from an index path.
#[cfg(feature="mmap")]
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
pub fn open_directory<TDirectory: Directory>(directory: TDirectory) -> Result<Index> {
let directory = ManagedDirectory::new(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
/// Reads the index meta file from the directory.
pub fn load_metas(&self) -> Result<IndexMeta> {
load_metas(self.directory())
}
/// Open a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
/// that due to a panic or other error, a stale lockfile will be
/// left in the index directory. If you are sure that no other
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// num_threads specifies the number of indexing workers that
/// should work at the same time.
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads(
&self,
num_threads: usize,
heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
}
/// Creates a multithreaded writer
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
@@ -151,65 +177,63 @@ impl Index {
}
/// Returns the list of segments that are searchable
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
Ok(self.searchable_segment_metas()?
.into_iter()
.map(|segment_meta| self.segment(segment_meta))
.collect())
}
#[doc(hidden)]
pub fn segment(&self, segment_meta: SegmentMeta) -> Segment {
create_segment(self.clone(), segment_meta)
}
/// Creates a new segment.
pub fn new_segment(&self) -> Segment {
let segment_meta = SegmentMeta::new(SegmentId::generate_random());
create_segment(self.clone(), segment_meta)
}
/// Return a reference to the index directory.
pub fn directory(&self) -> &ManagedDirectory {
&self.directory
}
/// Return a mutable reference to the index directory.
pub fn directory_mut(&mut self) -> &mut ManagedDirectory {
&mut self.directory
}
/// Reads the meta.json and returns the list of
/// `SegmentMeta` from the last commit.
pub fn searchable_segment_metas(&self) -> Result<Vec<SegmentMeta>> {
Ok(self.load_metas()?.segments)
}
/// Returns the list of segment ids that are searchable.
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
Ok(self.searchable_segment_metas()?
.iter()
.map(|segment_meta| segment_meta.id())
.collect())
}
/// Creates a new generation of searchers after
/// a change of the set of searchable indexes.
///
/// This needs to be called when a new segment has been
/// published or after a merge.
pub fn load_searchers(&self) -> Result<()> {
let searchable_segments = self.searchable_segments()?;
let segment_readers: Vec<SegmentReader> = searchable_segments
.iter()
.map(SegmentReader::open)
.collect::<Result<_>>()?;
let searchers = (0..NUM_SEARCHERS)
.map(|_| Searcher::from(segment_readers.clone()))
.collect();
self.searcher_pool.publish_new_generation(searchers);
Ok(())
}
@@ -229,7 +253,6 @@ impl Index {
}
}
impl fmt::Debug for Index {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Index({:?})", self.directory)
@@ -239,12 +262,10 @@ impl fmt::Debug for Index {
impl Clone for Index {
fn clone(&self) -> Index {
Index {
directory: self.directory.clone(),
schema: self.schema.clone(),
searcher_pool: Arc::clone(&self.searcher_pool),
tokenizers: self.tokenizers.clone(),
}
}
}
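A hedged end-to-end sketch of the API above (schema built as in the IndexMeta test later in this diff; imports and the heap size are illustrative):
fn example() -> Result<()> {
    let mut schema_builder = SchemaBuilder::new();
    schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    // One indexing thread with a 50 MB heap; errors if the lockfile is held.
    let index_writer = index.writer_with_num_threads(1, 50_000_000)?;
    // ... add documents and commit through `index_writer` ...
    drop(index_writer);
    index.load_searchers()?; // make newly committed segments searchable
    Ok(())
}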


@@ -1,47 +1,68 @@
use schema::Schema;
use core::SegmentId;
use core::SegmentMeta;
use std::fmt;
use serde_json;
/// Meta information about the `Index`.
///
/// This object is serialized on disk in the `meta.json` file.
/// It keeps information about
/// * the searchable segments,
/// * the index `docstamp`
/// * the schema
///
#[derive(Clone, Serialize, Deserialize)]
pub struct IndexMeta {
pub segments: Vec<SegmentMeta>,
pub schema: Schema,
pub opstamp: u64,
#[serde(skip_serializing_if = "Option::is_none")]
pub payload: Option<String>,
}
impl IndexMeta {
pub fn with_schema(schema: Schema) -> IndexMeta {
IndexMeta {
segments: vec![],
schema,
opstamp: 0u64,
payload: None,
}
}
}
impl fmt::Debug for IndexMeta {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"{}",
serde_json::ser::to_string(self)
.expect("JSON serialization for IndexMeta should never fail.")
)
}
}
#[cfg(test)]
mod tests {
use serde_json;
use super::IndexMeta;
use schema::{SchemaBuilder, TEXT};
#[test]
fn test_serialize_metas() {
let schema = {
let mut schema_builder = SchemaBuilder::new();
schema_builder.add_text_field("text", TEXT);
schema_builder.build()
};
let index_metas = IndexMeta {
segments: Vec::new(),
schema: schema,
opstamp: 0u64,
payload: None,
};
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#);
}
}


@@ -0,0 +1,184 @@
use directory::{ReadOnlySource, SourceRead};
use termdict::{TermDictionary, TermDictionaryImpl};
use postings::{BlockSegmentPostings, SegmentPostings};
use postings::TermInfo;
use schema::IndexRecordOption;
use schema::Term;
use compression::CompressedIntStream;
use postings::FreqReadingOption;
use common::BinarySerializable;
use schema::FieldType;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.
///
/// # Note
///
/// It is safe to delete the segment associated to
/// an `InvertedIndexReader`. As long as it is open,
/// the `ReadOnlySource` it is relying on should
/// stay available.
///
///
/// `InvertedIndexReader`s are created by calling
/// the `SegmentReader`'s [`.inverted_index(...)`] method
pub struct InvertedIndexReader {
termdict: TermDictionaryImpl,
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
record_option: IndexRecordOption,
total_num_tokens: u64
}
impl InvertedIndexReader {
pub(crate) fn new(
termdict: TermDictionaryImpl,
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
record_option: IndexRecordOption,
) -> InvertedIndexReader {
let total_num_tokens_data = postings_source.slice(0, 8);
let mut total_num_tokens_cursor = total_num_tokens_data.as_slice();
let total_num_tokens = u64::deserialize(&mut total_num_tokens_cursor).unwrap_or(0u64);
InvertedIndexReader {
termdict,
postings_source: postings_source.slice_from(8),
positions_source,
record_option,
total_num_tokens
}
}
/// Creates an empty `InvertedIndexReader` object, which
/// contains no terms at all.
pub fn empty(field_type: FieldType) -> InvertedIndexReader {
let record_option = field_type
.get_index_record_option()
.unwrap_or(IndexRecordOption::Basic);
InvertedIndexReader {
termdict: TermDictionaryImpl::empty(field_type),
postings_source: ReadOnlySource::empty(),
positions_source: ReadOnlySource::empty(),
record_option,
total_num_tokens: 0u64
}
}
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
self.termdict.get(term.value_bytes())
}
/// Return the term dictionary datastructure.
pub fn terms(&self) -> &TermDictionaryImpl {
&self.termdict
}
/// Resets the block segment to another position of the postings
/// file.
///
/// This is useful for enumerating through a list of terms,
/// and consuming the associated posting lists while avoiding
/// reallocating a `BlockSegmentPostings`.
///
/// # Warning
///
/// This does not reset the positions list.
pub fn reset_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
block_postings: &mut BlockSegmentPostings,
) {
let offset = term_info.postings_offset as usize;
let end_source = self.postings_source.len();
let postings_slice = self.postings_source.slice(offset, end_source);
let postings_reader = SourceRead::from(postings_slice);
block_postings.reset(term_info.doc_freq as usize, postings_reader);
}
/// Returns a block postings given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
pub fn read_block_postings_from_terminfo(
&self,
term_info: &TermInfo,
requested_option: IndexRecordOption,
) -> BlockSegmentPostings {
let offset = term_info.postings_offset as usize;
let postings_data = self.postings_source.slice_from(offset);
let freq_reading_option = match (self.record_option, requested_option) {
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
(_, _) => FreqReadingOption::ReadFreq,
};
BlockSegmentPostings::from_data(
term_info.doc_freq as usize,
SourceRead::from(postings_data),
freq_reading_option,
)
}
/// Returns a posting object given a `term_info`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using `read_postings` instead.
pub fn read_postings_from_terminfo(
&self,
term_info: &TermInfo,
option: IndexRecordOption,
) -> SegmentPostings {
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
let position_stream = {
if option.has_positions() {
let position_offset = term_info.positions_offset;
let positions_source = self.positions_source.slice_from(position_offset as usize);
let mut stream = CompressedIntStream::wrap(positions_source);
stream.skip(term_info.positions_inner_offset as usize);
Some(stream)
} else {
None
}
};
SegmentPostings::from_block_postings(block_postings, position_stream)
}
/// Returns the total number of tokens recorded for all documents
/// (including deleted documents).
pub fn total_num_tokens(&self) -> u64 {
self.total_num_tokens
}
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with the indexing options that cover
/// the requested options, the method does not fail
/// and returns a `SegmentPostings` with as much information as possible.
///
/// For instance, requesting `IndexRecordOption::Freq` for a
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
/// with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
let term_info = get!(self.get_term_info(term));
Some(self.read_postings_from_terminfo(&term_info, option))
}
pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
let term_info = get!(self.get_term_info(term));
Some(self.read_postings_from_terminfo(&term_info, option))
}
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> u32 {
self.get_term_info(term)
.map(|term_info| term_info.doc_freq)
.unwrap_or(0u32)
}
}
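A hypothetical caller-side sketch (`SegmentReader::inverted_index` and `Term::from_field_text` are assumed from the surrounding crate, as used elsewhere in this diff):
fn doc_freq_of(segment_reader: &SegmentReader, field: Field, text: &str) -> u32 {
    let term = Term::from_field_text(field, text);
    // Unknown terms simply report a document frequency of 0.
    segment_reader.inverted_index(field).doc_freq(&term)
}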


@@ -1,5 +1,4 @@
pub mod searcher;
pub mod index;
mod segment_reader;
mod segment_id;
@@ -7,20 +6,37 @@ mod segment_component;
mod segment;
mod index_meta;
mod pool;
mod segment_meta;
mod inverted_index_reader;
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;
pub use self::segment::Segment;
pub use self::segment::SegmentInfo;
pub use self::segment::SerializableSegment;
pub use self::index::Index;
pub use self::segment_meta::SegmentMeta;
pub use self::index_meta::IndexMeta;
use std::path::PathBuf;
lazy_static! {
/// The meta file contains all the information about the list of segments and the schema
/// of the index.
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");
/// The managed file contains the list of files that were created by tantivy,
/// and that will therefore be garbage collected when they are deemed useless by tantivy.
///
/// Removing this file is safe, but will prevent the garbage collection of all of the files that
/// are currently in the directory.
pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
/// Only one process should be able to write tantivy's index at a time.
/// This file, when present, is in charge of preventing other processes from opening an `IndexWriter`.
///
/// If the process is killed and this file remains, it is safe to remove it manually.
pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
}


@@ -17,10 +17,10 @@ pub struct Pool<T> {
}
impl<T> Pool<T> {
pub fn new() -> Pool<T> {
let queue = Arc::new(MsQueue::new());
Pool {
queue,
freshest_generation: AtomicUsize::default(),
next_generation: AtomicUsize::default(),
}
@@ -30,52 +30,52 @@ impl<T> Pool<T> {
let next_generation = self.next_generation.fetch_add(1, Ordering::SeqCst) + 1;
for item in items {
let gen_item = GenerationItem {
item,
generation: next_generation,
};
self.queue.push(gen_item);
}
self.advertise_generation(next_generation);
}
/// At the exit of this method,
/// - freshest_generation has a value greater than or equal to generation
/// - freshest_generation has a value that has been advertised
fn advertise_generation(&self, generation: usize) {
// not optimal at all but the easiest to read proof.
loop {
let former_generation = self.freshest_generation.load(Ordering::Acquire);
if former_generation >= generation {
break;
}
self.freshest_generation.compare_and_swap(
former_generation,
generation,
Ordering::SeqCst,
);
}
}
fn generation(&self) -> usize {
self.freshest_generation.load(Ordering::Acquire)
}
pub fn acquire(&self) -> LeasedItem<T> {
let generation = self.generation();
loop {
let gen_item = self.queue.pop();
if gen_item.generation >= generation {
return LeasedItem {
gen_item: Some(gen_item),
recycle_queue: Arc::clone(&self.queue),
};
} else {
// this searcher is obsolete,
// removing it from the pool.
}
}
}
}
pub struct LeasedItem<T> {
@@ -84,29 +84,33 @@ pub struct LeasedItem<T> {
}
impl<T> Deref for LeasedItem<T> {
type Target = T;
fn deref(&self) -> &T {
&self.gen_item.as_ref().expect("Unwrapping a leased item should never fail").item // unwrap is safe here
&self.gen_item
.as_ref()
.expect("Unwrapping a leased item should never fail")
.item // unwrap is safe here
}
}
impl<T> DerefMut for LeasedItem<T> {
fn deref_mut(&mut self) -> &mut T {
&mut self.gen_item.as_mut().expect("Unwrapping a mut leased item should never fail").item // unwrap is safe here
&mut self.gen_item
.as_mut()
.expect("Unwrapping a mut leased item should never fail")
.item // unwrap is safe here
}
}
impl<T> Drop for LeasedItem<T> {
fn drop(&mut self) {
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None).expect("Unwrapping a leased item should never fail");
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
.expect("Unwrapping a leased item should never fail");
self.recycle_queue.push(gen_item);
}
}
#[cfg(test)]
mod tests {
@@ -127,4 +131,4 @@ mod tests {
assert_eq!(*pool.acquire(), 11);
}
}
}
}
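For context, here is a sketch of the pool's contract, assuming the publish method whose body appears in the hunk above is `publish_new_generation` (its name in tantivy's tree); `LeasedItem` recycles its item back onto the queue on drop:

let pool: Pool<u32> = Pool::new();
pool.publish_new_generation(vec![1, 2, 3]);
{
    let leased: LeasedItem<u32> = pool.acquire(); // Deref<Target = u32>
    let _value: u32 = *leased;
} // dropping the lease pushes the item back onto the queue

// Once a new generation is advertised, stale items popped from the
// queue are silently discarded rather than handed out again.
pool.publish_new_generation(vec![10, 11, 12]);
assert!(*pool.acquire() >= 10);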

View File

@@ -2,72 +2,108 @@ use Result;
use core::SegmentReader;
use schema::Document;
use collector::Collector;
use common::TimerTree;
use query::Query;
use DocId;
use DocAddress;
use schema::Term;
use schema::{Field, Term};
use termdict::{TermDictionary, TermMerger};
use std::sync::Arc;
use std::fmt;
use core::InvertedIndexReader;
/// Holds a list of `SegmentReader`s ready for search.
///
/// It guarantees that the `Segment` will not be removed before
/// the destruction of the `Searcher`.
///
#[derive(Debug)]
///
pub struct Searcher {
segment_readers: Vec<SegmentReader>,
}
impl Searcher {
/// Fetches a document from tantivy's store given a `DocAddress`.
///
/// The searcher uses the segment ordinal to route
/// the request to the right `Segment`.
pub fn doc(&self, doc_address: &DocAddress) -> Result<Document> {
let DocAddress(segment_local_id, doc_id) = *doc_address;
let segment_reader = &self.segment_readers[segment_local_id as usize];
segment_reader.doc(doc_id)
}
/// Returns the overall number of documents in the index.
pub fn num_docs(&self,) -> DocId {
pub fn num_docs(&self) -> u64 {
self.segment_readers
.iter()
.map(|segment_reader| segment_reader.num_docs())
.fold(0u32, |acc, val| acc + val)
.map(|segment_reader| segment_reader.num_docs() as u64)
.sum::<u64>()
}
/// Return the overall number of documents containing
/// the given term.
pub fn doc_freq(&self, term: &Term) -> u32 {
pub fn doc_freq(&self, term: &Term) -> u64 {
self.segment_readers
.iter()
.map(|segment_reader| segment_reader.doc_freq(term))
.fold(0u32, |acc, val| acc + val)
.map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term) as u64)
.sum::<u64>()
}
/// Return the list of segment readers
pub fn segment_readers(&self,) -> &Vec<SegmentReader> {
pub fn segment_readers(&self) -> &[SegmentReader] {
&self.segment_readers
}
/// Returns the segment_reader associated with the given segment_ordinal
pub fn segment_reader(&self, segment_ord: usize) -> &SegmentReader {
&self.segment_readers[segment_ord]
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
&self.segment_readers[segment_ord as usize]
}
/// Runs a query on the segment readers wrapped by the searcher
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<TimerTree> {
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
query.search(self, collector)
}
/// Return the field searcher associated to a `Field`.
pub fn field(&self, field: Field) -> FieldSearcher {
let inv_index_readers = self.segment_readers
.iter()
.map(|segment_reader| segment_reader.inverted_index(field))
.collect::<Vec<_>>();
FieldSearcher::new(inv_index_readers)
}
}
pub struct FieldSearcher {
inv_index_readers: Vec<Arc<InvertedIndexReader>>,
}
impl FieldSearcher {
fn new(inv_index_readers: Vec<Arc<InvertedIndexReader>>) -> FieldSearcher {
FieldSearcher { inv_index_readers }
}
/// Returns a Stream over all of the sorted unique terms
/// of the given field.
pub fn terms(&self) -> TermMerger {
let term_streamers: Vec<_> = self.inv_index_readers
.iter()
.map(|inverted_index| inverted_index.terms().stream())
.collect();
TermMerger::new(term_streamers)
}
}
impl From<Vec<SegmentReader>> for Searcher {
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
Searcher {
segment_readers: segment_readers,
}
Searcher { segment_readers }
}
}
}
impl fmt::Debug for Searcher {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let segment_ids = self.segment_readers
.iter()
.map(|segment_reader| segment_reader.segment_id())
.collect::<Vec<_>>();
write!(f, "Searcher({:?})", segment_ids)
}
}
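A short usage sketch of the reworked `Searcher` API. `index`, the `title` field, and the `TermMerger` streaming interface (`advance()`/`key()`) are assumptions about surrounding code that is not part of this diff:

// `index` is an open tantivy Index; `title` is a text Field of its schema.
let searcher = index.searcher();

// num_docs() now sums per-segment counts into a u64.
let total_docs: u64 = searcher.num_docs();

// doc_freq() now routes through each segment's InvertedIndexReader.
let term = Term::from_field_text(title, "diamond");
assert!(searcher.doc_freq(&term) <= total_docs);

// The new FieldSearcher merges the sorted term streams of every segment.
let mut terms = searcher.field(title).terms();
while terms.advance() {
    let _term_bytes: &[u8] = terms.key();
}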

View File

@@ -1,100 +1,98 @@
use Result;
use std::path::PathBuf;
use schema::Schema;
use DocId;
use std::fmt;
use core::SegmentId;
use directory::{ReadOnlySource, WritePtr};
use directory::{FileProtection, ReadOnlySource, WritePtr};
use indexer::segment_serializer::SegmentSerializer;
use super::SegmentComponent;
use core::Index;
use std::result;
use directory::error::{FileError, OpenWriteError};
use directory::Directory;
use core::SegmentMeta;
use directory::error::{OpenReadError, OpenWriteError};
/// A segment is a piece of the index.
#[derive(Clone)]
pub struct Segment {
index: Index,
segment_id: SegmentId,
meta: SegmentMeta,
}
impl fmt::Debug for Segment {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Segment({:?})", self.segment_id.uuid_string())
write!(f, "Segment({:?})", self.id().uuid_string())
}
}
/// Creates a new segment given an `Index` and a `SegmentId`
///
/// The function is here to make it private outside `tantivy`.
pub fn create_segment(index: Index, segment_id: SegmentId) -> Segment {
Segment {
index: index,
segment_id: segment_id,
}
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
Segment { index, meta }
}
impl Segment {
/// Returns the index the segment belongs to.
pub fn index(&self) -> &Index {
&self.index
}
/// Returns our index's schema.
pub fn schema(&self,) -> Schema {
pub fn schema(&self) -> Schema {
self.index.schema()
}
/// Returns the segment's id.
pub fn id(&self,) -> SegmentId {
self.segment_id
/// Returns the segment meta-information
pub fn meta(&self) -> &SegmentMeta {
&self.meta
}
#[doc(hidden)]
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
self.meta.set_delete_meta(num_deleted_docs, opstamp);
}
/// Returns the segment's id.
pub fn id(&self) -> SegmentId {
self.meta.id()
}
/// Returns the relative path of a component of our segment.
///
/// It just joins the segment id with the extension
/// associated to a segment component.
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
self.segment_id.relative_path(component)
self.meta.relative_path(component)
}
/// Deletes all of the documents of the segment.
/// This is called when there is a merge or a rollback.
/// Protects a specific component file from being deleted.
///
/// # Disclaimer
/// If deletion of a file fails (e.g. the file
/// was read-only), the method does not
/// fail and just logs an error.
pub fn delete(&self,) {
for component in SegmentComponent::values() {
let rel_path = self.relative_path(component);
if let Err(err) = self.index.directory().delete(&rel_path) {
match err {
FileError::FileDoesNotExist(_) => {
// this is normal behavior.
// the position file, for instance, may not exist.
}
FileError::IOError(err) => {
error!("Failed to remove {:?} : {:?}", self.segment_id, err);
}
}
}
}
/// Returns a FileProtection object. The file is guaranteed
/// to not be garbage collected as long as this `FileProtection` object
/// lives.
pub fn protect_from_delete(&self, component: SegmentComponent) -> FileProtection {
let path = self.relative_path(component);
self.index.directory().protect_file_from_delete(&path)
}
/// Open one of the component files for read.
pub fn open_read(&self, component: SegmentComponent) -> result::Result<ReadOnlySource, FileError> {
/// Open one of the component files for a *regular* read.
pub fn open_read(
&self,
component: SegmentComponent,
) -> result::Result<ReadOnlySource, OpenReadError> {
let path = self.relative_path(component);
let source = try!(self.index.directory().open_read(&path));
let source = self.index.directory().open_read(&path)?;
Ok(source)
}
/// Open one of the component files for write.
pub fn open_write(&mut self, component: SegmentComponent) -> result::Result<WritePtr, OpenWriteError> {
/// Open one of the component files for a *regular* write.
pub fn open_write(
&mut self,
component: SegmentComponent,
) -> result::Result<WritePtr, OpenWriteError> {
let path = self.relative_path(component);
let write = try!(self.index.directory_mut().open_write(&path));
let write = self.index.directory_mut().open_write(&path)?;
Ok(write)
}
}
@@ -108,7 +106,34 @@ pub trait SerializableSegment {
fn write(&self, serializer: SegmentSerializer) -> Result<u32>;
}
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
pub struct SegmentInfo {
pub max_doc: DocId,
}
#[cfg(test)]
mod tests {
use core::SegmentComponent;
use directory::Directory;
use std::collections::HashSet;
use schema::SchemaBuilder;
use Index;
#[test]
fn test_segment_protect_component() {
let mut index = Index::create_in_ram(SchemaBuilder::new().build());
let segment = index.new_segment();
let path = segment.relative_path(SegmentComponent::POSTINGS);
let directory = index.directory_mut();
directory.atomic_write(&*path, &vec![0u8]).unwrap();
let living_files = HashSet::new();
{
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
assert!(directory.exists(&*path));
directory.garbage_collect(|| living_files.clone());
assert!(directory.exists(&*path));
}
directory.garbage_collect(|| living_files);
assert!(!directory.exists(&*path));
}
}

View File

@@ -1,41 +1,41 @@
use std::vec::IntoIter;
/// Enum describing each component of a tantivy segment.
/// Each component is stored in its own file,
/// using the pattern `segment_uuid`.`component_extension`,
/// except the delete component, which uses `segment_uuid`.`delete_opstamp`.`component_extension`.
#[derive(Copy, Clone)]
pub enum SegmentComponent {
INFO,
/// Postings (or inverted list). Sorted lists of document ids, associated to terms
POSTINGS,
/// Positions of terms in each document.
POSITIONS,
/// Column-oriented random-access storage of fields.
FASTFIELDS,
/// Stores the sum of the length (in terms) of each field for each document.
/// Field norms are stored as a special u64 fast field.
FIELDNORMS,
/// Dictionary associating `Term`s to `TermInfo`s which is
/// simply an address into the `postings` file and the `positions` file.
TERMS,
/// Row-oriented, LZ4-compressed storage of the documents.
/// Accessing a document from the store is relatively slow, as it
/// requires decompressing the entire block it belongs to.
STORE,
/// Bitset describing which document of the segment is deleted.
DELETE,
}
impl SegmentComponent {
pub fn values() -> IntoIter<SegmentComponent> {
vec!(
SegmentComponent::INFO,
/// Iterates through the components.
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,
SegmentComponent::FASTFIELDS,
SegmentComponent::FIELDNORMS,
SegmentComponent::TERMS,
SegmentComponent::STORE,
).into_iter()
}
pub fn path_suffix(&self) -> &'static str {
match *self {
SegmentComponent::POSITIONS => ".pos",
SegmentComponent::INFO => ".info",
SegmentComponent::POSTINGS => ".idx",
SegmentComponent::TERMS => ".term",
SegmentComponent::STORE => ".store",
SegmentComponent::FASTFIELDS => ".fast",
SegmentComponent::FIELDNORMS => ".fieldnorm",
}
SegmentComponent::DELETE,
];
SEGMENT_COMPONENTS.into_iter()
}
}
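The new `iterator()` makes component enumeration reusable. For instance, `SegmentMeta::list_files` (introduced later in this diff) boils down to the following sketch:

use std::path::PathBuf;

// Sketch: every file a segment may own, derived from its meta.
fn segment_files(meta: &SegmentMeta) -> Vec<PathBuf> {
    SegmentComponent::iterator()
        .map(|component| meta.relative_path(*component))
        .collect()
}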

View File

@@ -1,34 +1,36 @@
use uuid::Uuid;
use std::fmt;
use rustc_serialize::{Encoder, Decoder, Encodable, Decodable};
use core::SegmentComponent;
use std::path::PathBuf;
use std::cmp::{Ordering, Ord};
use std::cmp::{Ord, Ordering};
#[cfg(test)]
use std::sync::atomic;
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
/// Uuid identifying a segment.
///
/// Tantivy's segments are identified
/// by a UUID which is used to prefix the filenames
/// of all of the files associated with the segment.
///
/// In unit tests, for reproducibility, `SegmentId`s are
/// simply generated in an autoincrement fashion.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentId(Uuid);
#[cfg(test)]
lazy_static! {
static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default();
static ref EMPTY_ARR: [u8; 8] = [0u8; 8];
}
// During tests, we generate the segment ids in an autoincrement manner
// for consistency of segment ids between runs.
//
// The order of the test execution is not guaranteed, but the order
// of segments within a single test is guaranteed.
#[cfg(test)]
fn create_uuid() -> Uuid {
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*EMPTY_ARR)
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*EMPTY_ARR).unwrap()
}
#[cfg(not(test))]
@@ -37,43 +39,34 @@ fn create_uuid() -> Uuid {
}
impl SegmentId {
#[doc(hidden)]
pub fn generate_random() -> SegmentId {
SegmentId(create_uuid())
}
pub fn short_uuid_string(&self,) -> String {
(&self.0.to_simple_string()[..8]).to_string()
}
pub fn uuid_string(&self,) -> String {
self.0.to_simple_string()
}
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let filename = self.uuid_string() + component.path_suffix();
PathBuf::from(filename)
}
}
impl Encodable for SegmentId {
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
self.0.encode(s)
/// Returns a shorter identifier of the segment.
///
/// We are using UUID4, so only 6 bits are fixed,
/// and the rest is random.
///
/// Picking the first 8 chars is enough to identify
/// segments in a display message.
pub fn short_uuid_string(&self) -> String {
(&self.0.simple().to_string()[..8]).to_string()
}
}
impl Decodable for SegmentId {
fn decode<D: Decoder>(d: &mut D) -> Result<Self, D::Error> {
Uuid::decode(d).map(SegmentId)
/// Returns a segment uuid string.
pub fn uuid_string(&self) -> String {
self.0.simple().to_string()
}
}
impl fmt::Debug for SegmentId {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "SegmentId({:?})", self.uuid_string())
write!(f, "Seg({:?})", self.short_uuid_string())
}
}
impl PartialOrd for SegmentId {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
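As a quick illustration of the renamed helpers (a sketch; the actual hex digits are random):

let segment_id = SegmentId::generate_random();
let long = segment_id.uuid_string();        // 32 hex chars
let short = segment_id.short_uuid_string(); // the first 8 of them
assert_eq!(&long[..8], short.as_str());
println!("{:?}", segment_id); // Debug now prints the short form: Seg("8a9f3d0c")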

115
src/core/segment_meta.rs Normal file
View File

@@ -0,0 +1,115 @@
use core::SegmentId;
use super::SegmentComponent;
use std::path::PathBuf;
use std::collections::HashSet;
#[derive(Clone, Debug, Serialize, Deserialize)]
struct DeleteMeta {
num_deleted_docs: u32,
opstamp: u64,
}
/// `SegmentMeta` contains simple meta information about a segment.
///
/// For instance the number of docs it contains,
/// how many are deleted, etc.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SegmentMeta {
segment_id: SegmentId,
max_doc: u32,
deletes: Option<DeleteMeta>,
}
impl SegmentMeta {
/// Creates a new segment meta for
/// a segment with no deletes and no documents.
pub fn new(segment_id: SegmentId) -> SegmentMeta {
SegmentMeta {
segment_id,
max_doc: 0,
deletes: None,
}
}
/// Returns the segment id.
pub fn id(&self) -> SegmentId {
self.segment_id
}
/// Returns the number of deleted documents.
pub fn num_deleted_docs(&self) -> u32 {
self.deletes
.as_ref()
.map(|delete_meta| delete_meta.num_deleted_docs)
.unwrap_or(0u32)
}
/// Returns the list of files that
/// are required for the segment meta.
///
/// This is useful as the way tantivy removes files
/// is by removing all files that have been created by tantivy
/// and are not used by any segment anymore.
pub fn list_files(&self) -> HashSet<PathBuf> {
SegmentComponent::iterator()
.map(|component| self.relative_path(*component))
.collect::<HashSet<PathBuf>>()
}
/// Returns the relative path of a component of our segment.
///
/// It just joins the segment id with the extension
/// associated to a segment component.
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
let mut path = self.id().uuid_string();
path.push_str(&*match component {
SegmentComponent::POSITIONS => ".pos".to_string(),
SegmentComponent::POSTINGS => ".idx".to_string(),
SegmentComponent::TERMS => ".term".to_string(),
SegmentComponent::STORE => ".store".to_string(),
SegmentComponent::FASTFIELDS => ".fast".to_string(),
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
});
PathBuf::from(path)
}
/// Return the highest doc id + 1
///
/// If there are no deletes, then num_docs = max_doc
/// and all the doc ids contained in this segment
/// are exactly (0..max_doc).
pub fn max_doc(&self) -> u32 {
self.max_doc
}
/// Return the number of documents in the segment.
pub fn num_docs(&self) -> u32 {
self.max_doc() - self.num_deleted_docs()
}
/// Returns the opstamp of the last delete operation
/// taken into account in this segment.
pub fn delete_opstamp(&self) -> Option<u64> {
self.deletes.as_ref().map(|delete_meta| delete_meta.opstamp)
}
/// Returns true iff the segment meta contains
/// delete information.
pub fn has_deletes(&self) -> bool {
self.deletes.is_some()
}
#[doc(hidden)]
pub fn set_max_doc(&mut self, max_doc: u32) {
self.max_doc = max_doc;
}
#[doc(hidden)]
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
self.deletes = Some(DeleteMeta {
num_deleted_docs,
opstamp,
});
}
}
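A sketch of the `SegmentMeta` lifecycle as exposed by this new file:

use std::collections::HashSet;
use std::path::PathBuf;

let mut meta = SegmentMeta::new(SegmentId::generate_random());
meta.set_max_doc(1_000);
assert_eq!(meta.num_docs(), 1_000);

meta.set_delete_meta(50, 42u64 /* opstamp */);
assert_eq!(meta.num_deleted_docs(), 50);
assert_eq!(meta.num_docs(), 950);
assert_eq!(meta.delete_opstamp(), Some(42));
assert!(meta.has_deletes());

// One relative path per component; the delete file embeds the opstamp,
// e.g. "<uuid>.42.del".
let _files: HashSet<PathBuf> = meta.list_files();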

View File

@@ -2,28 +2,30 @@ use Result;
use core::Segment;
use core::SegmentId;
use core::SegmentComponent;
use schema::Term;
use std::sync::RwLock;
use common::HasLen;
use core::SegmentMeta;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::DeleteBitSet;
use store::StoreReader;
use schema::Document;
use directory::ReadOnlySource;
use DocId;
use std::io;
use std::str;
use postings::TermInfo;
use datastruct::FstMap;
use std::sync::Arc;
use std::collections::HashMap;
use common::CompositeFile;
use std::fmt;
use rustc_serialize::json;
use core::SegmentInfo;
use core::InvertedIndexReader;
use schema::Field;
use postings::SegmentPostingsOption;
use postings::SegmentPostings;
use fastfield::{U32FastFieldsReader, U32FastFieldReader};
use schema::Schema;
use schema::FieldType;
use postings::FreqHandler;
use schema::TextIndexingOptions;
use error::Error;
use error::ErrorKind;
use termdict::TermDictionaryImpl;
use fastfield::FacetReader;
use fastfield::FastFieldReader;
use schema::Schema;
use termdict::TermDictionary;
use fastfield::{FastValue, MultiValueIntFastFieldReader};
use schema::Cardinality;
use fieldnorm::FieldNormReader;
/// Entry point to access all of the datastructures of the `Segment`
///
@@ -36,15 +38,23 @@ use error::Error;
/// The segment reader has a very low memory footprint,
/// as close to all of its data is mmapped.
///
/// TODO fix not decoding docfreq
#[derive(Clone)]
pub struct SegmentReader {
segment_info: SegmentInfo,
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
segment_id: SegmentId,
term_infos: FstMap<TermInfo>,
postings_data: ReadOnlySource,
segment_meta: SegmentMeta,
termdict_composite: CompositeFile,
postings_composite: CompositeFile,
positions_composite: CompositeFile,
fast_fields_composite: CompositeFile,
fieldnorms_composite: CompositeFile,
store_reader: StoreReader,
fast_fields_reader: U32FastFieldsReader,
fieldnorms_reader: U32FastFieldsReader,
positions_data: ReadOnlySource,
delete_bitset_opt: Option<DeleteBitSet>,
schema: Schema,
}
@@ -54,108 +64,239 @@ impl SegmentReader {
/// Today, `tantivy` does not handle deletes, so it happens
/// to also be the number of documents in the index.
pub fn max_doc(&self) -> DocId {
self.segment_info.max_doc
self.segment_meta.max_doc()
}
/// Returns the number of documents.
/// Deleted documents are not counted.
///
/// Today, `tantivy` does not handle deletes so max doc and
/// num_docs are the same.
pub fn num_docs(&self) -> DocId {
self.segment_info.max_doc
self.segment_meta.num_docs()
}
/// Return the number of documents that have been
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.delete_bitset()
.map(|delete_set| delete_set.len() as DocId)
.unwrap_or(0u32)
}
/// Returns true iff some of the documents of the segment have been deleted.
pub fn has_deletes(&self) -> bool {
self.delete_bitset().is_some()
}
/// Accessor to a segment's fast field reader given a field.
pub fn get_fast_field_reader(&self, field: Field) -> io::Result<U32FastFieldReader> {
///
/// Returns the u64 fast value reader if the field
/// is a u64 field indexed as "fast".
///
/// Returns a `FastFieldNotAvailableError` if the field is not
/// declared as a fast field in the schema.
///
/// # Panics
/// May panic if the index is corrupted.
pub fn fast_field_reader<Item: FastValue>(
&self,
field: Field,
) -> fastfield::Result<FastFieldReader<Item>> {
let field_entry = self.schema.get_field_entry(field);
match *field_entry.field_type() {
FieldType::Str(_) => {
Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields."))
},
FieldType::U32(_) => {
// TODO check that the schema allows that
//Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields."))
self.fast_fields_reader.get_field(field)
},
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
{
self.fast_fields_composite
.open_read(field)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)
} else {
Err(FastFieldNotAvailableError::new(field_entry))
}
}
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
/// May panic if the field is not a multivalued fastfield of the type `Item`.
pub fn multi_fast_field_reader<Item: FastValue>(
&self,
field: Field,
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
let field_entry = self.schema.get_field_entry(field);
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
{
let idx_reader = self.fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let vals_reader = self.fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
} else {
Err(FastFieldNotAvailableError::new(field_entry))
}
}
/// Accessor to the `FacetReader` associated to a given `Field`.
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
let field_entry = self.schema.get_field_entry(field);
if field_entry.field_type() != &FieldType::HierarchicalFacet {
return Err(ErrorKind::InvalidArgument(format!(
"The field {:?} is not a \
hierarchical facet.",
field_entry
)).into());
}
let term_ords_reader = self.multi_fast_field_reader(field)?;
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
ErrorKind::InvalidArgument(format!(
"The field \"{}\" is a hierarchical \
but this segment does not seem to have the field term \
dictionary.",
field_entry.name()
))
})?;
let termdict = TermDictionaryImpl::from_source(termdict_source);
let facet_reader = FacetReader::new(term_ords_reader, termdict);
Ok(facet_reader)
}
/// Accessor to the segment's `Field norms`'s reader.
///
/// Field norms are the length (in tokens) of the fields.
/// It is used in the computation of the [TfIdf]
/// (https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
///
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> io::Result<U32FastFieldReader> {
self.fieldnorms_reader.get_field(field)
}
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> u32 {
match self.get_term_info(term) {
Some(term_info) => term_info.doc_freq,
None => 0,
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
if let Some(fieldnorm_source) = self.fieldnorms_composite
.open_read(field) {
FieldNormReader::open(fieldnorm_source)
} else {
let field_name = self.schema.get_field_name(field);
let err_msg = format!("Field norm not found for field {:?}. Was it marked as indexed during indexing?", field_name);
panic!(err_msg);
}
}
}
/// Accessor to the segment's `StoreReader`.
pub fn get_store_reader(&self) -> &StoreReader {
&self.store_reader
}
/// Open a new segment for reading.
pub fn open(segment: Segment) -> Result<SegmentReader> {
let segment_info_reader = try!(segment.open_read(SegmentComponent::INFO));
let segment_info_data = try!(
str::from_utf8(&*segment_info_reader)
.map_err(|err| {
let segment_info_filepath = segment.relative_path(SegmentComponent::INFO);
Error::CorruptedFile(segment_info_filepath, Box::new(err))
})
);
let segment_info: SegmentInfo = try!(
json::decode(&segment_info_data)
.map_err(|err| {
let file_path = segment.relative_path(SegmentComponent::INFO);
Error::CorruptedFile(file_path, Box::new(err))
})
);
let source = try!(segment.open_read(SegmentComponent::TERMS));
let term_infos = try!(FstMap::from_source(source));
let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE)));
let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS));
let fast_field_data = try!(segment.open_read(SegmentComponent::FASTFIELDS));
let fast_fields_reader = try!(U32FastFieldsReader::open(fast_field_data));
let fieldnorms_data = try!(segment.open_read(SegmentComponent::FIELDNORMS));
let fieldnorms_reader = try!(U32FastFieldsReader::open(fieldnorms_data));
let positions_data = segment
.open_read(SegmentComponent::POSITIONS)
.unwrap_or_else(|_| ReadOnlySource::empty());
pub fn open(segment: &Segment) -> Result<SegmentReader> {
let termdict_source = segment.open_read(SegmentComponent::TERMS)?;
let termdict_composite = CompositeFile::open(&termdict_source)?;
let store_source = segment.open_read(SegmentComponent::STORE)?;
let store_reader = StoreReader::from_source(store_source);
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
let postings_composite = CompositeFile::open(&postings_source)?;
let positions_composite = {
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
CompositeFile::open(&source)?
} else {
CompositeFile::empty()
}
};
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
let delete_bitset_opt =
if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
Some(DeleteBitSet::open(delete_data))
} else {
None
};
let schema = segment.schema();
Ok(SegmentReader {
segment_info: segment_info,
postings_data: postings_shared_mmap,
term_infos: term_infos,
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
segment_meta: segment.meta().clone(),
termdict_composite,
postings_composite,
fast_fields_composite,
fieldnorms_composite,
segment_id: segment.id(),
store_reader: store_reader,
fast_fields_reader: fast_fields_reader,
fieldnorms_reader: fieldnorms_reader,
positions_data: positions_data,
schema: schema,
store_reader,
delete_bitset_opt,
positions_composite,
schema,
})
}
/// Return the term dictionary datastructure.
pub fn term_infos(&self) -> &FstMap<TermInfo> {
&self.term_infos
/// Returns a field reader associated with the field given as argument.
/// If the field was not present in the index at indexing time,
/// the InvertedIndexReader is empty.
///
/// The field reader is in charge of iterating through the
/// term dictionary associated to a specific field,
/// and opening the posting list associated to any term.
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
if let Some(inv_idx_reader) = self.inv_idx_reader_cache
.read()
.expect("Lock poisoned. This should never happen")
.get(&field)
{
return Arc::clone(inv_idx_reader);
}
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let record_option_opt = field_type.get_index_record_option();
if record_option_opt.is_none() {
panic!("Field {:?} does not seem indexed.", field_entry.name());
}
let record_option = record_option_opt.unwrap();
let postings_source_opt = self.postings_composite.open_read(field);
if postings_source_opt.is_none() {
// no documents in the segment contained this field.
// As a result, no data is associated to the inverted index.
//
// Returns an empty inverted index.
return Arc::new(InvertedIndexReader::empty(field_type.clone()));
}
let postings_source = postings_source_opt.unwrap();
let termdict_source = self.termdict_composite
.open_read(field)
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
let positions_source = self.positions_composite
.open_read(field)
.expect("Index corrupted. Failed to open field positions in composite file.");
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
TermDictionaryImpl::from_source(termdict_source),
postings_source,
positions_source,
record_option,
));
// by releasing the lock in between, we may end up opening the inverted index
// twice, but this is fine.
self.inv_idx_reader_cache
.write()
.expect("Field reader cache lock poisoned. This should never happen.")
.insert(field, Arc::clone(&inv_idx_reader));
inv_idx_reader
}
/// Returns the document (or to be accurate, its stored field)
/// bearing the given doc id.
/// This method is slow and should seldom be called from
@@ -164,82 +305,26 @@ impl SegmentReader {
self.store_reader.get(doc_id)
}
/// Returns the segment id
pub fn segment_id(&self) -> SegmentId {
self.segment_id
}
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with the indexing options that cover
/// the requested options, the method does not fail
/// and returns a `SegmentPostings` with as much information as possible.
///
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
/// that does not index positions will return a `SegmentPostings` with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(&term));
let offset = term_info.postings_offset as usize;
let postings_data = &self.postings_data[offset..];
let freq_handler = match *field_entry.field_type() {
FieldType::Str(ref options) => {
let indexing_options = options.get_indexing_options();
match option {
SegmentPostingsOption::NoFreq => {
FreqHandler::new_without_freq()
}
SegmentPostingsOption::Freq => {
if indexing_options.is_termfreq_enabled() {
FreqHandler::new_with_freq()
}
else {
FreqHandler::new_without_freq()
}
}
SegmentPostingsOption::FreqAndPositions => {
if indexing_options == TextIndexingOptions::TokenizedWithFreqAndPosition {
let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..];
FreqHandler::new_with_freq_and_position(offseted_position_data)
}
else if indexing_options.is_termfreq_enabled()
{
FreqHandler::new_with_freq()
}
else {
FreqHandler::new_without_freq()
}
}
}
}
_ => {
FreqHandler::new_without_freq()
}
};
Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, freq_handler))
/// Returns the bitset representing
/// the documents that have been deleted.
pub fn delete_bitset(&self) -> Option<&DeleteBitSet> {
self.delete_bitset_opt.as_ref()
}
/// Returns the posting list associated with a term.
pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
let field_entry = self.schema.get_field_entry(term.field());
let segment_posting_option = match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
match text_options.get_indexing_options() {
TextIndexingOptions::TokenizedWithFreq => SegmentPostingsOption::Freq,
TextIndexingOptions::TokenizedWithFreqAndPosition => SegmentPostingsOption::FreqAndPositions,
_ => SegmentPostingsOption::NoFreq,
}
}
FieldType::U32(_) => SegmentPostingsOption::NoFreq
};
self.read_postings(term, segment_posting_option)
}
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
self.term_infos.get(term.as_slice())
/// Returns true iff the `doc` is marked
/// as deleted.
pub fn is_deleted(&self, doc: DocId) -> bool {
self.delete_bitset()
.map(|delete_set| delete_set.is_deleted(doc))
.unwrap_or(false)
}
}
impl fmt::Debug for SegmentReader {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "SegmentReader({:?})", self.segment_id)

View File

@@ -1,149 +0,0 @@
#![allow(should_implement_trait)]
use std::io;
use std::io::Write;
use fst;
use fst::raw::Fst;
use fst::Streamer;
use directory::ReadOnlySource;
use common::BinarySerializable;
use std::marker::PhantomData;
fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
fst_builder: fst::MapBuilder<W>,
data: Vec<u8>,
_phantom_: PhantomData<V>,
}
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error));
Ok(FstMapBuilder {
fst_builder: fst_builder,
data: Vec::new(),
_phantom_: PhantomData,
})
}
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()>{
try!(self.fst_builder
.insert(key, self.data.len() as u64)
.map_err(convert_fst_error));
try!(value.serialize(&mut self.data));
Ok(())
}
pub fn finish(self,) -> io::Result<W> {
let mut file = try!(
self.fst_builder
.into_inner()
.map_err(convert_fst_error));
let footer_size = self.data.len() as u32;
try!(file.write_all(&self.data));
try!((footer_size as u32).serialize(&mut file));
try!(file.flush());
Ok(file)
}
}
pub struct FstMap<V: BinarySerializable> {
fst_index: fst::Map,
values_mmap: ReadOnlySource,
_phantom_: PhantomData<V>,
}
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
Ok(fst::Map::from(match source {
ReadOnlySource::Anonymous(data) => try!(Fst::from_shared_bytes(data.data, data.start, data.len).map_err(convert_fst_error)),
ReadOnlySource::Mmap(mmap_readonly) => try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)),
}))
}
pub struct FstKeyIter<'a, V: 'static + BinarySerializable> {
streamer: fst::map::Stream<'a>,
__phantom__: PhantomData<V>
}
impl<'a, V: 'static + BinarySerializable> FstKeyIter<'a, V> {
pub fn next(&mut self) -> Option<(&[u8])> {
self.streamer
.next()
.map(|(k, _)| k)
}
}
impl<V: BinarySerializable> FstMap<V> {
pub fn keys(&self,) -> FstKeyIter<V> {
FstKeyIter {
streamer: self.fst_index.stream(),
__phantom__: PhantomData,
}
}
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
let total_len = source.len();
let length_offset = total_len - 4;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
let footer_size = try!(u32::deserialize(&mut split_len_buffer)) as usize;
let split_len = length_offset - footer_size;
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
let fst_index = try!(open_fst_index(fst_source));
Ok(FstMap {
fst_index: fst_index,
values_mmap: values_source,
_phantom_: PhantomData,
})
}
fn read_value(&self, offset: u64) -> V {
let buffer = self.values_mmap.as_slice();
let mut cursor = &buffer[(offset as usize)..];
V::deserialize(&mut cursor).expect("Data in FST is corrupted")
}
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<V> {
self.fst_index
.get(key)
.map(|offset| self.read_value(offset))
}
}
#[cfg(test)]
mod tests {
use super::*;
use directory::{RAMDirectory, Directory};
use std::path::PathBuf;
#[test]
fn test_fstmap() {
let mut directory = RAMDirectory::create();
let path = PathBuf::from("fstmap");
{
let write = directory.open_write(&path).unwrap();
let mut fstmap_builder = FstMapBuilder::new(write).unwrap();
fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap();
fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap();
fstmap_builder.finish().unwrap();
}
let source = directory.open_read(&path).unwrap();
let fstmap: FstMap<u32> = FstMap::from_source(source).unwrap();
assert_eq!(fstmap.get("abc"), Some(34u32));
assert_eq!(fstmap.get("abcd"), Some(346u32));
let mut keys = fstmap.keys();
assert_eq!(keys.next().unwrap(), "abc".as_bytes());
assert_eq!(keys.next().unwrap(), "abcd".as_bytes());
assert_eq!(keys.next(), None);
}
}
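The deleted `FstMap` laid its data out as `[fst][serialized values][footer_size: u32]`. Below is a standalone sketch of the split performed by `from_source`; the little-endian footer encoding is an assumption here, as the diff does not show `BinarySerializable`'s byte order:

use std::convert::TryInto;

/// Splits an FstMap buffer into its (fst, values) parts.
fn split_fstmap(source: &[u8]) -> (&[u8], &[u8]) {
    let length_offset = source.len() - 4; // trailing u32 = footer size
    let footer_size =
        u32::from_le_bytes(source[length_offset..].try_into().unwrap()) as usize;
    let split = length_offset - footer_size;
    (&source[..split], &source[split..length_offset])
}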

View File

@@ -1,8 +1,4 @@
mod fstmap;
mod skip;
pub mod stacker;
pub use self::fstmap::FstMapBuilder;
pub use self::fstmap::FstMap;
pub use self::fstmap::FstKeyIter;
pub use self::skip::{SkipListBuilder, SkipList};
pub use self::skip::{SkipList, SkipListBuilder};

View File

@@ -6,17 +6,15 @@ mod skiplist;
pub use self::skiplist_builder::SkipListBuilder;
pub use self::skiplist::SkipList;
#[cfg(test)]
mod tests {
use super::*;
use super::{SkipList, SkipListBuilder};
#[test]
fn test_skiplist() {
let mut output: Vec<u8> = Vec::new();
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(8);
skip_list_builder.insert(2, &3).unwrap();
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
let mut skip_list: SkipList<u32> = SkipList::from(output.as_slice());
@@ -26,7 +24,7 @@ mod tests {
#[test]
fn test_skiplist2() {
let mut output: Vec<u8> = Vec::new();
let skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
let skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(8);
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
let mut skip_list: SkipList<u32> = SkipList::from(output.as_slice());
assert_eq!(skip_list.next(), None);
@@ -73,7 +71,7 @@ mod tests {
#[test]
fn test_skiplist5() {
let mut output: Vec<u8> = Vec::new();
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
skip_list_builder.insert(2, &()).unwrap();
skip_list_builder.insert(3, &()).unwrap();
skip_list_builder.insert(5, &()).unwrap();
@@ -105,7 +103,7 @@ mod tests {
#[test]
fn test_skiplist7() {
let mut output: Vec<u8> = Vec::new();
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
for i in 0..1000 {
skip_list_builder.insert(i, &()).unwrap();
}
@@ -114,44 +112,57 @@ mod tests {
let mut skip_list: SkipList<()> = SkipList::from(output.as_slice());
assert_eq!(skip_list.next().unwrap(), (0, ()));
skip_list.seek(431);
assert_eq!(skip_list.next().unwrap(), (431,()) );
assert_eq!(skip_list.next().unwrap(), (431, ()));
skip_list.seek(1003);
assert_eq!(skip_list.next().unwrap(), (1004,()) );
assert_eq!(skip_list.next().unwrap(), (1004, ()));
assert_eq!(skip_list.next(), None);
}
#[test]
fn test_skiplist8() {
let mut output: Vec<u8> = Vec::new();
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
let mut skip_list_builder: SkipListBuilder<u64> = SkipListBuilder::new(8);
skip_list_builder.insert(2, &3).unwrap();
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
assert_eq!(output.len(), 13);
assert_eq!(output.len(), 11);
assert_eq!(output[0], 1u8 + 128u8);
}
#[test]
fn test_skiplist9() {
let mut output: Vec<u8> = Vec::new();
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(3);
for i in 0..9 {
let mut skip_list_builder: SkipListBuilder<u64> = SkipListBuilder::new(4);
for i in 0..4 * 4 * 4 {
skip_list_builder.insert(i, &i).unwrap();
}
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
assert_eq!(output.len(), 117);
assert_eq!(output[0], 3u8 + 128u8);
assert_eq!(output.len(), 774);
assert_eq!(output[0], 4u8 + 128u8);
}
#[test]
fn test_skiplist10() {
// checking that void gets serialized to nothing.
let mut output: Vec<u8> = Vec::new();
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
for i in 0..9 {
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
for i in 0..((4 * 4 * 4) - 1) {
skip_list_builder.insert(i, &()).unwrap();
}
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
assert_eq!(output.len(), 81);
assert_eq!(output.len(), 230);
assert_eq!(output[0], 128u8 + 3u8);
}
#[test]
fn test_skiplist11() {
// checking that void gets serialized to nothing.
let mut output: Vec<u8> = Vec::new();
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
for i in 0..(4 * 4) {
skip_list_builder.insert(i, &()).unwrap();
}
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
assert_eq!(output.len(), 65);
assert_eq!(output[0], 128u8 + 3u8);
}

View File

@@ -1,6 +1,5 @@
use common::BinarySerializable;
use common::{BinarySerializable, VInt};
use std::marker::PhantomData;
use DocId;
use std::cmp::max;
static EMPTY: [u8; 0] = [];
@@ -8,130 +7,127 @@ static EMPTY: [u8; 0] = [];
struct Layer<'a, T> {
data: &'a [u8],
cursor: &'a [u8],
next_id: DocId,
next_id: Option<u64>,
_phantom_: PhantomData<T>,
}
impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
type Item = (u64, T);
type Item = (DocId, T);
fn next(&mut self,)-> Option<(DocId, T)> {
if self.next_id == u32::max_value() {
None
}
else {
fn next(&mut self) -> Option<(u64, T)> {
if let Some(cur_id) = self.next_id {
let cur_val = T::deserialize(&mut self.cursor).unwrap();
let cur_id = self.next_id;
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
self.next_id = VInt::deserialize_u64(&mut self.cursor).ok();
Some((cur_id, cur_val))
} else {
None
}
}
}
impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> {
fn from(data: &'a [u8]) -> Layer<'a, T> {
let mut cursor = data;
let next_id = u32::deserialize(&mut cursor).unwrap_or(u32::max_value());
let mut cursor = data;
let next_id = VInt::deserialize_u64(&mut cursor).ok();
Layer {
data: data,
cursor: cursor,
next_id: next_id,
data,
cursor,
next_id,
_phantom_: PhantomData,
}
}
}
impl<'a, T: BinarySerializable> Layer<'a, T> {
fn empty() -> Layer<'a, T> {
Layer {
data: &EMPTY,
cursor: &EMPTY,
next_id: DocId::max_value(),
next_id: None,
_phantom_: PhantomData,
}
}
fn seek_offset(&mut self, offset: usize) {
self.cursor = &self.data[offset..];
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
self.next_id = VInt::deserialize_u64(&mut self.cursor).ok();
}
// Returns the last element (key, val)
// such that (key < doc_id)
//
// If there is no such element anymore,
// returns None.
fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
let mut val = None;
while self.next_id < doc_id {
match self.next() {
None => { break; },
v => { val = v; }
//
// If the element exists, it will be returned
// at the next call to `.next()`.
fn seek(&mut self, key: u64) -> Option<(u64, T)> {
let mut result: Option<(u64, T)> = None;
loop {
if let Some(next_id) = self.next_id {
if next_id < key {
if let Some(v) = self.next() {
result = Some(v);
continue;
}
}
}
return result;
}
val
}
}
pub struct SkipList<'a, T: BinarySerializable> {
data_layer: Layer<'a, T>,
skip_layers: Vec<Layer<'a, u32>>,
skip_layers: Vec<Layer<'a, u64>>,
}
impl<'a, T: BinarySerializable> Iterator for SkipList<'a, T> {
type Item = (u64, T);
type Item = (DocId, T);
fn next(&mut self,)-> Option<(DocId, T)> {
fn next(&mut self) -> Option<(u64, T)> {
self.data_layer.next()
}
}
impl<'a, T: BinarySerializable> SkipList<'a, T> {
pub fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
let mut next_layer_skip: Option<(DocId, u32)> = None;
pub fn seek(&mut self, key: u64) -> Option<(u64, T)> {
let mut next_layer_skip: Option<(u64, u64)> = None;
for skip_layer in &mut self.skip_layers {
if let Some((_, offset)) = next_layer_skip {
skip_layer.seek_offset(offset as usize);
}
next_layer_skip = skip_layer.seek(doc_id);
}
if let Some((_, offset)) = next_layer_skip {
self.data_layer.seek_offset(offset as usize);
}
self.data_layer.seek(doc_id)
next_layer_skip = skip_layer.seek(key);
}
if let Some((_, offset)) = next_layer_skip {
self.data_layer.seek_offset(offset as usize);
}
self.data_layer.seek(key)
}
}
impl<'a, T: BinarySerializable> From<&'a [u8]> for SkipList<'a, T> {
fn from(mut data: &'a [u8]) -> SkipList<'a, T> {
let offsets: Vec<u32> = Vec::deserialize(&mut data).unwrap();
let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
.unwrap()
.into_iter()
.map(|el| el.0)
.collect();
let num_layers = offsets.len();
let layers_data: &[u8] = data;
let data_layer: Layer<'a, T> =
if num_layers == 0 { Layer::empty() }
else {
let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize];
Layer::from(first_layer_data)
};
let data_layer: Layer<'a, T> = if num_layers == 0 {
Layer::empty()
} else {
let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize];
Layer::from(first_layer_data)
};
let skip_layers = (0..max(1, num_layers) - 1)
.map(|i| (offsets[i] as usize, offsets[i + 1] as usize))
.map(|(start, stop)| {
Layer::from(&layers_data[start..stop])
})
.map(|(start, stop)| Layer::from(&layers_data[start..stop]))
.collect();
SkipList {
skip_layers: skip_layers,
data_layer: data_layer,
skip_layers,
data_layer,
}
}
}

View File

@@ -1,71 +1,65 @@
use std::io::Write;
use common::BinarySerializable;
use common::{BinarySerializable, VInt, is_power_of_2};
use std::marker::PhantomData;
use DocId;
use std::io;
struct LayerBuilder<T: BinarySerializable> {
period: usize,
period_mask: usize,
buffer: Vec<u8>,
remaining: usize,
len: usize,
_phantom_: PhantomData<T>,
}
impl<T: BinarySerializable> LayerBuilder<T> {
fn written_size(&self,) -> usize {
fn written_size(&self) -> usize {
self.buffer.len()
}
fn write(&self, output: &mut Write) -> Result<(), io::Error> {
try!(output.write_all(&self.buffer));
output.write_all(&self.buffer)?;
Ok(())
}
fn with_period(period: usize) -> LayerBuilder<T> {
assert!(is_power_of_2(period), "The period has to be a power of 2.");
LayerBuilder {
period: period,
period_mask: (period - 1),
buffer: Vec::new(),
remaining: period,
len: 0,
_phantom_: PhantomData,
}
}
fn insert(&mut self, doc_id: DocId, value: &T) -> io::Result<Option<(DocId, u32)>> {
self.remaining -= 1;
fn insert(&mut self, key: u64, value: &T) -> io::Result<Option<(u64, u64)>> {
self.len += 1;
let offset = self.written_size() as u32; // TODO not sure if we want after or here
try!(doc_id.serialize(&mut self.buffer));
try!(value.serialize(&mut self.buffer));
Ok(if self.remaining == 0 {
self.remaining = self.period;
Some((doc_id, offset))
}
else { None })
let offset = self.written_size() as u64;
VInt(key).serialize(&mut self.buffer)?;
value.serialize(&mut self.buffer)?;
let emit_skip_info = (self.period_mask & self.len) == 0;
if emit_skip_info {
Ok(Some((key, offset)))
} else {
Ok(None)
}
}
}
pub struct SkipListBuilder<T: BinarySerializable> {
period: usize,
data_layer: LayerBuilder<T>,
skip_layers: Vec<LayerBuilder<u32>>,
skip_layers: Vec<LayerBuilder<u64>>,
}
impl<T: BinarySerializable> SkipListBuilder<T> {
pub fn new(period: usize) -> SkipListBuilder<T> {
SkipListBuilder {
period: period,
period,
data_layer: LayerBuilder::with_period(period),
skip_layers: Vec::new(),
}
}
fn get_skip_layer(&mut self, layer_id: usize) -> &mut LayerBuilder<u32> {
fn get_skip_layer(&mut self, layer_id: usize) -> &mut LayerBuilder<u64> {
if layer_id == self.skip_layers.len() {
let layer_builder = LayerBuilder::with_period(self.period);
self.skip_layers.push(layer_builder);
@@ -73,34 +67,32 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
&mut self.skip_layers[layer_id]
}
pub fn insert(&mut self, doc_id: DocId, dest: &T) -> io::Result<()> {
pub fn insert(&mut self, key: u64, dest: &T) -> io::Result<()> {
let mut layer_id = 0;
let mut skip_pointer = try!(self.data_layer.insert(doc_id, dest));
let mut skip_pointer = self.data_layer.insert(key, dest)?;
loop {
skip_pointer = match skip_pointer {
Some((skip_doc_id, skip_offset)) =>
try!(self
.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset)),
None => { return Ok(()); }
Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id)
.insert(skip_doc_id, &skip_offset)?,
None => {
return Ok(());
}
};
layer_id += 1;
}
}
pub fn write<W: Write>(self, output: &mut Write) -> io::Result<()> {
let mut size: u32 = 0;
let mut layer_sizes: Vec<u32> = Vec::new();
size += self.data_layer.buffer.len() as u32;
layer_sizes.push(size);
pub fn write<W: Write>(self, output: &mut W) -> io::Result<()> {
let mut size: u64 = self.data_layer.buffer.len() as u64;
let mut layer_sizes = vec![VInt(size)];
for layer in self.skip_layers.iter().rev() {
size += layer.buffer.len() as u32;
layer_sizes.push(size);
size += layer.buffer.len() as u64;
layer_sizes.push(VInt(size));
}
try!(layer_sizes.serialize(output));
try!(self.data_layer.write(output));
layer_sizes.serialize(output)?;
self.data_layer.write(output)?;
for layer in self.skip_layers.iter().rev() {
try!(layer.write(output));
layer.write(output)?;
}
Ok(())
}
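Putting the builder and the reader together, a round-trip sketch in the style of the unit tests above (note that the period must now be a power of two, per the assertion in `with_period`):

let mut output: Vec<u8> = Vec::new();
let mut builder: SkipListBuilder<u32> = SkipListBuilder::new(8);
for i in 0..100u64 {
    builder.insert(i, &(i as u32 * 2)).unwrap();
}
builder.write::<Vec<u8>>(&mut output).unwrap();

let mut skip_list: SkipList<u32> = SkipList::from(output.as_slice());
// seek() stops on the last element strictly before the key...
assert_eq!(skip_list.seek(42), Some((41, 82)));
// ...so the sought key comes out of the next call to next().
assert_eq!(skip_list.next(), Some((42, 84)));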

View File

@@ -1,7 +1,6 @@
use std::mem;
use super::heap::{Heap, HeapAllocable};
#[inline]
pub fn is_power_of_2(val: u32) -> bool {
val & (val - 1) == 0
@@ -9,11 +8,10 @@ pub fn is_power_of_2(val: u32) -> bool {
#[inline]
pub fn jump_needed(val: u32) -> bool {
val > 3 && is_power_of_2(val)
}
#[derive(Debug)]
#[derive(Debug, Clone)]
pub struct ExpUnrolledLinkedList {
len: u32,
end: u32,
@@ -24,10 +22,9 @@ pub struct ExpUnrolledLinkedList {
}
impl ExpUnrolledLinkedList {
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
ExpUnrolledLinkedListIterator {
heap: heap,
heap,
addr: addr + 2u32 * (mem::size_of::<u32>() as u32),
len: self.len,
consumed: 0,
@@ -42,16 +39,21 @@ impl ExpUnrolledLinkedList {
// the next block as a size of (length so far),
// and we need to add 1u32 to store the pointer
// to the next element.
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
let new_block_addr: u32 = heap.allocate_space(new_block_size);
heap.set(self.end, &new_block_addr);
self.end = new_block_addr;
}
heap.set(self.end, &val);
self.end += mem::size_of::<u32>() as u32;
}
}
impl HeapAllocable for u32 {
fn with_addr(_addr: u32) -> u32 {
0u32
}
}
impl HeapAllocable for ExpUnrolledLinkedList {
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
@@ -77,33 +79,26 @@ pub struct ExpUnrolledLinkedListIterator<'a> {
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
type Item = u32;
fn next(&mut self,) -> Option<u32> {
fn next(&mut self) -> Option<u32> {
if self.consumed == self.len {
None
}
else {
} else {
let addr: u32;
self.consumed += 1;
if jump_needed(self.consumed) {
addr = *self.heap.get_mut_ref(self.addr);
}
else {
} else {
addr = self.addr;
}
self.addr = addr + mem::size_of::<u32>() as u32;
Some(*self.heap.get_mut_ref(addr))
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use super::super::heap::Heap;
use test::Bencher;
@@ -147,7 +142,7 @@ mod tests {
#[bench]
fn bench_push_stack(bench: &mut Bencher) {
let heap = Heap::with_capacity(64_000_000);
bench.iter(|| {
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
@@ -163,4 +158,4 @@ mod tests {
heap.clear();
});
}
}
}
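`jump_needed` encodes the unrolled list's exponential growth schedule: a new, larger block is chained in each time the element count crosses a power of two above 3. A standalone check of that schedule:

fn is_power_of_2(val: u32) -> bool {
    val & (val - 1) == 0
}

fn jump_needed(val: u32) -> bool {
    val > 3 && is_power_of_2(val)
}

fn main() {
    // A jump (fresh block) happens when pushing elements 4, 8, 16, 32, ...
    let jumps: Vec<u32> = (1..=64).filter(|&i| jump_needed(i)).collect();
    assert_eq!(jumps, vec![4, 8, 16, 32, 64]);
}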

View File

@@ -1,26 +1,77 @@
use std::iter;
use std::marker::PhantomData;
use super::heap::{Heap, HeapAllocable, BytesRef};
use std::mem;
use postings::UnorderedTermId;
use super::heap::{BytesRef, Heap, HeapAllocable};
mod murmurhash2 {
const SEED: u32 = 3_242_157_231u32;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let m: u32 = 0x5bd1_e995;
let r = 24;
let len = key.len() as u32;
/// djb2 hash function
fn djb2(key: &[u8]) -> u64 {
let mut state: u64 = 5381;
for &b in key {
state = (state << 5).wrapping_add(state).wrapping_add(b as u64);
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { *key_ptr };
k = k.wrapping_mul(m);
k ^= k >> r;
k = k.wrapping_mul(m);
k = k.wrapping_mul(m);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining = len & 3;
let key_ptr_u8: *const u8 = key_ptr as *const u8;
match remaining {
3 => {
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(2)) } << 16;
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
h ^= unsafe { u32::from(*key_ptr_u8) };
h = h.wrapping_mul(m);
}
2 => {
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
h ^= unsafe { u32::from(*key_ptr_u8) };
h = h.wrapping_mul(m);
}
1 => {
h ^= unsafe { u32::from(*key_ptr_u8) };
h = h.wrapping_mul(m);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(m);
h ^ (h >> 15)
}
state
}
impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef {
start: 0u32,
stop: 0u32,
}
}
/// Split the thread memory budget into
/// - the heap size
/// - the hash table "table" itself.
///
/// Returns (the heap size in bytes, the hash table size in number of bits)
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
let table_size_limit: usize = per_thread_memory_budget / 3;
let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
let table_num_bits: usize = (1..)
.into_iter()
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.last()
.expect(&format!(
"Per thread memory is too small: {}",
per_thread_memory_budget
));
let table_size = compute_table_size(table_num_bits);
let heap_size = per_thread_memory_budget - table_size;
(heap_size, table_num_bits)
}
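To make the arithmetic concrete, here is a standalone re-derivation of `split_memory`, assuming `KeyValue` occupies 8 bytes (a 4-byte `BytesRef` address plus a 4-byte hash; the size is inferred from the unit test further down, not stated in the diff):

fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
    let table_size_limit = per_thread_memory_budget / 3;
    let compute_table_size = |num_bits: usize| (1usize << num_bits) * 8;
    let table_num_bits = (1usize..)
        .take_while(|&num_bits| compute_table_size(num_bits) < table_size_limit)
        .last()
        .expect("budget too small");
    let table_size = compute_table_size(table_num_bits);
    (per_thread_memory_budget - table_size, table_num_bits)
}

fn main() {
    // 2^12 buckets * 8 bytes = 32_768 bytes of table;
    // 100_000 - 32_768 = 67_232 bytes left for the heap.
    assert_eq!(split_memory(100_000), (67_232, 12));
}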
/// `KeyValue` is the item stored in the hash table.
@@ -29,27 +80,21 @@ impl Default for BytesRef {
///
/// The key and the value are actually stored contiguously.
/// For this reason, the (start, stop) information is actually redundant
/// and can be simplified in the future.
#[derive(Copy, Clone, Default)]
struct KeyValue {
key: BytesRef,
value_addr: u32,
key_value_addr: BytesRef,
hash: u32,
}
impl KeyValue {
fn is_empty(&self,) -> bool {
self.key.stop == 0u32
fn is_empty(&self) -> bool {
self.key_value_addr.is_null()
}
}
pub enum Entry {
Vacant(usize),
Occupied(u32),
}
/// Customized `HashMap` with string keys
///
/// This `HashMap` takes strings as keys. Keys are
/// stored in a user-defined heap.
///
@@ -57,114 +102,115 @@ pub enum Entry {
/// the computation of the hash of the key twice,
/// or copying the key as long as there is no insert.
///
pub struct HashMap<'a, V> where V: HeapAllocable {
pub struct TermHashMap<'a> {
table: Box<[KeyValue]>,
heap: &'a Heap,
_phantom: PhantomData<V>,
mask: usize,
occupied: Vec<usize>,
}
impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
struct QuadraticProbing {
hash: usize,
i: usize,
mask: usize,
}
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> {
impl QuadraticProbing {
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
QuadraticProbing {
hash,
i: 0,
mask,
}
}
#[inline]
fn next_probe(&mut self) -> usize {
self.i += 1;
(self.hash + self.i * self.i) & self.mask
}
}
impl<'a> TermHashMap<'a> {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
let table_size = 1 << num_bucket_power_of_2;
let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
.take(table_size)
.collect();
HashMap {
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
TermHashMap {
table: table.into_boxed_slice(),
heap: heap,
_phantom: PhantomData,
heap,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
}
}
#[inline]
fn bucket(&self, key: &[u8]) -> usize {
let hash: u64 = djb2(key);
(hash as usize) & self.mask
fn probe(&self, hash: u32) -> QuadraticProbing {
QuadraticProbing::compute(hash as usize, self.mask)
}
fn get_key(&self, bytes_ref: BytesRef) -> &[u8] {
self.heap.get_slice(bytes_ref)
pub fn is_saturated(&self) -> bool {
self.table.len() < self.occupied.len() * 3
}
pub fn set_bucket(&mut self, key_bytes: &[u8], bucket: usize, addr: u32) -> u32 {
#[inline(never)]
fn get_key_value(&self, bytes_ref: BytesRef) -> (&[u8], u32) {
let key_bytes: &[u8] = self.heap.get_slice(bytes_ref);
let expull_addr: u32 = bytes_ref.addr() + 2 + key_bytes.len() as u32;
(key_bytes, expull_addr)
}
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key: self.heap.allocate_and_set(key_bytes),
value_addr: addr,
key_value_addr, hash
};
addr
}
pub fn iter<'b: 'a>(&'b self,) -> impl Iterator<Item=(&'a [u8], (u32, &'a V))> + 'b {
let heap: &'a Heap = self.heap;
let table: &'b [KeyValue] = &self.table;
self.occupied
.iter()
.cloned()
.map(move |bucket: usize| {
let kv = table[bucket];
let addr = kv.value_addr;
let v: &V = heap.get_mut_ref::<V>(addr);
(heap.get_slice(kv.key), (addr, v))
})
// .map(move |addr: u32| (heap.get_mut_ref::<V>(addr)) )
}
pub fn values_mut<'b: 'a>(&'b self,) -> impl Iterator<Item=&'a mut V> + 'b {
let heap: &'a Heap = self.heap;
let table: &'b [KeyValue] = &self.table;
self.occupied
.iter()
.cloned()
.map(move |bucket: usize| table[bucket].value_addr)
.map(move |addr: u32| heap.get_mut_ref::<V>(addr))
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32, UnorderedTermId)> + 'b {
self.occupied.iter().cloned().map(move |bucket: usize| {
let kv = self.table[bucket];
let (key, offset) = self.get_key_value(kv.key_value_addr);
(key, offset, bucket as UnorderedTermId)
})
}
pub fn get_or_create<S: AsRef<[u8]>>(&mut self, key: S) -> &mut V {
let entry = self.lookup(key.as_ref());
match entry {
Entry::Occupied(addr) => {
self.heap.get_mut_ref(addr)
}
Entry::Vacant(bucket) => {
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
self.set_bucket(key.as_ref(), bucket, addr);
val
}
}
}
pub fn lookup<S: AsRef<[u8]>>(&self, key: S) -> Entry {
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
&mut self,
key: S,
) -> (UnorderedTermId, &mut V) {
let key_bytes: &[u8] = key.as_ref();
let mut bucket = self.bucket(key_bytes);
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
return Entry::Vacant(bucket);
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
self.set_bucket(hash, key_bytes_ref, bucket);
return (bucket as UnorderedTermId, val);
} else if kv.hash == hash {
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
if stored_key == key_bytes {
return (
bucket as UnorderedTermId,
self.heap.get_mut_ref(expull_addr),
);
}
}
if self.get_key(kv.key) == key_bytes {
return Entry::Occupied(kv.value_addr);
}
bucket = (bucket + 1) & self.mask;
}
}
}
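Layout note, as an aid to reading the code above: `get_or_create` first writes the key through `allocate_and_set` (a 2-byte native-endian length prefix followed by the key bytes) and immediately allocates the value with `allocate_object`, so for a 3-byte key stored at address `a` the value starts at `a + 5`. This is exactly the invariant checked by the `assert_eq!` on addresses, and it is recomputed by `get_key_value` as `bytes_ref.addr() + 2 + key_bytes.len()`.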
#[cfg(test)]
mod tests {
use super::*;
use super::super::heap::{Heap, HeapAllocable};
use super::djb2;
use super::murmurhash2::murmurhash2;
use test::Bencher;
use std::collections::hash_map::DefaultHasher;
use std::hash::Hasher;
use std::collections::HashSet;
use super::split_memory;
struct TestValue {
val: u32,
@@ -180,54 +226,84 @@ mod tests {
}
}
#[test]
fn test_hashmap_size() {
assert_eq!(split_memory(100_000), (67232, 12));
assert_eq!(split_memory(1_000_000), (737856, 15));
assert_eq!(split_memory(10_000_000), (7902848, 18));
}
#[test]
fn test_hash_map() {
let heap = Heap::with_capacity(2_000_000);
let mut hash_map: HashMap<TestValue> = HashMap::new(18, &heap);
let mut hash_map: TermHashMap = TermHashMap::new(18, &heap);
{
{
let v: &mut TestValue = hash_map.get_or_create("abc");
let v: &mut TestValue = hash_map.get_or_create("abc").1;
assert_eq!(v.val, 0u32);
v.val = 3u32;
}
}
{
let v: &mut TestValue = hash_map.get_or_create("abcd");
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
assert_eq!(v.val, 0u32);
v.val = 4u32;
}
{
let v: &mut TestValue = hash_map.get_or_create("abc");
let v: &mut TestValue = hash_map.get_or_create("abc").1;
assert_eq!(v.val, 3u32);
}
{
let v: &mut TestValue = hash_map.get_or_create("abcd");
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
assert_eq!(v.val, 4u32);
}
let mut iter_values = hash_map.values_mut();
assert_eq!(iter_values.next().unwrap().val, 3u32);
assert_eq!(iter_values.next().unwrap().val, 4u32);
assert!(!iter_values.next().is_some());
let mut iter_values = hash_map.iter();
{
let (_, addr, _) = iter_values.next().unwrap();
let val: &TestValue = heap.get_ref(addr);
assert_eq!(val.val, 3u32);
}
{
let (_, addr, _) = iter_values.next().unwrap();
let val: &TestValue = heap.get_ref(addr);
assert_eq!(val.val, 4u32);
}
assert!(iter_values.next().is_none());
}
#[test]
fn test_murmur() {
let s1 = "abcdef";
let s2 = "abcdeg";
for i in 0..5 {
assert_eq!(
murmurhash2(&s1[i..5].as_bytes()),
murmurhash2(&s2[i..5].as_bytes())
);
}
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
for i in 0..10_000 {
let s = format!("hash{}", i);
let hash = murmurhash2(s.as_bytes());
set.insert(hash);
}
assert_eq!(set.len(), 10_000);
}
#[bench]
fn bench_djb2(bench: &mut Bencher) {
let v = String::from("abwer");
bench.iter(|| {
djb2(v.as_bytes())
});
}
#[bench]
fn bench_siphasher(bench: &mut Bencher) {
let v = String::from("abwer");
bench.iter(|| {
let mut h = DefaultHasher::new();
h.write(v.as_bytes());
h.finish()
fn bench_murmurhash_2(b: &mut Bencher) {
let keys: Vec<&'static str> =
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
}
}

View File

@@ -1,12 +1,29 @@
use std::cell::UnsafeCell;
use std::mem;
use std::ptr;
use byteorder::{ByteOrder, NativeEndian};
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
///
/// The slice will encode the length of the `&[u8]` slice
/// on 16-bits, and then the data is encoded.
#[derive(Copy, Clone)]
pub struct BytesRef {
pub start: u32,
pub stop: u32,
pub struct BytesRef(u32);
impl BytesRef {
pub fn is_null(&self) -> bool {
self.0 == u32::max_value()
}
pub fn addr(&self) -> u32 {
self.0
}
}
impl Default for BytesRef {
fn default() -> BytesRef {
BytesRef(u32::max_value())
}
}
/// Object that can be allocated in tantivy's custom `Heap`.
@@ -19,20 +36,19 @@ pub struct Heap {
inner: UnsafeCell<InnerHeap>,
}
#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))]
impl Heap {
/// Creates a new heap with a given capacity
pub fn with_capacity(num_bytes: usize) -> Heap {
Heap {
inner: UnsafeCell::new(
InnerHeap::with_capacity(num_bytes)
),
inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)),
}
}
fn inner(&self,) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
fn inner(&self) -> &mut InnerHeap {
unsafe { &mut *self.inner.get() }
}
/// Clears the heap. All the underlying data is lost.
///
/// This heap does not support deallocation.
@@ -40,20 +56,9 @@ impl Heap {
pub fn clear(&self) {
self.inner().clear();
}
/// Return the heap capacity.
pub fn capacity(&self,) -> u32 {
self.inner().capacity()
}
/// Return the amount of memory that has been allocated so far.
pub fn len(&self,) -> u32 {
self.inner().len()
}
/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self,) -> u32 {
pub fn num_free_bytes(&self) -> u32 {
self.inner().num_free_bytes()
}
@@ -62,37 +67,42 @@ impl Heap {
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
self.inner().allocate_space(num_bytes)
}
/// Allocate an object in the heap
pub fn allocate_object<V: HeapAllocable>(&self,) -> (u32, &mut V) {
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
let addr = self.inner().allocate_space(mem::size_of::<V>());
let v: V = V::with_addr(addr);
self.inner().set(addr, &v);
(addr, self.inner().get_mut_ref(addr))
}
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
self.inner().allocate_and_set(data)
}
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
/// given as argument.
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
self.inner().get_slice(bytes_ref.start, bytes_ref.stop)
self.inner().get_slice(bytes_ref)
}
/// Stores an item's data in the heap, at the given `address`.
pub fn set<Item>(&self, addr: u32, val: &Item) {
self.inner().set(addr, val);
}
/// Returns a mutable reference for an object at a given address.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}
}
/// Returns a mutable reference to an `Item` at a given `addr`.
#[cfg(test)]
pub fn get_ref<Item>(&self, addr: u32) -> &mut Item {
self.get_mut_ref(addr)
}
}
struct InnerHeap {
buffer: Vec<u8>,
@@ -101,24 +111,11 @@ struct InnerHeap {
next_heap: Option<Box<InnerHeap>>,
}
/// Initializing a long Vec<u8> is crazy slow in
/// debug mode.
/// We use this unsafe trick to make unit tests
/// way faster.
fn allocate_fast(num_bytes: usize) -> Vec<u8> {
let mut buffer = Vec::with_capacity(num_bytes);
unsafe {
buffer.set_len(num_bytes);
}
buffer
}
impl InnerHeap {
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
let buffer: Vec<u8> = allocate_fast(num_bytes);
let buffer: Vec<u8> = vec![0u8; num_bytes];
InnerHeap {
buffer: buffer,
buffer,
buffer_len: num_bytes as u32,
next_heap: None,
used: 0u32,
@@ -130,23 +127,14 @@ impl InnerHeap {
self.next_heap = None;
}
pub fn capacity(&self,) -> u32 {
self.buffer.len() as u32
}
pub fn len(&self,) -> u32 {
self.used
}
// Returns the number of free bytes. If the buffer
// has reached its capacity and overflowed to another buffer, return 0.
pub fn num_free_bytes(&self,) -> u32 {
pub fn num_free_bytes(&self) -> u32 {
if self.next_heap.is_some() {
0u32
}
else {
} else {
self.buffer_len - self.used
}
}
}
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
@@ -154,74 +142,85 @@ impl InnerHeap {
self.used += num_bytes as u32;
if self.used <= self.buffer_len {
addr
}
else {
} else {
if self.next_heap.is_none() {
warn!("Exceeded heap size. The margin was apparently unsufficient. The segment will be committed right after indexing this very last document.");
info!(
r#"Exceeded heap size. The segment will be committed right
after indexing this document."#,
);
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
}
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
}
}
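A worked example of the overflow path: with `buffer_len = 1_000` and `used = 990`, a 20-byte allocation pushes `used` to 1_010 > 1_000, so a fresh `next_heap` of the same capacity is boxed and the returned address is 1_000 (the overflow heap's offset 0 plus `buffer_len`), leaving the last 10 bytes of the first buffer unused. Addresses at or above `buffer_len` are then transparently forwarded by `get_slice`, `get_mut` and `get_mut_ref` below, each subtracting `buffer_len` before recursing.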
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
let start = bytes_ref.0;
if start >= self.buffer_len {
self.next_heap.as_ref().unwrap().get_slice(start - self.buffer_len, stop - self.buffer_len)
}
else {
&self.buffer[start as usize..stop as usize]
self.next_heap
.as_ref()
.unwrap()
.get_slice(BytesRef(start - self.buffer_len))
} else {
let start = start as usize;
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
&self.buffer[start + 2..start + 2 + len]
}
}
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
if start >= self.buffer_len {
self.next_heap.as_mut().unwrap().get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
}
else {
self.next_heap
.as_mut()
.unwrap()
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
} else {
&mut self.buffer[start as usize..stop as usize]
}
}
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
let start = self.allocate_space(data.len());
let stop = start + data.len() as u32;
self.get_mut_slice(start, stop).clone_from_slice(data);
BytesRef {
start: start as u32,
stop: stop as u32,
}
assert!(data.len() < u16::max_value() as usize);
let total_len = 2 + data.len();
let start = self.allocate_space(total_len);
let total_buff = self.get_mut_slice(start, start + total_len as u32);
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
total_buff[2..].clone_from_slice(data);
BytesRef(start)
}
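For example, with hypothetical values: storing b"abc" at start address 100 writes the native-endian length 3 into bytes 100..102 and the payload into 102..105; `get_slice(BytesRef(100))` then reads the 2-byte length back and returns `&buffer[102..105]`.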
fn get_mut(&mut self, addr: u32) -> *mut u8 {
if addr >= self.buffer_len {
self.next_heap.as_mut().unwrap().get_mut(addr - self.buffer_len)
}
else {
self.next_heap
.as_mut()
.unwrap()
.get_mut(addr - self.buffer_len)
} else {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
}
}
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
if addr >= self.buffer_len {
self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len)
}
else {
self.next_heap
.as_mut()
.unwrap()
.get_mut_ref(addr - self.buffer_len)
} else {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
unsafe { &mut *v_ptr }
}
}
fn set<Item>(&mut self, addr: u32, val: &Item) {
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap.as_mut().unwrap().set(addr - self.buffer_len, val);
}
else {
self.next_heap
.as_mut()
.unwrap()
.set(addr - self.buffer_len, val);
} else {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
@@ -231,4 +230,4 @@ impl InnerHeap {
}
}
}
}
}

View File

@@ -1,46 +1,43 @@
mod hashmap;
pub(crate) mod hashmap;
mod heap;
mod expull;
pub use self::heap::{Heap, HeapAllocable};
pub use self::expull::ExpUnrolledLinkedList;
pub use self::hashmap::{HashMap, Entry};
pub use self::hashmap::TermHashMap;
#[test]
fn test_unrolled_linked_list() {
use std::collections;
let heap = Heap::with_capacity(30_000_000);
{
heap.clear();
let mut ks: Vec<usize> = (1..5).map(|k| k * 100).collect();
ks.push(2);
ks.push(3);
for k in (1..5).map(|k| k * 100) {
let mut hashmap: HashMap<ExpUnrolledLinkedList> = HashMap::new(10, &heap);
for k in (1..5).map(|k| k * 100) {
let mut hashmap: TermHashMap = TermHashMap::new(10, &heap);
for j in 0..k {
for i in 0..500 {
let mut list = hashmap.get_or_create(i.to_string());
list.push(i*j, &heap);
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string()).1;
v.push(i * j, &heap);
}
}
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
for (key, addr, _) in hashmap.iter() {
map_addr.insert(Vec::from(key), addr);
}
for i in 0..500 {
match hashmap.lookup(i.to_string()) {
Entry::Occupied(addr) => {
let v: &mut ExpUnrolledLinkedList = heap.get_mut_ref(addr);
let mut it = v.iter(addr, &heap);
for j in 0..k {
assert_eq!(it.next().unwrap(), i*j);
}
assert!(!it.next().is_some());
}
_ => {
panic!("should never happen");
}
let key: String = i.to_string();
let addr: u32 = *map_addr.get(key.as_bytes()).unwrap();
let exp_pull: &ExpUnrolledLinkedList = heap.get_ref(addr);
let mut it = exp_pull.iter(addr, &heap);
for j in 0..k {
assert_eq!(it.next().unwrap(), i * j);
}
assert!(!it.next().is_some());
}
}
}
}
}

View File

@@ -1,56 +1,56 @@
use std::marker::Send;
use std::fmt;
use std::path::Path;
use directory::error::{FileError, OpenWriteError};
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use std::result;
use std::io;
use std::marker::Sync;
/// Write-once read many (WORM) abstraction for where tantivy's index should be stored.
/// Write-once read many (WORM) abstraction for where
/// tantivy's data should be stored.
///
/// There are currently two implementations of `Directory`
///
///
/// - The [`MMapDirectory`](struct.MmapDirectory.html), this
///   should be your default choice.
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
///   should be used mostly for tests.
///
///
pub trait Directory: fmt::Debug + Send + Sync + 'static {
/// Opens a virtual file for read.
///
///
/// Once a virtual file is open, its data may not
/// change.
///
/// Specifically, subsequent writes or flushes should
/// have no effect on the returned `ReadOnlySource` object.
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, FileError>;
/// have no effect on the returned `ReadOnlySource` object.
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError>;
/// Removes a file
///
/// Removing a file will not affect any
/// existing ReadOnlySource pointing to it.
///
///
/// Removing a nonexistent file yields a
/// `FileError::DoesNotExist`.
fn delete(&self, path: &Path) -> result::Result<(), FileError>;
/// `DeleteError::DoesNotExist`.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError>;
/// Returns true iff the file exists
fn exists(&self, path: &Path) -> bool;
/// Opens a writer for the *virtual file* associated with
/// a Path.
///
/// Right after this call, the file should be created
/// and any subsequent call to `open_read` for the
/// same path should return a `ReadOnlySource`.
///
///
/// Write operations may be aggressively buffered.
/// The client of this trait is responsible for calling flush
/// to ensure that subsequent `read` operations
/// will take into account preceding `write` operations.
///
///
/// Flush operation should also be persistent.
///
/// The user shall not rely on `Drop` triggering `flush`.
@@ -59,18 +59,21 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
///
/// The file may not previously exist.
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError>;
/// Reads the full content of the file that has been written using
/// atomic_write.
///
/// This should only be used for small files.
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError>;
/// Atomically replace the content of a file with data.
///
///
/// This call ensures that reads can never *observe*
/// a partially written file.
///
///
/// The file may or may not previously exist.
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
/// Clones the directory and boxes the clone
fn box_clone(&self) -> Box<Directory>;
}
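A hedged usage sketch of this trait against the `RAMDirectory` defined later in this diff (the path and contents are made up; note the explicit flush, since `VecWriter` panics on unflushed drop):

use std::io::Write;
use std::path::Path;

fn directory_roundtrip(directory: &mut RAMDirectory) {
    let path = Path::new("demo");
    {
        let mut writer = directory.open_write(path).unwrap();
        writer.write_all(b"hello").unwrap();
        writer.flush().unwrap(); // make the write visible; do not rely on Drop
    }
    let source = directory.open_read(path).unwrap();
    assert_eq!(&*source, b"hello"); // ReadOnlySource derefs to &[u8]
    directory.delete(path).unwrap();
}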

View File

@@ -1,44 +1,214 @@
use std::error::Error as StdError;
use std::path::PathBuf;
use std::io;
use std::fmt;
/// General IO error with an optional path to the offending file.
#[derive(Debug)]
pub struct IOError {
path: Option<PathBuf>,
err: io::Error,
}
impl fmt::Display for IOError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self.path {
Some(ref path) => write!(f, "io error occurred on path '{:?}': '{}'", path, self.err),
None => write!(f, "io error occurred: '{}'", self.err),
}
}
}
impl StdError for IOError {
fn description(&self) -> &str {
"io error occurred"
}
fn cause(&self) -> Option<&StdError> {
Some(&self.err)
}
}
impl IOError {
pub(crate) fn with_path(path: PathBuf, err: io::Error) -> Self {
IOError {
path: Some(path),
err,
}
}
}
impl From<io::Error> for IOError {
fn from(err: io::Error) -> IOError {
IOError { path: None, err }
}
}
/// Error that may occur when opening a directory
#[derive(Debug)]
pub enum OpenDirectoryError {
/// The underlying directory does not exist.
DoesNotExist(PathBuf),
/// The path exists but is not a directory.
NotADirectory(PathBuf),
}
impl fmt::Display for OpenDirectoryError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
OpenDirectoryError::DoesNotExist(ref path) => {
write!(f, "the underlying directory '{:?}' does not exist", path)
}
OpenDirectoryError::NotADirectory(ref path) => {
write!(f, "the path '{:?}' exists but is not a directory", path)
}
}
}
}
impl StdError for OpenDirectoryError {
fn description(&self) -> &str {
"error occurred while opening a directory"
}
fn cause(&self) -> Option<&StdError> {
None
}
}
/// Error that may occur when starting to write in a file
#[derive(Debug)]
pub enum OpenWriteError {
/// Our directory is WORM, writing an existing file is forbidden.
/// Check out the `Directory` documentation.
FileAlreadyExists(PathBuf),
/// Any kind of IO error that happens when
/// writing in the underlying IO device.
IOError(io::Error),
IOError(IOError),
}
impl From<io::Error> for OpenWriteError {
fn from(err: io::Error) -> OpenWriteError {
impl From<IOError> for OpenWriteError {
fn from(err: IOError) -> OpenWriteError {
OpenWriteError::IOError(err)
}
}
/// Error that may occur when accessing a file (read, or delete)
#[derive(Debug)]
pub enum FileError {
/// The file does not exist.
FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
IOError(io::Error),
}
impl From<io::Error> for FileError {
fn from(err: io::Error) -> FileError {
FileError::IOError(err)
impl fmt::Display for OpenWriteError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
OpenWriteError::FileAlreadyExists(ref path) => {
write!(f, "the file '{:?}' already exists", path)
}
OpenWriteError::IOError(ref err) => write!(
f,
"an io error occurred while opening a file for writing: '{}'",
err
),
}
}
}
impl StdError for OpenWriteError {
fn description(&self) -> &str {
"error occurred while opening a file for writing"
}
fn cause(&self) -> Option<&StdError> {
match *self {
OpenWriteError::FileAlreadyExists(_) => None,
OpenWriteError::IOError(ref err) => Some(err),
}
}
}
/// Error that may occur when opening a file for reading
#[derive(Debug)]
pub enum OpenReadError {
/// The file does not exist.
FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
IOError(IOError),
}
impl From<IOError> for OpenReadError {
fn from(err: IOError) -> OpenReadError {
OpenReadError::IOError(err)
}
}
impl fmt::Display for OpenReadError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
OpenReadError::FileDoesNotExist(ref path) => {
write!(f, "the file '{:?}' does not exist", path)
}
OpenReadError::IOError(ref err) => write!(
f,
"an io error occurred while opening a file for reading: '{}'",
err
),
}
}
}
impl StdError for OpenReadError {
fn description(&self) -> &str {
"error occurred while opening a file for reading"
}
fn cause(&self) -> Option<&StdError> {
match *self {
OpenReadError::FileDoesNotExist(_) => None,
OpenReadError::IOError(ref err) => Some(err),
}
}
}
/// Error that may occur when trying to delete a file
#[derive(Debug)]
pub enum DeleteError {
/// The file does not exist.
FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when
/// interacting with the underlying IO device.
IOError(IOError),
/// The file may not be deleted because it is
/// protected.
FileProtected(PathBuf),
}
impl From<IOError> for DeleteError {
fn from(err: IOError) -> DeleteError {
DeleteError::IOError(err)
}
}
impl fmt::Display for DeleteError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
DeleteError::FileDoesNotExist(ref path) => {
write!(f, "the file '{:?}' does not exist", path)
}
DeleteError::FileProtected(ref path) => {
write!(f, "the file '{:?}' is protected and can't be deleted", path)
}
DeleteError::IOError(ref err) => {
write!(f, "an io error occurred while deleting a file: '{}'", err)
}
}
}
}
impl StdError for DeleteError {
fn description(&self) -> &str {
"error occurred while deleting a file"
}
fn cause(&self) -> Option<&StdError> {
match *self {
DeleteError::FileDoesNotExist(_) | DeleteError::FileProtected(_) => None,
DeleteError::IOError(ref err) => Some(err),
}
}
}
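A sketch of how these types compose in practice, mirroring the `open_mmap` helper later in this diff: a raw `io::Error` is classified by kind, wrapped into an `IOError` together with the offending path, and lifted into the read error (the function name is made up):

use std::io;
use std::path::PathBuf;

fn classify_read_error(path: PathBuf, err: io::Error) -> OpenReadError {
    if err.kind() == io::ErrorKind::NotFound {
        OpenReadError::FileDoesNotExist(path)
    } else {
        // The From<IOError> impls keep the `?` operator ergonomic at call sites.
        OpenReadError::IOError(IOError::with_path(path, err))
    }
}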

View File

@@ -0,0 +1,399 @@
use std::path::{Path, PathBuf};
use serde_json;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use std::result;
use std::io;
use Directory;
use std::sync::{Arc, RwLock};
use std::collections::HashSet;
use std::sync::RwLockWriteGuard;
use std::io::Write;
use core::MANAGED_FILEPATH;
use std::collections::HashMap;
use std::fmt;
use error::{ErrorKind, Result, ResultExt};
/// Wrapper of directories that keeps track of files created by Tantivy.
///
/// A managed directory is just a wrapper of a directory
/// that keeps a (persisted) list of the files that
/// have been created (and not deleted) by tantivy so far.
///
/// Thanks to this list, it implements a `garbage_collect` method
/// that removes the files that were created by tantivy and are not
/// useful anymore.
#[derive(Debug)]
pub struct ManagedDirectory {
directory: Box<Directory>,
meta_informations: Arc<RwLock<MetaInformation>>,
}
#[derive(Debug, Default)]
struct MetaInformation {
managed_paths: HashSet<PathBuf>,
protected_files: HashMap<PathBuf, usize>,
}
/// A `FileProtection` prevents the garbage collection of a file.
///
/// See `ManagedDirectory.protect_file_from_delete`.
pub struct FileProtection {
directory: ManagedDirectory,
path: PathBuf,
}
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
let mut meta_informations_wlock = directory
.meta_informations
.write()
.expect("Managed file lock poisoned");
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
(*counter_ref_mut) -= 1;
}
}
impl fmt::Debug for FileProtection {
fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
write!(formatter, "FileProtectionFor({:?})", self.path)
}
}
impl Drop for FileProtection {
fn drop(&mut self) {
unprotect_file_from_delete(&self.directory, &*self.path);
}
}
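Usage is RAII-style. A hedged sketch, with names borrowed from the tests at the bottom of this file:

let protection = managed_directory.protect_file_from_delete(path);
managed_directory.garbage_collect(|| living_files.clone()); // `path` survives this pass
drop(protection); // counter goes back to zero
managed_directory.garbage_collect(|| living_files); // now `path` may be collected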
/// Saves the file containing the list of existing files
/// that were created by tantivy.
fn save_managed_paths(
directory: &mut Directory,
wlock: &RwLockWriteGuard<MetaInformation>,
) -> io::Result<()> {
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
write!(&mut w, "\n")?;
directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
Ok(())
}
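The persisted payload is simply the JSON list of managed paths followed by a newline; for the two test files used further down it would look like (order unspecified, since the paths live in a `HashSet`):

["some_path_for_test", "some_path_for_test2"]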
impl ManagedDirectory {
/// Wraps a directory as managed directory.
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files: HashMap::default(),
})),
})
}
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
directory: box directory,
meta_informations: Arc::default(),
}),
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
}
}
/// Garbage collect unused files.
///
/// Removes the files that were created by `tantivy` and are not
/// used by any segment anymore.
///
/// * `living_files` - List of files that are still used by the index.
///
/// This method neither panics nor returns errors.
/// If a file cannot be deleted (for permission reasons for instance)
/// an error is simply logged, and the file remains in the list of managed
/// files.
pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
info!("Garbage collect");
let mut files_to_delete = vec![];
{
// releasing the lock as .delete() will use it too.
let meta_informations_rlock = self.meta_informations
.read()
.expect("Managed directory rlock poisoned in garbage collect.");
// It is crucial to get the living files after acquiring the
// read lock of meta informations. That way, we
// avoid the following scenario.
//
// 1) we get the list of living files.
// 2) someone creates a new file.
// 3) we start garbage collection and remove this file
// even though it is a living file.
let living_files = get_living_files();
for managed_path in &meta_informations_rlock.managed_paths {
if !living_files.contains(managed_path) {
files_to_delete.push(managed_path.clone());
}
}
}
let mut deleted_files = vec![];
{
for file_to_delete in files_to_delete {
match self.delete(&file_to_delete) {
Ok(_) => {
info!("Deleted {:?}", file_to_delete);
deleted_files.push(file_to_delete);
}
Err(file_error) => {
match file_error {
DeleteError::FileDoesNotExist(_) => {
deleted_files.push(file_to_delete);
}
DeleteError::IOError(_) => {
if !cfg!(target_os = "windows") {
// On windows, delete is expected to fail if the file
// is mmapped.
error!("Failed to delete {:?}", file_to_delete);
}
}
DeleteError::FileProtected(_) => {
// this is expected.
}
}
}
}
}
}
if !deleted_files.is_empty() {
// update the list of managed files by removing
// the files that were removed.
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed directory wlock poisoned (2).");
{
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
for delete_file in &deleted_files {
managed_paths_write.remove(delete_file);
}
}
if save_managed_paths(self.directory.as_mut(), &meta_informations_wlock).is_err() {
error!("Failed to save the list of managed files.");
}
}
}
/// Protects a file from being garbage collected.
///
/// The method returns a `FileProtection` object.
/// The file will not be garbage collected as long as the
/// `FileProtection` object is kept alive.
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
let pathbuf = path.to_owned();
{
let mut meta_informations_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned on protect");
*meta_informations_wlock
.protected_files
.entry(pathbuf.clone())
.or_insert(0) += 1;
}
FileProtection {
directory: self.clone(),
path: pathbuf.clone(),
}
}
/// Registers a file as managed
///
/// This method must be called before the file is
/// actually created to ensure that a failure between
/// registering the filepath and creating the file
/// will not lead to garbage files that will
/// never get removed.
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
let mut meta_wlock = self.meta_informations
.write()
.expect("Managed file lock poisoned");
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
if has_changed {
save_managed_paths(self.directory.as_mut(), &meta_wlock)?;
}
Ok(())
}
}
impl Directory for ManagedDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
self.directory.open_read(path)
}
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
self.register_file_as_managed(path)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
self.directory.open_write(path)
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
self.register_file_as_managed(path)?;
self.directory.atomic_write(path, data)
}
fn atomic_read(&self, path: &Path) -> result::Result<Vec<u8>, OpenReadError> {
self.directory.atomic_read(path)
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
{
let metas_rlock = self.meta_informations
.read()
.expect("poisoned lock in managed directory meta");
if let Some(counter) = metas_rlock.protected_files.get(path) {
if *counter > 0 {
return Err(DeleteError::FileProtected(path.to_owned()));
}
}
}
self.directory.delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.directory.exists(path)
}
fn box_clone(&self) -> Box<Directory> {
box self.clone()
}
}
impl Clone for ManagedDirectory {
fn clone(&self) -> ManagedDirectory {
ManagedDirectory {
directory: self.directory.box_clone(),
meta_informations: Arc::clone(&self.meta_informations),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[cfg(feature="mmap")]
use directory::MmapDirectory;
use std::path::Path;
use std::io::Write;
use tempdir::TempDir;
lazy_static! {
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
}
#[test]
#[cfg(feature="mmap")]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
}
{
managed_directory
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
.unwrap();
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
}
#[test]
#[cfg(feature="mmap ")]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try to delete the file and fail, as it is still mmapped.
assert!(managed_directory.exists(*TEST_PATH1));
// unmap should happen here.
drop(_mmap_read);
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
}
}
#[test]
#[cfg(feature="mmap")]
fn test_managed_directory_protect() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
{
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
managed_directory.garbage_collect(|| living_files.clone());
assert!(managed_directory.exists(*TEST_PATH1));
}
managed_directory.garbage_collect(|| living_files.clone());
assert!(!managed_directory.exists(*TEST_PATH1));
}
}

View File

@@ -1,81 +1,165 @@
use std::path::{Path, PathBuf};
use tempdir::TempDir;
use std::collections::HashMap;
use std::collections::hash_map::Entry as HashMapEntry;
use fst::raw::MmapReadOnly;
use std::fs::File;
use atomicwrites;
use std::sync::RwLock;
use std::fmt;
use std::io::Write;
use std::io;
use std::io::{Seek, SeekFrom};
use directory::Directory;
use directory::ReadOnlySource;
use directory::WritePtr;
use std::io::BufWriter;
use std::fs::OpenOptions;
use directory::error::{OpenWriteError, FileError, OpenDirectoryError};
use std::result;
use common::make_io_err;
use std::sync::Arc;
use std::fs;
use directory::Directory;
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use directory::ReadOnlySource;
use directory::shared_vec_slice::SharedVecSlice;
use directory::WritePtr;
use fst::raw::MmapReadOnly;
use std::collections::hash_map::Entry as HashMapEntry;
use std::collections::HashMap;
use std::convert::From;
use std::fmt;
use std::fs::{self, File};
use std::fs::OpenOptions;
use std::io::{self, Seek, SeekFrom};
use std::io::{BufWriter, Read, Write};
use std::path::{Path, PathBuf};
use std::result;
use std::sync::Arc;
use std::sync::RwLock;
use tempdir::TempDir;
/// Returns None iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped).
///
fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
let file = File::open(full_path).map_err(|e| {
if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_owned())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
}
})?;
let meta_data = file.metadata()
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
if meta_data.len() == 0 {
// if the file size is 0, it will not be possible
// to mmap the file, so we return None
// instead.
return Ok(None);
}
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
}
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
pub struct CacheCounters {
// Number of times the cache prevented a call to `mmap`
pub hit: usize,
// Number of times tantivy had to call `mmap`
// as no entry was in the cache.
pub miss: usize,
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct CacheInfo {
pub counters: CacheCounters,
pub mmapped: Vec<PathBuf>,
}
struct MmapCache {
counters: CacheCounters,
cache: HashMap<PathBuf, MmapReadOnly>,
}
impl Default for MmapCache {
fn default() -> MmapCache {
MmapCache {
counters: CacheCounters::default(),
cache: HashMap::new(),
}
}
}
impl MmapCache {
/// Removes a `MmapReadOnly` entry from the mmap cache.
fn discard_from_cache(&mut self, full_path: &Path) -> bool {
self.cache.remove(full_path).is_some()
}
fn get_info(&mut self) -> CacheInfo {
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
CacheInfo {
counters: self.counters.clone(),
mmapped: paths,
}
}
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<MmapReadOnly>, OpenReadError> {
Ok(match self.cache.entry(full_path.to_owned()) {
HashMapEntry::Occupied(occupied_entry) => {
let mmap = occupied_entry.get();
self.counters.hit += 1;
Some(mmap.clone())
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss += 1;
if let Some(mmap) = open_mmap(full_path)? {
vacant_entry.insert(mmap.clone());
Some(mmap)
} else {
None
}
}
})
}
}
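Every `open_read` funnels through this cache: an occupied entry bumps `hit` and clones the reference-counted `MmapReadOnly`, while a vacant entry bumps `miss` and performs the actual `mmap`. The `test_cache` test at the end of this file exercises exactly this: ten files opened twice each yield `miss == 10` on the first pass and `hit == 10` on the second.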
/// Directory storing data in files, read via mmap.
///
/// The Mmap objects are cached to limit the
/// number of system calls.
#[derive(Clone)]
pub struct MmapDirectory {
root_path: PathBuf,
mmap_cache: Arc<RwLock<HashMap<PathBuf, MmapReadOnly>>>,
mmap_cache: Arc<RwLock<MmapCache>>,
_temp_directory: Arc<Option<TempDir>>,
}
impl fmt::Debug for MmapDirectory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "MmapDirectory({:?})", self.root_path)
}
}
impl MmapDirectory {
/// Creates a new MmapDirectory in a temporary directory.
///
/// This is mostly useful to test the MmapDirectory itself.
/// For your unit tests, prefer the RAMDirectory.
pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
let tempdir = try!(TempDir::new("index"));
let tempdir = TempDir::new("index")?;
let tempdir_path = PathBuf::from(tempdir.path());
let directory = MmapDirectory {
root_path: PathBuf::from(tempdir_path),
mmap_cache: Arc::new(RwLock::new(HashMap::new())),
_temp_directory: Arc::new(Some(tempdir))
root_path: tempdir_path,
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(Some(tempdir)),
};
Ok(directory)
}
/// Opens a MmapDirectory in a directory.
///
/// Returns an error if the `directory_path` does not
/// exist or if it is not a directory.
pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
let directory_path: &Path = directory_path.as_ref();
if !directory_path.exists() {
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path)))
}
else if !directory_path.is_dir() {
Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path)))
}
else {
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
directory_path,
)))
} else if !directory_path.is_dir() {
Err(OpenDirectoryError::NotADirectory(PathBuf::from(
directory_path,
)))
} else {
Ok(MmapDirectory {
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(HashMap::new())),
_temp_directory: Arc::new(None)
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None),
})
}
}
@@ -89,16 +173,45 @@ impl MmapDirectory {
/// Sync the root directory.
/// In certain FS, this is required to persistently create
/// a file.
fn sync_directory(&self,) -> Result<(), io::Error> {
let fd = try!(File::open(&self.root_path));
try!(fd.sync_all());
fn sync_directory(&self) -> Result<(), io::Error> {
let mut open_opts = OpenOptions::new();
// Linux needs read to be set, otherwise returns EINVAL
// write must not be set, or it fails with EISDIR
open_opts.read(true);
// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
// and calling sync_all() only works if write access is requested.
#[cfg(windows)]
{
use std::os::windows::fs::OpenOptionsExt;
use winapi::winbase;
open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}
let fd = open_opts.open(&self.root_path)?;
fd.sync_all()?;
Ok(())
}
/// Returns some statistical information
/// about the Mmap cache.
///
/// The `MmapDirectory` embeds a `MmapCache`
/// to avoid multiplying the `mmap` system calls.
pub fn get_cache_info(&mut self) -> CacheInfo {
self.mmap_cache
.write()
.expect("Mmap cache lock is poisoned.")
.get_info()
}
}
/// This Write wraps a File, but has the specificity of
/// calling `sync_all` on flush.
struct SafeFileWriter(File);
impl SafeFileWriter {
@@ -108,13 +221,12 @@ impl SafeFileWriter {
}
impl Write for SafeFileWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.0.write(buf)
}
fn flush(&mut self) -> io::Result<()> {
try!(self.0.flush());
self.0.flush()?;
self.0.sync_all()
}
}
@@ -125,99 +237,86 @@ impl Seek for SafeFileWriter {
}
}
impl Directory for MmapDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, FileError> {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = try!(
self.mmap_cache
.write()
.map_err(|_| {
make_io_err(format!("Failed to acquired write lock on mmap cache while reading {:?}", path))
})
);
let mmap = match mmap_cache.entry(full_path.clone()) {
HashMapEntry::Occupied(e) => {
e.get().clone()
}
HashMapEntry::Vacant(vacant_entry) => {
let file = try!(
File::open(&full_path).map_err(|err| {
if err.kind() == io::ErrorKind::NotFound {
FileError::FileDoesNotExist(full_path.clone())
}
else {
FileError::IOError(err)
}
})
);
if try!(file.metadata()).len() == 0 {
// if the file size is 0, it will not be possible
// to mmap the file, so we return an anonymous mmap_cache
// instead.
return Ok(ReadOnlySource::Anonymous(SharedVecSlice::empty()))
}
let new_mmap = try!(MmapReadOnly::open(&file));
vacant_entry.insert(new_mmap.clone());
new_mmap
}
};
Ok(ReadOnlySource::Mmap(mmap))
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while reading {:?}",
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
Ok(mmap_cache
.get_mmap(&full_path)?
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
debug!("Open Write {:?}", path);
let full_path = self.resolve_path(path);
let open_res = OpenOptions::new()
.write(true)
.create_new(true)
.open(full_path);
let mut file = try!(
open_res.map_err(|err| {
if err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(PathBuf::from(path))
}
else {
OpenWriteError::IOError(err)
}
})
);
let mut file = open_res.map_err(|err| {
if err.kind() == io::ErrorKind::AlreadyExists {
OpenWriteError::FileAlreadyExists(path.to_owned())
} else {
IOError::with_path(path.to_owned(), err).into()
}
})?;
// making sure the file is created.
try!(file.flush());
file.flush()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
// Apparently, on some filesystems, syncing the parent
// directory is required.
try!(self.sync_directory());
self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer)))
}
fn delete(&self, path: &Path) -> result::Result<(), FileError> {
debug!("Delete {:?}", path);
/// Any entry associated with the path in the mmap cache will be
/// removed before the file is deleted.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = try!(self.mmap_cache
.write()
.map_err(|_| {
make_io_err(format!("Failed to acquired write lock on mmap cache while deleting {:?}", path))
})
);
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while deleting {:?}",
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
mmap_cache.discard_from_cache(path);
// Removing the entry in the MMap cache.
// The munmap will happen on Drop,
// when the last reference is gone.
mmap_cache.remove(&full_path);
try!(fs::remove_file(&full_path));
try!(self.sync_directory());
Ok(())
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => self.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
} else {
Err(IOError::with_path(path.to_owned(), e).into())
}
}
}
}
fn exists(&self, path: &Path) -> bool {
@@ -225,18 +324,100 @@ impl Directory for MmapDirectory {
full_path.exists()
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
let full_path = self.resolve_path(path);
let mut buffer = Vec::new();
match File::open(&full_path) {
Ok(mut file) => {
file.read_to_end(&mut buffer)
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
Ok(buffer)
}
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
} else {
Err(IOError::with_path(path.to_owned(), e).into())
}
}
}
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
debug!("Atomic Write {:?}", path);
let full_path = self.resolve_path(path);
let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
try!(meta_file.write(|f| {
f.write_all(data)
}));
meta_file.write(|f| f.write_all(data))?;
Ok(())
}
fn box_clone(&self,) -> Box<Directory> {
fn box_clone(&self) -> Box<Directory> {
Box::new(self.clone())
}
}
#[cfg(test)]
mod tests {
// There are more tests in directory/mod.rs
// The following tests are specific to the MmapDirectory
use super::*;
#[test]
fn test_open_empty() {
// Empty files are actually an edge case because those
// cannot be mmapped.
//
// In that case the directory returns a SharedVecSlice.
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let path = PathBuf::from("test");
{
let mut w = mmap_directory.open_write(&path).unwrap();
w.flush().unwrap();
}
let readonlymap = mmap_directory.open_read(&path).unwrap();
assert_eq!(readonlymap.len(), 0);
}
#[test]
fn test_cache() {
let content = "abc".as_bytes();
// here we test if the cache releases
// mmaps correctly.
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let num_paths = 10;
let paths: Vec<PathBuf> = (0..num_paths)
.map(|i| PathBuf::from(&*format!("file_{}", i)))
.collect();
{
for path in &paths {
let mut w = mmap_directory.open_write(path).unwrap();
w.write(content).unwrap();
w.flush().unwrap();
}
}
{
for (i, path) in paths.iter().enumerate() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
}
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
}
for (i, path) in paths.iter().enumerate() {
mmap_directory.delete(path).unwrap();
assert_eq!(
mmap_directory.get_cache_info().mmapped.len(),
num_paths - i - 1
);
}
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
}
}

View File

@@ -1,20 +1,36 @@
/*!
WORM directory abstraction.
*/
#[cfg(feature="mmap")]
mod mmap_directory;
mod ram_directory;
mod directory;
mod read_only_source;
mod shared_vec_slice;
mod managed_directory;
mod static_directory;
/// Errors specific to the directory module.
pub mod error;
use std::io::{Write, Seek};
use std::io::{BufWriter, Seek, Write};
use std::io::BufWriter;
pub use self::static_directory::StaticDirectory;
pub use self::static_directory::write_static_from_directory;
pub use self::read_only_source::ReadOnlySource;
pub use self::directory::Directory;
pub use self::ram_directory::RAMDirectory;
#[cfg(feature="mmap")]
pub use self::mmap_directory::MmapDirectory;
pub(crate) use self::read_only_source::SourceRead;
pub(crate) use self::managed_directory::{FileProtection, ManagedDirectory};
/// Synonym of Seek + Write
pub trait SeekableWrite: Seek + Write {}
impl<T: Seek + Write> SeekableWrite for T {}
@@ -29,8 +45,8 @@ pub type WritePtr = BufWriter<Box<SeekableWrite>>;
mod tests {
use super::*;
use std::path::Path;
use std::io::{Write, Seek, SeekFrom};
use std::path::Path;
use std::io::{Seek, SeekFrom, Write};
lazy_static! {
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
@@ -43,6 +59,7 @@ mod tests {
}
#[test]
#[cfg(feature="mmap")]
fn test_mmap_directory() {
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
test_directory(&mut mmap_directory);
@@ -58,31 +75,37 @@ mod tests {
fn test_simple(directory: &mut Directory) {
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
write_file.write_all(&[4]).unwrap();
write_file.write_all(&[3]).unwrap();
write_file.write_all(&[7,3,5]).unwrap();
write_file.flush().unwrap();
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
write_file.write_all(&[4]).unwrap();
write_file.write_all(&[3]).unwrap();
write_file.write_all(&[7, 3, 5]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
assert!(directory.delete(*TEST_PATH).is_ok());
assert!(!directory.exists(*TEST_PATH));
}
fn test_seek(directory: &mut Directory) {
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[4, 3, 7,3,5]).unwrap();
write_file.seek(SeekFrom::Start(0)).unwrap();
write_file.write_all(&[3,1]).unwrap();
write_file.flush().unwrap();
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
write_file.seek(SeekFrom::Start(0)).unwrap();
write_file.write_all(&[3, 1]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
assert!(directory.delete(*TEST_PATH).is_ok());
}
@@ -90,7 +113,6 @@ mod tests {
{
directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
}
{
assert!(directory.open_write(*TEST_PATH).is_err());
@@ -103,27 +125,37 @@ mod tests {
assert!(directory.open_read(*TEST_PATH).is_err());
let _w = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
if let Err(e) = directory.open_read(*TEST_PATH) {
println!("{:?}", e);
}
assert!(directory.open_read(*TEST_PATH).is_ok());
assert!(directory.delete(*TEST_PATH).is_ok());
}
}
fn test_delete(directory: &mut Directory) {
fn test_directory_delete(directory: &mut Directory) {
assert!(directory.open_read(*TEST_PATH).is_err());
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[1, 2, 3, 4]).unwrap();
write_file.flush().unwrap();
let read_handle = directory.open_read(*TEST_PATH).unwrap();
{
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
assert!(directory.delete(*TEST_PATH).is_ok());
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
let read_handle = directory.open_read(*TEST_PATH).unwrap();
{
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
// Mapped files can't be deleted on Windows
if !cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
}
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
}
}
if cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
}
assert!(directory.open_read(*TEST_PATH).is_err());
assert!(directory.delete(*TEST_PATH).is_err());
}
fn test_directory(directory: &mut Directory) {
@@ -131,7 +163,7 @@ mod tests {
test_seek(directory);
test_rewrite_forbidden(directory);
test_write_create_the_file(directory);
test_delete(directory);
test_directory_delete(directory);
}
}

View File

@@ -1,24 +1,24 @@
use std::collections::HashMap;
use std::fmt;
use std::io::{self, BufWriter, Cursor, Write, Seek, SeekFrom};
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::result;
use std::sync::{Arc, RwLock};
use common::make_io_err;
use directory::{Directory, ReadOnlySource};
use directory::error::{OpenWriteError, FileError};
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::WritePtr;
use super::shared_vec_slice::SharedVecSlice;
/// Writer associated with the `RAMDirectory`
///
///
/// The Writer just writes a buffer.
///
/// # Panics
///
/// On drop, if the writer was left in a *dirty* state.
/// That is, if flush was not called after the last call
/// to write.
///
struct VecWriter {
path: PathBuf,
@@ -32,7 +32,7 @@ impl VecWriter {
VecWriter {
path: path_buf,
data: Cursor::new(Vec::new()),
shared_directory: shared_directory,
shared_directory,
is_flushed: true,
}
}
@@ -40,8 +40,11 @@ impl VecWriter {
impl Drop for VecWriter {
fn drop(&mut self) {
if !self.is_flushed {
panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.", self.path)
if !self.is_flushed {
panic!(
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
self.path
)
}
}
}
@@ -55,13 +58,14 @@ impl Seek for VecWriter {
impl Write for VecWriter {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.is_flushed = false;
try!(self.data.write(buf));
self.data.write_all(buf)?;
Ok(buf.len())
}
fn flush(&mut self) -> io::Result<()> {
self.is_flushed = true;
try!(self.shared_directory.write(self.path.clone(), self.data.get_ref()));
self.shared_directory
.write(self.path.clone(), self.data.get_ref())?;
Ok(())
}
}
@@ -69,57 +73,58 @@ impl Write for VecWriter {
#[derive(Clone)]
struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, Arc<Vec<u8>>>>>);
impl InnerDirectory {
fn new() -> InnerDirectory {
InnerDirectory(Arc::new(RwLock::new(HashMap::new())))
}
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
let mut map = try!(
self.0
.write()
.map_err(|_| make_io_err(format!("Failed to lock the directory, when trying to write {:?}", path)))
);
let mut map = self.0.write().map_err(|_| {
make_io_err(format!(
"Failed to lock the directory, when trying to write {:?}",
path
))
})?;
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
Ok(prev_value.is_some())
}
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, FileError> {
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
self.0
.read()
.map_err(|_| {
let io_err = make_io_err(format!("Failed to acquire read lock for the directory, when trying to read {:?}", path));
FileError::IOError(io_err)
let msg = format!(
"Failed to acquire read lock for the \
directory when trying to read {:?}",
path
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| FileError::FileDoesNotExist(PathBuf::from(path)))
.map(|data| {
ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))
})
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(Arc::clone)
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data)))
})
}
fn delete(&self, path: &Path) -> result::Result<(), FileError> {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.0
.write()
.map_err(|_| {
let io_err = make_io_err(format!("Failed to acquire write lock for the directory, when trying to delete {:?}", path));
FileError::IOError(io_err)
let msg = format!(
"Failed to acquire write lock for the \
directory when trying to delete {:?}",
path
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|mut writable_map| {
match writable_map.remove(path) {
Some(_) => {
Ok(())
},
None => {
Err(FileError::FileDoesNotExist(PathBuf::from(path)))
}
}
.and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
}
@@ -129,16 +134,14 @@ impl InnerDirectory {
.expect("Failed to get read lock directory.")
.contains_key(path)
}
}
impl fmt::Debug for RAMDirectory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "RAMDirectory")
}
}
/// A Directory storing everything in anonymous memory.
///
/// It is mainly meant for unit testing.
@@ -150,52 +153,58 @@ pub struct RAMDirectory {
}
impl RAMDirectory {
/// Constructor
pub fn create() -> RAMDirectory {
RAMDirectory {
fs: InnerDirectory::new()
fs: InnerDirectory::new(),
}
}
}
impl Directory for RAMDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, FileError> {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
self.fs.open_read(path)
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
let exists = self.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
// force the creation of the file to mimic the MMap directory.
if try!(self.fs.write(path_buf.clone(), &Vec::new())) {
if exists {
Err(OpenWriteError::FileAlreadyExists(path_buf))
}
else {
} else {
Ok(BufWriter::new(Box::new(vec_writer)))
}
}
fn delete(&self, path: &Path) -> result::Result<(), FileError> {
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.fs.delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.fs.exists(path)
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
let read = self.open_read(path)?;
Ok(read.as_slice().to_owned())
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
let path_buf = PathBuf::from(path);
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
try!(self.fs.write(path_buf, &Vec::new()));
try!(vec_writer.write_all(data));
try!(vec_writer.flush());
self.fs.write(path_buf, &Vec::new())?;
vec_writer.write_all(data)?;
vec_writer.flush()?;
Ok(())
}
fn box_clone(&self,) -> Box<Directory> {
fn box_clone(&self) -> Box<Directory> {
Box::new(self.clone())
}
}


@@ -1,24 +1,34 @@
#[cfg(feature="mmap")]
use fst::raw::MmapReadOnly;
use std::ops::Deref;
use super::shared_vec_slice::SharedVecSlice;
use common::HasLen;
use std::slice;
use std::io::{self, Read};
use stable_deref_trait::{CloneStableDeref, StableDeref};
const EMPTY_SLICE: [u8; 0] = [];
/// Read object that represents files in tantivy.
///
/// These read objects are only in charge of delivering
/// the data in the form of a constant read-only `&[u8]`.
/// Whatever happens to the directory file, the data
/// held by this object should never be altered or destroyed.
pub enum ReadOnlySource {
/// Mmap source of data
#[cfg(feature="mmap")]
Mmap(MmapReadOnly),
/// Wrapping a `Vec<u8>`
Anonymous(SharedVecSlice),
/// Wrapping a static slice
Static(&'static [u8])
}
unsafe impl StableDeref for ReadOnlySource {}
unsafe impl CloneStableDeref for ReadOnlySource {}
impl Deref for ReadOnlySource {
type Target = [u8];
fn deref(&self) -> &[u8] {
@@ -27,49 +37,76 @@ impl Deref for ReadOnlySource {
}
impl ReadOnlySource {
/// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource {
ReadOnlySource::Anonymous(SharedVecSlice::empty())
ReadOnlySource::Static(&EMPTY_SLICE)
}
/// Returns the data underlying the ReadOnlySource object.
pub fn as_slice(&self,) -> &[u8] {
pub fn as_slice(&self) -> &[u8] {
match *self {
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe {
mmap_read_only.as_slice()
},
ReadOnlySource::Anonymous(ref shared_vec) => {
shared_vec.as_slice()
},
#[cfg(feature="mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
ReadOnlySource::Static(data) => data,
}
}
/// Creates a ReadOnlySource that is just a
/// Splits into 2 `ReadOnlySource`, at the offset given
/// as an argument.
pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) {
let left = self.slice(0, addr);
let right = self.slice_from(addr);
(left, right)
}
/// Creates a ReadOnlySource that is just a
/// view over a slice of the data.
///
/// Keep in mind that any living slice extends
/// the lifetime of the original ReadOnlySource.
///
/// For instance, if `ReadOnlySource` wraps 500MB
/// worth of data in anonymous memory, and only a
/// 1KB slice is remaining, the whole `500MB`
/// is retained in memory.
pub fn slice(&self, from_offset:usize, to_offset:usize) -> ReadOnlySource {
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
assert!(from_offset <= to_offset, "Requested negative slice [{}..{}]", from_offset, to_offset);
match *self {
#[cfg(feature="mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => {
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
ReadOnlySource::Mmap(sliced_mmap)
}
ReadOnlySource::Anonymous(ref shared_vec) => {
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
},
}
ReadOnlySource::Static(data) => {
ReadOnlySource::Static(&data[from_offset..to_offset])
}
}
}
/// Like `.slice(...)` but enforcing only the `from`
/// boundary.
///
/// Equivalent to `.slice(from_offset, self.len())`
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
let len = self.len();
self.slice(from_offset, len)
}
/// Like `.slice(...)` but enforcing only the `to`
/// boundary.
///
/// Equivalent to `.slice(0, to_offset)`
pub fn slice_to(&self, to_offset: usize) -> ReadOnlySource {
self.slice(0, to_offset)
}
}
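
A small sketch of the slicing semantics described above, using the `From<Vec<u8>>` conversion added later in this file:

```rust
use tantivy::directory::ReadOnlySource;

fn main() {
    let source = ReadOnlySource::from(vec![0u8, 1, 2, 3, 4, 5, 6, 7]);
    let (left, right) = source.clone().split(4);
    assert_eq!(left.as_slice(), &[0u8, 1, 2, 3]);
    assert_eq!(right.as_slice(), &[4u8, 5, 6, 7]);
    // The 4-byte view below still keeps the whole 8-byte buffer alive,
    // which is exactly the retention behavior the doc comment warns about.
    let middle = source.slice(2, 6);
    assert_eq!(middle.as_slice(), &[2u8, 3, 4, 5]);
}
```
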
impl HasLen for ReadOnlySource {
fn len(&self,) -> usize {
fn len(&self) -> usize {
self.as_slice().len()
}
}
@@ -79,3 +116,64 @@ impl Clone for ReadOnlySource {
self.slice(0, self.len())
}
}
impl From<Vec<u8>> for ReadOnlySource {
fn from(data: Vec<u8>) -> ReadOnlySource {
let shared_data = SharedVecSlice::from(data);
ReadOnlySource::Anonymous(shared_data)
}
}
impl From<&'static [u8]> for ReadOnlySource {
fn from(data: &'static [u8]) -> ReadOnlySource {
ReadOnlySource::Static(data)
}
}
/// Acts as an owning cursor over the data backed by a `ReadOnlySource`
pub(crate) struct SourceRead {
_data_owner: ReadOnlySource,
cursor: &'static [u8],
}
impl SourceRead {
// Advance the cursor by a given number of bytes.
pub fn advance(&mut self, len: usize) {
self.cursor = &self.cursor[len..];
}
pub fn slice_from(&self, start: usize) -> &[u8] {
&self.cursor[start..]
}
pub fn get(&self, idx: usize) -> u8 {
self.cursor[idx]
}
}
impl AsRef<[u8]> for SourceRead {
fn as_ref(&self) -> &[u8] {
self.cursor
}
}
impl From<ReadOnlySource> for SourceRead {
// Creates a new `SourceRead` from a given `ReadOnlySource`
fn from(source: ReadOnlySource) -> SourceRead {
let len = source.len();
let slice_ptr = source.as_slice().as_ptr();
let static_slice = unsafe { slice::from_raw_parts(slice_ptr, len) };
SourceRead {
_data_owner: source,
cursor: static_slice,
}
}
}
impl Read for SourceRead {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
self.cursor.read(buf)
}
}
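
The `&'static` trick above is sound only because `ReadOnlySource` has a stable deref: the heap data does not move when the source value itself moves. A hypothetical standalone reproduction of the same pattern (an illustrative sketch, not tantivy code):

```rust
use std::slice;

struct OwningCursor {
    _owner: Vec<u8>,       // keeps the heap allocation alive
    cursor: &'static [u8], // points into `_owner`'s (stable) heap buffer
}

impl OwningCursor {
    fn new(owner: Vec<u8>) -> OwningCursor {
        // Sound in practice because a Vec's heap buffer does not move
        // when the Vec value itself is moved, and we never mutate it.
        let static_slice = unsafe { slice::from_raw_parts(owner.as_ptr(), owner.len()) };
        OwningCursor {
            _owner: owner,
            cursor: static_slice,
        }
    }
}

fn main() {
    let mut cursor = OwningCursor::new(vec![1u8, 2, 3, 4]);
    cursor.cursor = &cursor.cursor[1..]; // advance, like SourceRead::advance
    assert_eq!(cursor.cursor, &[2u8, 3, 4]);
}
```
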


@@ -1,15 +1,13 @@
use std::sync::Arc;
#[derive(Clone)]
pub struct SharedVecSlice {
pub data: Arc<Vec<u8>>,
pub start: usize,
pub len: usize
pub len: usize,
}
impl SharedVecSlice {
pub fn empty() -> SharedVecSlice {
SharedVecSlice::new(Arc::new(Vec::new()))
}
@@ -17,21 +15,27 @@ impl SharedVecSlice {
pub fn new(data: Arc<Vec<u8>>) -> SharedVecSlice {
let data_len = data.len();
SharedVecSlice {
data: data,
data,
start: 0,
len: data_len,
}
}
pub fn as_slice(&self,) -> &[u8] {
pub fn as_slice(&self) -> &[u8] {
&self.data[self.start..self.start + self.len]
}
pub fn slice(&self, from_offset: usize, to_offset:usize) -> SharedVecSlice {
pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice {
SharedVecSlice {
data: self.data.clone(),
data: Arc::clone(&self.data),
start: self.start + from_offset,
len: to_offset - from_offset,
}
}
}
impl From<Vec<u8>> for SharedVecSlice {
fn from(data: Vec<u8>) -> SharedVecSlice {
SharedVecSlice::new(Arc::new(data))
}
}


@@ -0,0 +1,123 @@
use std::collections::HashMap;
use Directory;
use std::path::PathBuf;
use directory::ReadOnlySource;
use std::io::BufWriter;
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use std::path::Path;
use std::fmt::{Formatter, Debug, self};
use Result as TantivyResult;
use directory::SeekableWrite;
use std::io;
use std::fs;
use common::Endianness;
use common::BinarySerializable;
use common::VInt;
use byteorder::ByteOrder;
use std::str;
use std::fs::File;
use std::io::{Read, Write};
use std::ffi::OsString;
#[derive(Clone)]
pub struct StaticDirectory {
files: HashMap<PathBuf, &'static [u8]>,
}
impl Debug for StaticDirectory {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "StaticDirectory[{} files]", self.files.len())?;
Ok(())
}
}
impl StaticDirectory {
pub fn open(mut data: &'static [u8]) -> TantivyResult<StaticDirectory> {
assert!(data.len() > 8);
let footer_len_offset = data.len() - 8;
let body_len = Endianness::read_u64(&data[footer_len_offset..]) as usize;
let mut body = &data[..body_len];
let mut footer = &data[body_len..footer_len_offset];
let num_files = VInt::deserialize(&mut footer)?.0 as usize;
let mut files = HashMap::new();
for _ in 0..num_files {
let filename_len = VInt::deserialize(&mut footer)?.0 as usize;
let filename = &footer[..filename_len];
footer = &footer[filename_len..];
let data_len = VInt::deserialize(&mut footer)?.0 as usize;
let file_data = &body[..data_len];
body = &body[data_len..];
let filename_str = str::from_utf8(filename).expect("Invalid UTF8");
let filename = PathBuf::from(filename_str);
println!("{:?} {:?}", filename, data_len);
files.insert(filename, file_data);
}
Ok(StaticDirectory {
files
})
}
}
impl Directory for StaticDirectory {
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
if let Some(static_data) = self.files.get(path) {
Ok(ReadOnlySource::from(*static_data))
} else {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
}
}
fn delete(&self, path: &Path) -> Result<(), DeleteError> {
unimplemented!("Static directory is read-only !")
}
fn exists(&self, path: &Path) -> bool {
self.files.contains_key(path)
}
fn open_write(&mut self, path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
unimplemented!("Static directory is read-only !")
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
if let Some(static_data) = self.files.get(path) {
Ok(static_data.to_vec())
} else {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
}
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
unimplemented!("Static directory is read-only !")
}
fn box_clone(&self) -> Box<Directory> {
box self.clone()
}
}
pub fn write_static_from_directory(directory_path: &Path) -> TantivyResult<Vec<u8>> {
assert!(directory_path.is_dir());
let mut file_data: Vec<(OsString, usize)> = Vec::new();
let mut write: Vec<u8> = Vec::new();
for entry in fs::read_dir(directory_path)? {
let entry = entry?;
let path = entry.path();
if path.is_file() {
info!("Appending {}", path.to_string_lossy());
let mut open_file = File::open(&path)?;
let file_len = open_file.read_to_end(&mut write)?;
file_data.push((entry.file_name(), file_len));
}
}
// write footer
let body_len = write.len();
VInt(file_data.len() as u64).serialize(&mut write)?;
for (filename, filelen) in file_data {
VInt(filename.len() as u64).serialize(&mut write)?;
write.write_all(filename.to_string_lossy().as_bytes())?;
VInt(filelen as u64).serialize(&mut write)?;
}
(body_len as u64).serialize(&mut write)?;
Ok(write)
}
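
The layout produced here is: all file bodies concatenated, then a footer (a `VInt` file count, then per file a `VInt` name length, the name bytes, and a `VInt` body length), then the body length as a trailing `u64`. `StaticDirectory::open` above walks the same layout in reverse. A hedged round-trip sketch; `open_static` and the deliberate leak are illustrative, not part of the crate:

```rust
use std::path::Path;

fn open_static(dir: &Path) -> tantivy::Result<StaticDirectory> {
    let bytes = write_static_from_directory(dir)?;
    // Leaking the buffer is one way to obtain the `&'static [u8]` that
    // `StaticDirectory::open` expects (e.g. data baked into a wasm binary).
    let static_bytes: &'static [u8] = Box::leak(bytes.into_boxed_slice());
    StaticDirectory::open(static_bytes)
}
```
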


@@ -2,7 +2,7 @@ use DocId;
use std::borrow::Borrow;
use std::borrow::BorrowMut;
use std::cmp::Ordering;
use common::BitSet;
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
#[derive(PartialEq, Eq, Debug)]
@@ -16,7 +16,6 @@ pub enum SkipResult {
End,
}
/// Represents an iterable set of sorted doc ids.
pub trait DocSet {
/// Goes to the next element.
@@ -35,6 +34,9 @@ pub trait DocSet {
/// More specifically, if the docset is already positioned on the target,
/// skipping will advance to the next position and return `SkipResult::Overstep`.
///
/// If `.skip_next()` oversteps, then the docset must be positioned correctly
/// on an existing document. In other words, `.doc()` should return the first document
/// greater than the target `DocId`.
fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() {
return SkipResult::End;
@@ -52,19 +54,56 @@ pub trait DocSet {
}
}
/// Fills a given mutable buffer with the next doc ids from the
/// `DocSet`.
///
/// If that many `DocId`s are available, the method should
/// fill the entire buffer and return the length of the buffer.
///
/// If we reach the end of the `DocSet` before filling
/// it entirely, then the buffer is filled up to that point, and the
/// return value is the number of elements that were filled.
///
/// # Warning
///
/// This method is only here for specific high-performance
/// use cases where batching helps. The normal way to
/// go through the `DocId`s is to call `.advance()`.
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
for (i, buffer_val) in buffer.iter_mut().enumerate() {
if self.advance() {
*buffer_val = self.doc();
} else {
return i;
}
}
buffer.len()
}
/// Returns the current document
fn doc(&self) -> DocId;
/// Advances the cursor to the next document
/// `None` is returned if the `DocSet`
/// has already been entirely consumed.
fn next(&mut self) -> Option<DocId> {
if self.advance() {
Some(self.doc())
} else {
None
/// Returns a best-effort hint of the
/// length of the docset.
fn size_hint(&self) -> u32;
/// Appends all docs to a `bitset`.
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
while self.advance() {
bitset.insert(self.doc());
}
}
/// Returns the number of matching documents.
///
/// Calling this method consumes the `DocSet`.
fn count(&mut self) -> u32 {
let mut count = 0u32;
while self.advance() {
count += 1u32;
}
count
}
}
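
To illustrate the `skip_next` contract, here is a toy `DocSet` over a sorted `Vec<DocId>` relying on the default `skip_next` (a sketch for illustration only, not tantivy code; it assumes the `SkipResult::Reached` variant elided from the hunk above):

```rust
struct VecDocSet {
    docs: Vec<DocId>,
    cursor: usize, // 0 means "before the first document"
}

impl DocSet for VecDocSet {
    fn advance(&mut self) -> bool {
        if self.cursor < self.docs.len() {
            self.cursor += 1;
            true
        } else {
            false
        }
    }
    fn doc(&self) -> DocId {
        self.docs[self.cursor - 1]
    }
    fn size_hint(&self) -> u32 {
        self.docs.len() as u32
    }
}

#[test]
fn skip_next_contract() {
    let mut docset = VecDocSet { docs: vec![1, 3, 5, 9], cursor: 0 };
    // Lands exactly on the target.
    assert_eq!(docset.skip_next(5), SkipResult::Reached);
    // 6 is absent: the docset oversteps and is positioned on 9.
    assert_eq!(docset.skip_next(6), SkipResult::Overstep);
    assert_eq!(docset.doc(), 9);
}
```
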
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
@@ -82,21 +121,19 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
let unboxed: &TDocSet = self.borrow();
unboxed.doc()
}
}
impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
fn advance(&mut self) -> bool {
let unref: &mut TDocSet = *self;
unref.advance()
fn size_hint(&self) -> u32 {
let unboxed: &TDocSet = self.borrow();
unboxed.size_hint()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
let unref: &mut TDocSet = *self;
unref.skip_next(target)
fn count(&mut self) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count()
}
fn doc(&self) -> DocId {
let unref: &TDocSet = *self;
unref.doc()
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.append_to_bitset(bitset);
}
}


@@ -1,95 +1,138 @@
#![allow(enum_variant_names)]
/// Definition of Tantivy's error and result.
//! Definition of Tantivy's error and result.
use std::io;
use std::result;
use std::path::PathBuf;
use std::error;
use std::sync::PoisonError;
use directory::error::{FileError, OpenWriteError, OpenDirectoryError};
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use query;
use schema;
use fastfield::FastFieldNotAvailableError;
use serde_json;
error_chain!(
errors {
/// Path does not exist.
PathDoesNotExist(buf: PathBuf) {
description("path does not exist")
display("path does not exist: '{:?}'", buf)
}
/// File already exists, this is a problem when we try to write into a new file.
FileAlreadyExists(buf: PathBuf) {
description("file already exists")
display("file already exists: '{:?}'", buf)
}
/// IO Error.
IOError(err: IOError) {
description("an IO error occurred")
display("an IO error occurred: '{}'", err)
}
/// The data within is corrupted.
///
/// For instance, it contains invalid JSON.
CorruptedFile(buf: PathBuf) {
description("file contains corrupted data")
display("file contains corrupted data: '{:?}'", buf)
}
/// A thread holding the lock panicked and poisoned the lock.
Poisoned {
description("a thread holding the lock panicked and poisoned the lock")
}
/// Invalid argument was passed by the user.
InvalidArgument(arg: String) {
description("an invalid argument was passed")
display("an invalid argument was passed: '{}'", arg)
}
/// An error happened in one of the threads.
ErrorInThread(err: String) {
description("an error occurred in a thread")
display("an error occurred in a thread: '{}'", err)
}
/// An error related to the lack of a field.
SchemaError(field: String) {
description("a schema field is missing")
display("a schema field is missing: '{}'", field)
}
/// Tried to access a fastfield reader for a field not configured accordingly.
FastFieldError(err: FastFieldNotAvailableError) {
description("fast field not available")
display("fast field not available: '{:?}'", err)
}
}
);
/// Tantivy result.
pub type Result<T> = result::Result<T, Error>;
impl From<FastFieldNotAvailableError> for Error {
fn from(fastfield_error: FastFieldNotAvailableError) -> Error {
ErrorKind::FastFieldError(fastfield_error).into()
}
}
/// Generic tantivy error.
///
/// Any specialized error returned in tantivy can be converted into `tantivy::Error`.
#[derive(Debug)]
pub enum Error {
/// Path does not exist.
PathDoesNotExist(PathBuf),
/// File already exists, this is a problem when we try to write into a new file.
FileAlreadyExists(PathBuf),
/// IO Error
IOError(io::Error),
/// A thread holding the lock panicked and poisoned the lock.
Poisoned,
/// The data within is corrupted.
///
/// For instance, it contains invalid JSON.
CorruptedFile(PathBuf, Box<error::Error + Send>),
/// Invalid argument was passed by the user.
InvalidArgument(String),
/// An error happened in one of the threads
ErrorInThread(String), // TODO investigate better solution
impl From<IOError> for Error {
fn from(io_error: IOError) -> Error {
ErrorKind::IOError(io_error).into()
}
}
impl From<io::Error> for Error {
fn from(io_error: io::Error) -> Error {
Error::IOError(io_error)
ErrorKind::IOError(io_error.into()).into()
}
}
impl From<query::QueryParserError> for Error {
fn from(parsing_error: query::QueryParserError) -> Error {
Error::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
ErrorKind::InvalidArgument(format!("Query is invalid. {:?}", parsing_error)).into()
}
}
impl<Guard> From<PoisonError<Guard>> for Error {
fn from(_: PoisonError<Guard>) -> Error {
Error::Poisoned
ErrorKind::Poisoned.into()
}
}
impl From<FileError> for Error {
fn from(error: FileError) -> Error {
impl From<OpenReadError> for Error {
fn from(error: OpenReadError) -> Error {
match error {
FileError::FileDoesNotExist(filepath) => Error::PathDoesNotExist(filepath),
FileError::IOError(io_error) => Error::IOError(io_error),
OpenReadError::FileDoesNotExist(filepath) => {
ErrorKind::PathDoesNotExist(filepath).into()
}
OpenReadError::IOError(io_error) => ErrorKind::IOError(io_error).into(),
}
}
}
impl From<schema::DocParsingError> for Error {
fn from(error: schema::DocParsingError) -> Error {
Error::InvalidArgument(format!("Failed to parse document {:?}", error))
ErrorKind::InvalidArgument(format!("Failed to parse document {:?}", error)).into()
}
}
impl From<OpenWriteError> for Error {
fn from(error: OpenWriteError) -> Error {
match error {
OpenWriteError::FileAlreadyExists(filepath) =>
Error::FileAlreadyExists(filepath),
OpenWriteError::IOError(io_error) =>
Error::IOError(io_error),
}
OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath),
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
}.into()
}
}
impl From<OpenDirectoryError> for Error {
fn from(error: OpenDirectoryError) -> Error {
match error {
OpenDirectoryError::DoesNotExist(directory_path) =>
Error::PathDoesNotExist(directory_path),
OpenDirectoryError::NotADirectory(directory_path) =>
Error::InvalidArgument(format!("{:?} is not a directory", directory_path)),
OpenDirectoryError::DoesNotExist(directory_path) => {
ErrorKind::PathDoesNotExist(directory_path).into()
}
OpenDirectoryError::NotADirectory(directory_path) => ErrorKind::InvalidArgument(
format!("{:?} is not a directory", directory_path),
).into(),
}
}
}
impl From<serde_json::Error> for Error {
fn from(error: serde_json::Error) -> Error {
let io_err = io::Error::from(error);
ErrorKind::IOError(io_err.into()).into()
}
}
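
Taken together, these `From` impls are what let call sites bubble heterogeneous failures up with a single `?`. A hedged sketch of a hypothetical call site:

```rust
// `open_read` yields an OpenReadError; the `?` converts it into
// tantivy's Error through the From impl above.
fn read_file(directory: &Directory, path: &Path) -> Result<Vec<u8>> {
    let source = directory.open_read(path)?;
    Ok(source.as_slice().to_owned())
}
```
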

src/fastfield/delete.rs

@@ -0,0 +1,118 @@
use bit_set::BitSet;
use directory::WritePtr;
use std::io::Write;
use std::io;
use directory::ReadOnlySource;
use DocId;
use common::HasLen;
/// Write a delete `BitSet`
///
/// where `delete_bitset` is the set of deleted `DocId`s.
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
let max_doc = delete_bitset.capacity();
let mut byte = 0u8;
let mut shift = 0u8;
for doc in 0..max_doc {
if delete_bitset.contains(doc) {
byte |= 1 << shift;
}
if shift == 7 {
writer.write_all(&[byte])?;
shift = 0;
byte = 0;
} else {
shift += 1;
}
}
if max_doc % 8 > 0 {
writer.write_all(&[byte])?;
}
writer.flush()
}
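
A worked example of the layout (a sketch; the writer setup is elided): with `max_doc = 10` and documents 1 and 9 deleted, the function emits two bytes.

```rust
// byte 0 covers docs 0..=7: only bit 1 (doc 1) is set -> 0b0000_0010
// byte 1 covers docs 8..=9: doc 9 maps to bit (9 & 7) = 1 -> 0b0000_0010
let mut bitset = BitSet::with_capacity(10);
bitset.insert(1);
bitset.insert(9);
// write_delete_bitset(&bitset, &mut writer) then writes [2u8, 2u8].
```
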
/// Set of deleted `DocId`s.
#[derive(Clone)]
pub struct DeleteBitSet {
data: ReadOnlySource,
len: usize,
}
impl DeleteBitSet {
/// Opens a delete bitset given its data source.
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
let num_deleted: usize = data.as_slice()
.iter()
.map(|b| b.count_ones() as usize)
.sum();
DeleteBitSet {
data,
len: num_deleted,
}
}
/// Returns whether the document has been marked as deleted.
pub fn is_deleted(&self, doc: DocId) -> bool {
if self.len == 0 {
false
} else {
let byte_offset = doc / 8u32;
let b: u8 = (*self.data)[byte_offset as usize];
let shift = (doc & 7u32) as u8;
b & (1u8 << shift) != 0
}
}
}
impl HasLen for DeleteBitSet {
fn len(&self) -> usize {
self.len
}
}
#[cfg(test)]
mod tests {
use std::path::PathBuf;
use bit_set::BitSet;
use directory::*;
use super::*;
fn test_delete_bitset_helper(bitset: &BitSet) {
let test_path = PathBuf::from("test");
let mut directory = RAMDirectory::create();
{
let mut writer = directory.open_write(&*test_path).unwrap();
write_delete_bitset(bitset, &mut writer).unwrap();
}
{
let source = directory.open_read(&test_path).unwrap();
let delete_bitset = DeleteBitSet::open(source);
let n = bitset.capacity();
for doc in 0..n {
assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
}
assert_eq!(delete_bitset.len(), bitset.len());
}
}
#[test]
fn test_delete_bitset() {
{
let mut bitset = BitSet::with_capacity(10);
bitset.insert(1);
bitset.insert(9);
test_delete_bitset_helper(&bitset);
}
{
let mut bitset = BitSet::with_capacity(8);
bitset.insert(1);
bitset.insert(2);
bitset.insert(3);
bitset.insert(5);
bitset.insert(7);
test_delete_bitset_helper(&bitset);
}
}
}

src/fastfield/error.rs

@@ -0,0 +1,24 @@
use std::result;
use schema::FieldEntry;
/// `FastFieldNotAvailableError` is returned when the
/// user requested for a fast field reader, and the field was not
/// defined in the schema as a fast field.
#[derive(Debug)]
pub struct FastFieldNotAvailableError {
field_name: String,
}
impl FastFieldNotAvailableError {
/// Creates a `FastFieldNotAvailable` error.
/// `field_entry` is the configuration of the field
/// for which fast fields are not available.
pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError {
FastFieldNotAvailableError {
field_name: field_entry.name().to_string(),
}
}
}
/// Result when trying to access a fast field reader.
pub type Result<R> = result::Result<R, FastFieldNotAvailableError>;


@@ -0,0 +1,68 @@
use super::MultiValueIntFastFieldReader;
use DocId;
use termdict::TermOrdinal;
use schema::Facet;
use termdict::{TermDictionary, TermDictionaryImpl};
/// The facet reader makes it possible to access the list of
/// facets associated to a given document in a specific
/// segment.
///
/// Rather than manipulating `Facet` objects directly, the API
/// exposes those in the form of list of `Facet` ordinal.
///
/// A segment ordinal can then be translated into a facet via
/// `.facet_from_ord(...)`.
///
/// Facet ordinals are defined as their position in the sorted
/// list of facets. This ordinal is segment local and
/// only makes sense for a given segment.
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionaryImpl,
}
impl FacetReader {
/// Creates a new `FacetReader`.
///
/// A facet reader just wraps:
/// - a `MultiValueIntFastFieldReader` that makes it possible to
/// access the list of facet ords for a given document.
/// - a `TermDictionaryImpl` that helps associating a facet to
/// an ordinal and vice versa.
pub fn new(
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionaryImpl,
) -> FacetReader {
FacetReader {
term_ords,
term_dict,
}
}
/// Returns the size of the set of facets in the segment.
/// This does not take into account the documents that may be marked
/// as deleted.
///
/// `Facet` ordinals range from `0` to `num_facets() - 1`.
pub fn num_facets(&self) -> usize {
self.term_dict.num_terms()
}
/// Accessor for the facet term dictionary.
pub fn facet_dict(&self) -> &TermDictionaryImpl {
&self.term_dict
}
/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
let found_term = self.term_dict
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
assert!(found_term, "Term ordinal {} not found.", facet_ord);
}
/// Return the list of facet ordinals associated to a document.
pub fn facet_ords(&mut self, doc: DocId, output: &mut Vec<u64>) {
self.term_ords.get_vals(doc, output);
}
}


@@ -1,64 +1,159 @@
/// Fast field module
///
/// Fast fields are the equivalent of `DocValues` in `Lucene`.
/// Fast fields are stored in column-oriented fashion and allow fast
/// random access given a `DocId`.
///
/// Their performance is comparable to that of an array lookup.
/// They are useful when a field is required for all or most of
/// the `DocSet`: for instance for scoring, grouping, filtering, or faceting.
///
/// Currently only u32 fastfields are supported.
/*!
Column oriented field storage for tantivy.
It is the equivalent of `Lucene`'s `DocValues`.
Fast fields are the column-oriented storage of `tantivy`.
They are designed for fast random access to some document
fields given a document id.
`FastField`s are useful when a field is required for all or most of
the `DocSet`: for instance for scoring, grouping, filtering, or faceting.
Fields have to be declared as `FAST` in the schema.
Currently only 64-bit integers (signed or unsigned) are
supported.
They are stored in a bit-packed fashion so that their
memory usage is directly linear with the amplitude of the
values stored.
Read access performance is comparable to that of an array lookup.
*/
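
Since values are bit-packed against the field's amplitude, the storage cost per value is easy to estimate. A back-of-the-envelope sketch; `num_bits` here is a hypothetical stand-in for the crate's `compute_num_bits`:

```rust
// A u64 field whose values all fall in 1_000..=9_999 has
// min_value = 1_000 and amplitude = 8_999, so each value costs
// ceil(log2(8_999 + 1)) = 14 bits instead of 64.
fn num_bits(amplitude: u64) -> u32 {
    64 - amplitude.leading_zeros()
}

fn main() {
    assert_eq!(num_bits(8_999), 14);
    assert_eq!(num_bits(0), 0); // a constant field costs 0 bits per value
}
```
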
use common;
use schema::Cardinality;
use schema::FieldType;
use schema::Value;
pub use self::delete::DeleteBitSet;
pub use self::delete::write_delete_bitset;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::MultiValueIntFastFieldReader;
pub use self::reader::FastFieldReader;
pub use self::serializer::FastFieldSerializer;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
mod reader;
mod writer;
mod serializer;
mod error;
mod delete;
mod facet_reader;
mod multivalued;
pub use self::writer::{U32FastFieldsWriter, U32FastFieldWriter};
pub use self::reader::{U32FastFieldsReader, U32FastFieldReader};
pub use self::serializer::FastFieldSerializer;
/// Trait for types that are allowed for fast fields (u64 or i64).
pub trait FastValue: Default + Clone + Copy {
/// Converts a value from u64
///
/// Internally all fast field values are encoded as u64.
fn from_u64(val: u64) -> Self;
/// Converts a value to u64.
///
/// Internally all fast field values are encoded as u64.
fn to_u64(&self) -> u64;
/// Returns the fast field cardinality that can be extracted from the given
/// `FieldType`.
///
/// If the type is not a fast field, `None` is returned.
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality>;
/// Cast value to `u64`.
/// The value is just reinterpreted in memory.
fn as_u64(&self) -> u64;
}
impl FastValue for u64 {
fn from_u64(val: u64) -> Self {
val
}
fn to_u64(&self) -> u64 {
*self
}
fn as_u64(&self) -> u64 {
*self
}
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
match *field_type {
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
FieldType::HierarchicalFacet => Some(Cardinality::MultiValues),
_ => None,
}
}
}
impl FastValue for i64 {
fn from_u64(val: u64) -> Self {
common::u64_to_i64(val)
}
fn to_u64(&self) -> u64 {
common::i64_to_u64(*self)
}
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
match *field_type {
FieldType::I64(ref integer_options) => integer_options.get_fastfield_cardinality(),
_ => None,
}
}
fn as_u64(&self) -> u64 {
*self as u64
}
}
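
The `i64` support relies on an order-preserving mapping to `u64`. The property it needs can be illustrated with a sign-bit flip, which is typically how a function like `common::i64_to_u64` is implemented (a hedged illustration, not the crate's actual code):

```rust
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63) // flip the sign bit
}

fn main() {
    // Order is preserved across the signed/unsigned boundary ...
    assert!(i64_to_u64(-1) < i64_to_u64(0));
    assert!(i64_to_u64(0) < i64_to_u64(1));
    // ... and the smallest i64 maps to 0, so bit-packing against the
    // amplitude works exactly as in the unsigned case.
    assert_eq!(i64_to_u64(i64::min_value()), 0u64);
}
```
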
fn value_to_u64(value: &Value) -> u64 {
match *value {
Value::U64(ref val) => *val,
Value::I64(ref val) => common::i64_to_u64(*val),
_ => panic!("Expected a u64/i64 field, got {:?} ", value),
}
}
#[cfg(test)]
mod tests {
use super::*;
use schema::Field;
use std::path::Path;
use directory::{Directory, WritePtr, RAMDirectory};
use schema::Document;
use schema::{Schema, SchemaBuilder};
use schema::FAST;
use test::Bencher;
use test;
use common::CompositeFile;
use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::FastFieldReader;
use rand::Rng;
use rand::SeedableRng;
use rand::XorShiftRng;
use schema::{Schema, SchemaBuilder};
use schema::Document;
use schema::FAST;
use schema::Field;
use std::collections::HashMap;
use std::path::Path;
use super::*;
use test;
use test::Bencher;
lazy_static! {
static ref SCHEMA: Schema = {
let mut schema_builder = SchemaBuilder::default();
schema_builder.add_u32_field("field", FAST);
schema_builder.add_u64_field("field", FAST);
schema_builder.build()
};
static ref FIELD: Field = {
SCHEMA.get_field("field").unwrap()
};
}
fn add_single_field_doc(fast_field_writers: &mut U32FastFieldsWriter, field: Field, value: u32) {
let mut doc = Document::default();
doc.add_u32(field, value);
fast_field_writers.add_document(&doc);
}
#[test]
pub fn test_fastfield() {
let test_fastfield = U32FastFieldReader::from(vec!(100,200,300));
println!("{}", test_fastfield.get(0));
println!("{}", test_fastfield.get(1));
println!("{}", test_fastfield.get(2));
let test_fastfield = FastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);
}
#[test]
@@ -67,24 +162,27 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 2u32);
fast_field_writers.serialize(&mut serializer).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers.add_document(&doc!(*FIELD=>13u64));
fast_field_writers.add_document(&doc!(*FIELD=>14u64));
fast_field_writers.add_document(&doc!(*FIELD=>2u64));
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 20 as usize);
assert_eq!(source.len(), 36 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
assert_eq!(fast_field_reader.get(0), 13u32);
assert_eq!(fast_field_reader.get(1), 14u32);
assert_eq!(fast_field_reader.get(2), 2u32);
let composite_file = CompositeFile::open(&source).unwrap();
let field_source = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(field_source);
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
}
}
@@ -94,72 +192,187 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 777u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 215u32);
fast_field_writers.serialize(&mut serializer).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers.add_document(&doc!(*FIELD=>4u64));
fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
fast_field_writers.add_document(&doc!(*FIELD=>3_052u64));
fast_field_writers.add_document(&doc!(*FIELD=>9_002u64));
fast_field_writers.add_document(&doc!(*FIELD=>15_001u64));
fast_field_writers.add_document(&doc!(*FIELD=>777u64));
fast_field_writers.add_document(&doc!(*FIELD=>1_002u64));
fast_field_writers.add_document(&doc!(*FIELD=>1_501u64));
fast_field_writers.add_document(&doc!(*FIELD=>215u64));
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 45 as usize);
assert_eq!(source.len(), 61 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
assert_eq!(fast_field_reader.get(0), 4u32);
assert_eq!(fast_field_reader.get(1), 14_082_001u32);
assert_eq!(fast_field_reader.get(2), 3_052u32);
assert_eq!(fast_field_reader.get(3), 9002u32);
assert_eq!(fast_field_reader.get(4), 15_001u32);
assert_eq!(fast_field_reader.get(5), 777u32);
assert_eq!(fast_field_reader.get(6), 1_002u32);
assert_eq!(fast_field_reader.get(7), 1_501u32);
assert_eq!(fast_field_reader.get(8), 215u32);
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
assert_eq!(fast_field_reader.get(3), 9002u64);
assert_eq!(fast_field_reader.get(4), 15_001u64);
assert_eq!(fast_field_reader.get(5), 777u64);
assert_eq!(fast_field_reader.get(6), 1_002u64);
assert_eq!(fast_field_reader.get(7), 1_501u64);
assert_eq!(fast_field_reader.get(8), 215u64);
}
}
#[test]
fn test_intfastfield_null_amplitude() {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for _ in 0..10_000 {
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u32);
fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
}
fast_field_writers.serialize(&mut serializer).unwrap();
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 18 as usize);
assert_eq!(source.len(), 34 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u32);
assert_eq!(fast_field_reader.get(doc), 100_000u64);
}
}
}
fn generate_permutation() -> Vec<u32> {
#[test]
fn test_intfastfield_large_numbers() {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
// forcing the amplitude to be high
fast_field_writers.add_document(&doc!(*FIELD=>0u64));
for i in 0u64..10_000u64 {
fast_field_writers.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + i));
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 80042 as usize);
}
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(
fast_field_reader.get(doc),
5_000_000_000_000_000_000u64 + doc as u64 - 1u64
);
}
}
}
#[test]
fn test_signed_intfastfield() {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
let mut schema_builder = SchemaBuilder::new();
let i64_field = schema_builder.add_i64_field("field", FAST);
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for i in -100i64..10_000i64 {
let mut doc = Document::default();
doc.add_i64(i64_field, i);
fast_field_writers.add_document(&doc);
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 17709 as usize);
}
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = FastFieldReader::<i64>::open(data);
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() {
assert_eq!(fast_field_reader.get(doc as u32), i);
}
let mut buffer = vec![0i64; 100];
fast_field_reader.get_range(53, &mut buffer[..]);
for i in 0..100 {
assert_eq!(buffer[i], -100i64 + 53i64 + i as i64);
}
}
}
#[test]
fn test_signed_intfastfield_default_val() {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
let mut schema_builder = SchemaBuilder::new();
let i64_field = schema_builder.add_i64_field("field", FAST);
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let doc = Document::default();
fast_field_writers.add_document(&doc);
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = FastFieldReader::<i64>::open(data);
assert_eq!(fast_field_reader.get(0u32), 0i64);
}
}
fn generate_permutation() -> Vec<u64> {
let seed: &[u32; 4] = &[1, 2, 3, 4];
let mut rng = XorShiftRng::from_seed(*seed);
let mut permutation: Vec<u32> = (0u32..1_000_000u32).collect();
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
rng.shuffle(&mut permutation);
permutation
}
@@ -172,19 +385,23 @@ mod tests {
let mut directory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
fast_field_writers.serialize(&mut serializer).unwrap();
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
let mut a = 0u32;
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
let mut a = 0u64;
for _ in 0..n {
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
a = fast_field_reader.get(a as u32);
@@ -197,8 +414,8 @@ mod tests {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u32;
for i in (0u32..n).step_by(7) {
let mut a = 0u64;
for i in Iterator::step_by(0u32..n, 7) {
a ^= permutation[i as usize];
}
a
@@ -210,7 +427,7 @@ mod tests {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
@@ -225,22 +442,26 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
fast_field_writers.serialize(&mut serializer).unwrap();
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u32;
for i in (0u32..n).step_by(7) {
let mut a = 0u64;
for i in Iterator::step_by(0u32..n, 7) {
a ^= fast_field_reader.get(i);
}
a
@@ -255,23 +476,27 @@ mod tests {
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
}
fast_field_writers.serialize(&mut serializer).unwrap();
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
let fast_fields_composite = CompositeFile::open(&source).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data);
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;
for _ in 0u32..n {
a = fast_field_reader.get(a);
a = fast_field_reader.get(a) as u32;
}
a
});


@@ -0,0 +1,88 @@
mod writer;
mod reader;
pub use self::writer::MultiValueIntFastFieldWriter;
pub use self::reader::MultiValueIntFastFieldReader;
#[cfg(test)]
mod tests {
use schema::SchemaBuilder;
use schema::Cardinality;
use schema::IntOptions;
use Index;
#[test]
fn test_multivalued_u64() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u64_field(
"multifield",
IntOptions::default().set_fast(Cardinality::MultiValues),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(field=>1u64, field=>3u64));
index_writer.add_document(doc!());
index_writer.add_document(doc!(field=>4u64));
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
assert!(index_writer.commit().is_ok());
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let mut vals = Vec::new();
let multi_value_reader = reader.multi_fast_field_reader::<u64>(field).unwrap();
{
multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4u64]);
}
{
multi_value_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[1u64, 3u64]);
}
{
multi_value_reader.get_vals(1, &mut vals);
assert!(vals.is_empty());
}
}
#[test]
fn test_multivalued_i64() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_i64_field(
"multifield",
IntOptions::default().set_fast(Cardinality::MultiValues),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(field=> 1i64, field => 3i64));
index_writer.add_document(doc!());
index_writer.add_document(doc!(field=> -4i64));
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
assert!(index_writer.commit().is_ok());
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let mut vals = Vec::new();
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
{
multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[-4i64]);
}
{
multi_value_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[1i64, 3i64]);
}
{
multi_value_reader.get_vals(1, &mut vals);
assert!(vals.is_empty());
}
{
multi_value_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
}
}
}


@@ -0,0 +1,109 @@
use DocId;
use fastfield::{FastFieldReader, FastValue};
/// Reader for a multivalued `u64` fast field.
///
/// The reader is implemented as two `u64` fast fields.
///
/// The `vals_reader` will access the concatenated list of all
/// values for all documents.
/// The `idx_reader` associates, for each document, the index of its first value.
///
#[derive(Clone)]
pub struct MultiValueIntFastFieldReader<Item: FastValue> {
idx_reader: FastFieldReader<u64>,
vals_reader: FastFieldReader<Item>,
}
impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: FastFieldReader<u64>,
vals_reader: FastFieldReader<Item>,
) -> MultiValueIntFastFieldReader<Item> {
MultiValueIntFastFieldReader {
idx_reader,
vals_reader,
}
}
/// Returns the array of values associated to the given `doc`.
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let start = self.idx_reader.get(doc) as u32;
let stop = self.idx_reader.get(doc + 1) as u32;
let len = (stop - start) as usize;
vals.resize(len, Item::default());
self.vals_reader.get_range(start, &mut vals[..]);
}
}
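
A concrete sketch of the two-field encoding. The constructor is `pub(crate)`, so this only compiles inside the crate, and the offsets below are illustrative:

```rust
// Three documents; idx holds num_docs + 1 offsets into vals:
//   idx  = [0, 2, 2, 3]
//   vals = [1, 3, 4]
let idx_reader = FastFieldReader::<u64>::from(vec![0u64, 2, 2, 3]);
let vals_reader = FastFieldReader::<u64>::from(vec![1u64, 3, 4]);
let reader = MultiValueIntFastFieldReader::open(idx_reader, vals_reader);

let mut vals = Vec::new();
reader.get_vals(0, &mut vals); // vals[0..2]
assert_eq!(&vals, &[1u64, 3u64]);
reader.get_vals(1, &mut vals); // vals[2..2], empty
assert!(vals.is_empty());
reader.get_vals(2, &mut vals); // vals[2..3]
assert_eq!(&vals, &[4u64]);
```
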
#[cfg(test)]
mod tests {
use core::Index;
use schema::{Document, Facet, SchemaBuilder};
#[test]
fn test_multifastfield_reader() {
let mut schema_builder = SchemaBuilder::new();
let facet_field = schema_builder.add_facet_field("facets");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index
.writer_with_num_threads(1, 30_000_000)
.expect("Failed to create index writer.");
{
let mut doc = Document::new();
doc.add_facet(facet_field, "/category/cat2");
doc.add_facet(facet_field, "/category/cat1");
index_writer.add_document(doc);
}
{
let mut doc = Document::new();
doc.add_facet(facet_field, "/category/cat2");
index_writer.add_document(doc);
}
{
let mut doc = Document::new();
doc.add_facet(facet_field, "/category/cat3");
index_writer.add_document(doc);
}
index_writer.commit().expect("Commit failed");
index.load_searchers().expect("Reloading searchers");
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
let mut facet = Facet::root();
{
facet_reader.facet_from_ord(1, &mut facet);
assert_eq!(facet, Facet::from("/category"));
}
{
facet_reader.facet_from_ord(2, &mut facet);
assert_eq!(facet, Facet::from("/category/cat1"));
}
{
facet_reader.facet_from_ord(3, &mut facet);
assert_eq!(format!("{}", facet), "/category/cat2");
assert_eq!(facet, Facet::from("/category/cat2"));
}
{
facet_reader.facet_from_ord(4, &mut facet);
assert_eq!(facet, Facet::from("/category/cat3"));
}
let mut vals = Vec::new();
{
facet_reader.facet_ords(0, &mut vals);
assert_eq!(&vals[..], &[3, 2]);
}
{
facet_reader.facet_ords(1, &mut vals);
assert_eq!(&vals[..], &[3]);
}
{
facet_reader.facet_ords(2, &mut vals);
assert_eq!(&vals[..], &[4]);
}
}
}


@@ -0,0 +1,111 @@
use fastfield::FastFieldSerializer;
use fastfield::serializer::FastSingleFieldSerializer;
use fastfield::value_to_u64;
use std::collections::HashMap;
use postings::UnorderedTermId;
use schema::{Document, Field};
use std::io;
use itertools::Itertools;
pub struct MultiValueIntFastFieldWriter {
field: Field,
vals: Vec<u64>,
doc_index: Vec<u64>,
is_facet: bool,
}
impl MultiValueIntFastFieldWriter {
/// Creates a new `MultiValueIntFastFieldWriter`
pub fn new(field: Field, is_facet: bool) -> Self {
MultiValueIntFastFieldWriter {
field,
vals: Vec::new(),
doc_index: Vec::new(),
is_facet,
}
}
pub fn field(&self) -> Field {
self.field
}
pub fn next_doc(&mut self) {
self.doc_index.push(self.vals.len() as u64);
}
/// Records a new value.
///
/// The n-th value being recorded is implicitly
/// associated to the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing.)
pub fn add_val(&mut self, val: UnorderedTermId) {
self.vals.push(val);
}
pub fn add_document(&mut self, doc: &Document) {
if !self.is_facet {
for field_value in doc.field_values() {
if field_value.field() == self.field {
self.add_val(value_to_u64(field_value.value()));
}
}
}
}
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
///
/// The `HashMap` makes it possible to remap the values before serializing.
/// Specifically, string terms are first stored in the writer as their
/// position in the `IndexWriter`'s `HashMap`. This value is called
/// an `UnorderedTermId`.
///
/// During the serialization of the segment, terms get sorted and
/// `tantivy` builds a mapping to convert this `UnorderedTermId` into
/// term ordinals.
///
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
) -> io::Result<()> {
{
// writing the offset index
let mut doc_index_serializer =
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
for &offset in &self.doc_index {
doc_index_serializer.add_val(offset)?;
}
doc_index_serializer.add_val(self.vals.len() as u64)?;
doc_index_serializer.close_field()?;
}
{
// writing the values themselves.
let mut value_serializer: FastSingleFieldSerializer<_>;
match mapping_opt {
Some(mapping) => {
value_serializer = serializer.new_u64_fast_field_with_idx(
self.field,
0u64,
mapping.len() as u64,
1,
)?;
for val in &self.vals {
let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
value_serializer.add_val(remapped_val)?;
}
}
None => {
let val_min_max = self.vals.iter().cloned().minmax();
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
for &val in &self.vals {
value_serializer.add_val(val)?;
}
}
}
value_serializer.close_field()?;
}
Ok(())
}
}
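
A worked example of the remapping: suppose three facet values were inserted in the order "cat2", "cat1", "cat3", so their `UnorderedTermId`s are 0, 1, 2. Once the term dictionary is sorted, the serializer is handed a mapping like the hypothetical one below and writes term ordinals instead of insertion-order ids.

```rust
use std::collections::HashMap;

// UnorderedTermId -> sorted term ordinal (illustrative values):
//   "cat2" was seen first (id 0) but sorts second -> ordinal 1
//   "cat1" was seen second (id 1) but sorts first -> ordinal 0
//   "cat3" was seen third (id 2) and sorts third  -> ordinal 2
let mut mapping: HashMap<u64, usize> = HashMap::new();
mapping.insert(0, 1);
mapping.insert(1, 0);
mapping.insert(2, 2);
// writer.serialize(&mut serializer, Some(&mapping)) then stores
// [1, 0, 2, ...] wherever the raw vals were [0, 1, 2, ...].
```
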


@@ -1,148 +1,137 @@
use std::io;
use std::collections::HashMap;
use std::ops::Deref;
use directory::ReadOnlySource;
use common::BinarySerializable;
use DocId;
use schema::{Field, SchemaBuilder};
use std::path::Path;
use schema::FAST;
use directory::{WritePtr, RAMDirectory, Directory};
use fastfield::FastFieldSerializer;
use fastfield::U32FastFieldsWriter;
use common::bitpacker::compute_num_bits;
use common::bitpacker::BitUnpacker;
use common::CompositeFile;
use common::compute_num_bits;
use directory::{Directory, RAMDirectory, WritePtr};
use directory::ReadOnlySource;
use DocId;
use fastfield::{FastFieldSerializer, FastFieldsWriter};
use owning_ref::OwningRef;
use schema::FAST;
use schema::SchemaBuilder;
use std::collections::HashMap;
use std::marker::PhantomData;
use std::mem;
use std::path::Path;
use super::FastValue;
lazy_static! {
static ref U32_FAST_FIELD_EMPTY: ReadOnlySource = {
let u32_fast_field = U32FastFieldReader::from(Vec::new());
u32_fast_field._data.clone()
};
/// Reader for accessing a fastfield.
///
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct FastFieldReader<Item: FastValue> {
bit_unpacker: BitUnpacker<OwningRef<ReadOnlySource, [u8]>>,
min_value_u64: u64,
max_value_u64: u64,
_phantom: PhantomData<Item>,
}
pub struct U32FastFieldReader {
_data: ReadOnlySource,
bit_unpacker: BitUnpacker,
min_val: u32,
max_val: u32,
}
impl U32FastFieldReader {
pub fn empty() -> U32FastFieldReader {
U32FastFieldReader::open(U32_FAST_FIELD_EMPTY.clone()).expect("should always work.")
}
pub fn min_val(&self,) -> u32 {
self.min_val
}
pub fn max_val(&self,) -> u32 {
self.max_val
}
pub fn open(data: ReadOnlySource) -> io::Result<U32FastFieldReader> {
let min_val;
let amplitude;
let max_val;
impl<Item: FastValue> FastFieldReader<Item> {
/// Opens a fast field given a source.
pub fn open(data: ReadOnlySource) -> Self {
let min_value: u64;
let amplitude: u64;
{
let mut cursor = data.as_slice();
min_val = try!(u32::deserialize(&mut cursor));
amplitude = try!(u32::deserialize(&mut cursor));
max_val = min_val + amplitude;
min_value =
u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
amplitude =
u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
}
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = {
let data_arr = &(data.deref()[8..]);
BitUnpacker::new(data_arr, num_bits as usize)
};
Ok(U32FastFieldReader {
_data: data,
bit_unpacker: bit_unpacker,
min_val: min_val,
max_val: max_val,
})
let owning_ref = OwningRef::new(data).map(|data| &data[16..]);
let bit_unpacker = BitUnpacker::new(owning_ref, num_bits);
FastFieldReader {
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
_phantom: PhantomData,
}
}
pub fn get(&self, doc: DocId) -> u32 {
self.min_val + self.bit_unpacker.get(doc as usize)
/// Return the value associated with the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment's
/// `maxdoc`.
pub fn get(&self, doc: DocId) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize))
}
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
pub fn get_range(&self, start: u32, output: &mut [Item]) {
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
self.bit_unpacker.get_range(start, output_u64);
for out in output_u64.iter_mut() {
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
}
}
/// Returns the minimum value for this fast field.
///
/// The minimum value does not take possible
/// deleted documents into account, and should be considered
/// a lower bound of the actual minimum value.
pub fn min_value(&self) -> Item {
Item::from_u64(self.min_value_u64)
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take possible
/// deleted documents into account, and should be considered
/// an upper bound of the actual maximum value.
pub fn max_value(&self) -> Item {
Item::from_u64(self.max_value_u64)
}
}
impl From<Vec<u32>> for U32FastFieldReader {
fn from(vals: Vec<u32>) -> U32FastFieldReader {
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u32_field("field", FAST);
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("test");
let path = Path::new("__dummy__");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&schema);
for val in vals {
let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
fast_field_writer.add_val(val);
let write: WritePtr = directory
.open_write(path)
.expect("With a RAMDirectory, this should never fail.");
let mut serializer = FastFieldSerializer::from_write(write)
.expect("With a RAMDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
let fast_field_writer = fast_field_writers
.get_field_writer(field)
.expect("With a RAMDirectory, this should never fail.");
for val in vals {
fast_field_writer.add_val(val.to_u64());
}
}
fast_field_writers.serialize(&mut serializer).unwrap();
fast_field_writers
.serialize(&mut serializer, &HashMap::new())
.unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
fast_field_readers.get_field(field).unwrap()
}
}
pub struct U32FastFieldsReader {
source: ReadOnlySource,
field_offsets: HashMap<Field, (u32, u32)>,
}
impl U32FastFieldsReader {
pub fn open(source: ReadOnlySource) -> io::Result<U32FastFieldsReader> {
let header_offset;
let field_offsets: Vec<(Field, u32)>;
{
let buffer = source.as_slice();
{
let mut cursor = buffer;
header_offset = try!(u32::deserialize(&mut cursor));
}
{
let mut cursor = &buffer[header_offset as usize..];
field_offsets = try!(Vec::deserialize(&mut cursor));
}
}
let mut end_offsets: Vec<u32> = field_offsets
.iter()
.map(|&(_, offset)| offset)
.collect();
end_offsets.push(header_offset);
let mut field_offsets_map: HashMap<Field, (u32, u32)> = HashMap::new();
for (field_start_offsets, stop_offset) in field_offsets.iter().zip(end_offsets.iter().skip(1)) {
let (field, start_offset) = *field_start_offsets;
field_offsets_map.insert(field, (start_offset, *stop_offset));
}
Ok(U32FastFieldsReader {
field_offsets: field_offsets_map,
source: source,
})
}
pub fn get_field(&self, field: Field) -> io::Result<U32FastFieldReader> {
match self.field_offsets.get(&field) {
Some(&(start, stop)) => {
let field_source = self.source.slice(start as usize, stop as usize);
U32FastFieldReader::open(field_source)
}
None => {
Err(io::Error::new(io::ErrorKind::InvalidInput, "Could not find field"))
}
}
let source = directory.open_read(path).expect("Failed to open the file");
let composite_file =
CompositeFile::open(&source).expect("Failed to read the composite file");
let field_source = composite_file
.open_read(field)
.expect("File component not found");
FastFieldReader::open(field_source)
}
}
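The layout decoded above is simple to state on its own: a fast field column stores its minimum value and its amplitude (max - min), followed by every value as a delta from the minimum, bitpacked with just enough bits to cover the amplitude. A self-contained sketch of the arithmetic (simplified: deltas are kept as plain u64s rather than bitpacked, and `compute_num_bits` is an assumed reimplementation of the helper imported above):

fn compute_num_bits(amplitude: u64) -> u8 {
    // number of bits needed to represent `amplitude` (0 bits for amplitude 0)
    (64 - amplitude.leading_zeros()) as u8
}

fn encode(vals: &[u64]) -> (u64, u8, Vec<u64>) {
    let min = vals.iter().copied().min().unwrap_or(0);
    let max = vals.iter().copied().max().unwrap_or(0);
    // the real serializer writes `min` and `max - min`, then bitpacks the deltas
    (min, compute_num_bits(max - min), vals.iter().map(|&v| v - min).collect())
}

fn get(min: u64, deltas: &[u64], doc: usize) -> u64 {
    // mirrors `FastFieldReader::get`: read the delta, add back the minimum
    min + deltas[doc]
}

fn main() {
    let (min, num_bits, deltas) = encode(&[1000, 1002, 1017]);
    assert_eq!(num_bits, 5); // amplitude 17 fits in 5 bits
    assert_eq!(get(min, &deltas, 2), 1017);
}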


@@ -1,106 +1,109 @@
use common::BinarySerializable;
use directory::WritePtr;
use schema::Field;
use common::bitpacker::{compute_num_bits, BitPacker};
use std::io::{self, Write, Seek, SeekFrom};
use common::bitpacker::BitPacker;
use common::compute_num_bits;
use common::CountingWriter;
use common::CompositeWrite;
use std::io::{self, Write};
/// `FastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
///
/// Fast fields are encoded using bit-packing.
///
///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
/// The serializer expects to receive the following calls.
///
/// * `new_u32_fast_field(...)`
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `new_u32_fast_field(...)`
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `close()`
pub struct FastFieldSerializer {
write: WritePtr,
written_size: usize,
fields: Vec<(Field, u32)>,
min_value: u32,
field_open: bool,
bit_packer: BitPacker,
composite_write: CompositeWrite<WritePtr>,
}
impl FastFieldSerializer {
/// Constructor
pub fn new(mut write: WritePtr) -> io::Result<FastFieldSerializer> {
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
// just making room for the pointer to header.
let written_size: usize = try!(0u32.serialize(&mut write));
Ok(FastFieldSerializer {
write: write,
written_size: written_size,
fields: Vec::new(),
min_value: 0,
field_open: false,
bit_packer: BitPacker::new(0),
})
}
/// Start serializing a new u32 fast field
pub fn new_u32_fast_field(&mut self, field: Field, min_value: u32, max_value: u32) -> io::Result<()> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
}
self.min_value = min_value;
self.field_open = true;
self.fields.push((field, self.written_size as u32));
let write: &mut Write = &mut self.write;
self.written_size += try!(min_value.serialize(write));
let amplitude = max_value - min_value;
self.written_size += try!(amplitude.serialize(write));
let num_bits = compute_num_bits(amplitude);
self.bit_packer = BitPacker::new(num_bits as usize);
Ok(())
let composite_write = CompositeWrite::wrap(write);
Ok(FastFieldSerializer { composite_write })
}
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
}
/// Pushes a new value to the currently open u32 fast field.
pub fn add_val(&mut self, val: u32) -> io::Result<()> {
let val_to_write: u32 = val - self.min_value;
self.bit_packer.write(val_to_write, &mut self.write)?;
Ok(())
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field_with_idx(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
idx: usize,
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
FastSingleFieldSerializer::open(field_write, min_value, max_value)
}
/// Close the u32 fast field.
pub fn close_field(&mut self,) -> io::Result<()> {
if !self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
}
self.field_open = false;
// adding some padding to make sure we
// can read the last elements with our u64
// cursor
self.written_size += self.bit_packer.close(&mut self.write)?;
Ok(())
}
/// Closes the serializer
///
///
/// After this call the data must be persistently saved on disk.
pub fn close(mut self,) -> io::Result<usize> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
}
let header_offset: usize = self.written_size;
self.written_size += try!(self.fields.serialize(&mut self.write));
try!(self.write.seek(SeekFrom::Start(0)));
try!((header_offset as u32).serialize(&mut self.write));
try!(self.write.flush());
Ok(self.written_size)
pub fn close(self) -> io::Result<()> {
self.composite_write.close()
}
}
pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
num_bits: u8,
}
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
let num_bits = compute_num_bits(amplitude);
let bit_packer = BitPacker::new();
Ok(FastSingleFieldSerializer {
write,
bit_packer,
min_value,
num_bits,
})
}
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
Ok(())
}
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)
}
}
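The bit-level packing that `FastSingleFieldSerializer` delegates to `BitPacker` can be approximated in a few lines. This stand-in is an assumption about the real implementation: it only supports `num_bits <= 57` and omits the padding the real packer appends so the last elements can be read with a u64-wide cursor. It accumulates bits in a u64 and flushes whole bytes:

struct MiniBitPacker {
    mini_buffer: u64,
    written: u32,
}

impl MiniBitPacker {
    fn new() -> MiniBitPacker {
        MiniBitPacker { mini_buffer: 0, written: 0 }
    }

    // append the lowest `num_bits` bits of `val` to the stream
    fn write(&mut self, val: u64, num_bits: u8, out: &mut Vec<u8>) {
        self.mini_buffer |= val << self.written;
        self.written += u32::from(num_bits);
        while self.written >= 8 {
            out.push(self.mini_buffer as u8);
            self.mini_buffer >>= 8;
            self.written -= 8;
        }
    }

    fn close(self, out: &mut Vec<u8>) {
        if self.written > 0 {
            out.push(self.mini_buffer as u8);
        }
    }
}

// bit-by-bit unpacking of the `idx`-th value, for checking the layout
fn get(data: &[u8], idx: usize, num_bits: u8) -> u64 {
    let start = idx * num_bits as usize;
    (0..num_bits as usize).fold(0u64, |val, i| {
        let bit = (data[(start + i) / 8] >> ((start + i) % 8)) & 1;
        val | (u64::from(bit) << i)
    })
}

fn main() {
    let mut packer = MiniBitPacker::new();
    let mut buffer = Vec::new();
    for &delta in &[0u64, 2, 17] {
        packer.write(delta, 5, &mut buffer); // 3 values * 5 bits = 2 bytes
    }
    packer.close(&mut buffer);
    assert_eq!(get(&buffer, 1, 5), 2);
    assert_eq!(get(&buffer, 2, 5), 17);
}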


@@ -1,122 +1,222 @@
use schema::{Schema, Field, Document};
use schema::{Cardinality, Document, Field, Schema};
use fastfield::FastFieldSerializer;
use std::io;
use schema::Value;
use DocId;
use schema::FieldType;
use common;
use common::VInt;
use std::collections::HashMap;
use postings::UnorderedTermId;
use super::multivalued::MultiValueIntFastFieldWriter;
use common::BinarySerializable;
pub struct U32FastFieldsWriter {
field_writers: Vec<U32FastFieldWriter>,
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
single_value_writers: Vec<IntFastFieldWriter>,
multi_values_writers: Vec<MultiValueIntFastFieldWriter>,
}
impl U32FastFieldsWriter {
impl FastFieldsWriter {
/// Creates all the `FastFieldWriter`s required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let mut single_value_writers = Vec::new();
let mut multi_values_writers = Vec::new();
pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter {
let u32_fields: Vec<Field> = schema.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_u32_fast())
.map(|(field_id, _)| Field(field_id as u8))
.collect();
U32FastFieldsWriter::new(u32_fields)
}
pub fn new(fields: Vec<Field>) -> U32FastFieldsWriter {
U32FastFieldsWriter {
field_writers: fields
.into_iter()
.map(U32FastFieldWriter::new)
.collect(),
for (field_id, field_entry) in schema.fields().iter().enumerate() {
let field = Field(field_id as u32);
let default_value = if let FieldType::I64(_) = *field_entry.field_type() {
common::i64_to_u64(0i64)
} else {
0u64
};
match *field_entry.field_type() {
FieldType::I64(ref int_options) | FieldType::U64(ref int_options) => {
match int_options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer = IntFastFieldWriter::new(field);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, false);
multi_values_writers.push(fast_field_writer);
}
None => {}
}
}
FieldType::HierarchicalFacet => {
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true);
multi_values_writers.push(fast_field_writer);
}
_ => {}
}
}
FastFieldsWriter {
single_value_writers,
multi_values_writers,
}
}
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut U32FastFieldWriter> {
self.field_writers
/// Get the `FastFieldWriter` associated with a field.
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
self.single_value_writers
.iter_mut()
.find(|field_writer| field_writer.field == field)
.find(|field_writer| field_writer.field() == field)
}
/// Returns the fast field multi-value writer for the given field.
///
/// Returns None if the field does not exist, or is not
/// configured as a multivalued fastfield in the schema.
pub(crate) fn get_multivalue_writer(
&mut self,
field: Field,
) -> Option<&mut MultiValueIntFastFieldWriter> {
// TODO optimize
// TODO expose for users
self.multi_values_writers
.iter_mut()
.find(|multivalue_writer| multivalue_writer.field() == field)
}
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) {
for field_writer in &mut self.field_writers {
for field_writer in &mut self.single_value_writers {
field_writer.add_document(doc);
}
for field_writer in &mut self.multi_values_writers {
field_writer.next_doc();
field_writer.add_document(doc);
}
}
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
for field_writer in &self.field_writers {
try!(field_writer.serialize(serializer));
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
) -> io::Result<()> {
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer)?;
}
for field_writer in &self.multi_values_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field))?;
}
Ok(())
}
/// Ensures all of the fast field writers have
/// reached `doc` (inclusive).
///
/// The missing values will be filled with 0.
pub fn fill_val_up_to(&mut self, doc: DocId) {
for field_writer in &mut self.field_writers {
field_writer.fill_val_up_to(doc);
}
}
}
pub struct U32FastFieldWriter {
/// Fast field writer for ints.
/// The fast field writer just keeps the values in memory.
///
/// Only when the segment writer can be closed and
/// persisted on disk, the fast field writer is
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// bitpacked and the number of bits required for bitpacking
/// can only be known once we have seen all of the values.
///
/// Both u64 and i64 use the same writer.
/// i64 values are just remapped to `0..2^64 - 1`
/// using `common::i64_to_u64`.
pub struct IntFastFieldWriter {
field: Field,
vals: Vec<u32>,
vals: Vec<u8>,
val_count: usize,
val_if_missing: u64,
val_min: u64,
val_max: u64,
}
impl U32FastFieldWriter {
pub fn new(field: Field) -> U32FastFieldWriter {
U32FastFieldWriter {
field: field,
impl IntFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub fn new(field: Field) -> IntFastFieldWriter {
IntFastFieldWriter {
field,
vals: Vec::new(),
val_count: 0,
val_if_missing: 0u64,
val_min: u64::max_value(),
val_max: 0,
}
}
/// Ensures the fast field writer has
/// reached `doc` (inclusive).
///
/// The missing values will be filled with 0.
fn fill_val_up_to(&mut self, doc: DocId) {
let target = doc as usize + 1;
debug_assert!(self.vals.len() <= target);
while self.vals.len() < target {
self.add_val(0u32)
/// Returns the field that this writer is targeting.
pub fn field(&self) -> Field {
self.field
}
/// Sets the default value.
///
/// This default value is recorded for documents
/// that do not have any value for this field.
fn set_val_if_missing(&mut self, val_if_missing: u64) {
self.val_if_missing = val_if_missing;
}
/// Records a new value.
///
/// The n-th value being recorded is implicitly
/// associated with the document with `DocId` n.
/// (Well, `n-1` actually, because of 0-indexing.)
pub fn add_val(&mut self, val: u64) {
VInt(val)
.serialize(&mut self.vals)
.expect("unable to serialize VInt to Vec");
if val > self.val_max {
self.val_max = val;
}
if val < self.val_min {
self.val_min = val;
}
self.val_count += 1;
}
pub fn add_val(&mut self, val: u32) {
self.vals.push(val);
}
fn extract_val(&self, doc: &Document) -> u32 {
/// Extract the value associated with the fast field for
/// this document.
///
/// i64 are remapped to u64 using the logic
/// in `common::i64_to_u64`.
///
/// If the value is missing, then the default value is used
/// instead.
/// If the document has more than one value for the given field,
/// only the first one is taken into account.
fn extract_val(&self, doc: &Document) -> u64 {
match doc.get_first(self.field) {
Some(v) => {
match *v {
Value::U32(ref val) => { *val }
_ => { panic!("Expected a u32field, got {:?} ", v) }
}
},
None => {
// TODO make default value configurable
0u32
}
Some(v) => super::value_to_u64(v),
None => self.val_if_missing,
}
}
/// Extract the fast field value from the document
/// (or use the default value) and record it.
pub fn add_document(&mut self, doc: &Document) {
let val = self.extract_val(doc);
self.add_val(val);
}
/// Push the fast field values to the `FastFieldSerializer`.
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
let zero = 0;
let min = *self.vals.iter().min().unwrap_or(&zero);
let max = *self.vals.iter().max().unwrap_or(&min);
try!(serializer.new_u32_fast_field(self.field, min, max));
for &val in &self.vals {
try!(serializer.add_val(val));
let (min, max) = if self.val_min > self.val_max {
(0, 0)
} else {
(self.val_min, self.val_max)
};
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
let mut cursor = self.vals.as_slice();
while let Ok(VInt(val)) = VInt::deserialize(&mut cursor) {
single_field_serializer.add_val(val)?;
}
serializer.close_field()
single_field_serializer.close_field()
}
}
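`IntFastFieldWriter` above buffers incoming values as VInts in a plain `Vec<u8>` and only decodes them back at serialization time, once min/max (and therefore the bit width) are known. A minimal LEB128-style varint codec shows the buffering round-trip; the exact wire format of `common::VInt` is not shown in this diff, so treat the encoding details as an assumption:

fn vint_write(mut val: u64, buf: &mut Vec<u8>) {
    loop {
        let byte = (val & 0x7F) as u8;
        val >>= 7;
        if val == 0 {
            buf.push(byte);
            return;
        }
        buf.push(byte | 0x80); // high bit set: more bytes follow
    }
}

fn vint_read(buf: &mut &[u8]) -> Option<u64> {
    let mut val = 0u64;
    let mut shift = 0u32;
    while let Some((&byte, rest)) = buf.split_first() {
        *buf = rest;
        val |= u64::from(byte & 0x7F) << shift;
        if byte & 0x80 == 0 {
            return Some(val);
        }
        shift += 7;
    }
    None
}

fn main() {
    // write phase: values go into a growable byte buffer
    let mut buf = Vec::new();
    for &v in &[3u64, 300, 1 << 40] {
        vint_write(v, &mut buf);
    }
    // serialize phase: re-read them, as the `while let Ok(VInt(val))` loop does
    let mut cursor = buf.as_slice();
    assert_eq!(vint_read(&mut cursor), Some(3));
    assert_eq!(vint_read(&mut cursor), Some(300));
    assert_eq!(vint_read(&mut cursor), Some(1 << 40));
}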

src/fieldnorm/code.rs Normal file

@@ -0,0 +1,106 @@
#[inline(always)]
pub fn id_to_fieldnorm(id: u8) -> u32 {
FIELD_NORMS_TABLE[id as usize]
}
#[inline(always)]
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
FIELD_NORMS_TABLE
.binary_search(&fieldnorm)
.unwrap_or_else(|idx| idx - 1) as u8
}
pub const FIELD_NORMS_TABLE: [u32; 256] = [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54,
56, 60, 64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144,
152, 168, 184, 200, 216, 232, 248, 264, 280, 312, 344, 376, 408, 440, 472, 504,
536, 600, 664, 728, 792, 856, 920, 984,
1048, 1176, 1304, 1432, 1560, 1688, 1816, 1944,
2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240,
10264, 11288, 12312, 13336, 14360, 15384,
16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744,
32792, 36888, 40984, 45080, 49176, 53272, 57368, 61464,
65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
163864, 180248, 196632, 213016, 229400, 245784, 262168,
294936, 327704, 360472, 393240, 426008, 458776,
491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528,
983064, 1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032,
1966104, 2097176, 2359320, 2621464, 2883608, 3145752, 3407896, 3670040, 3932184,
4194328, 4718616, 5242904, 5767192, 6291480, 6815768, 7340056, 7864344, 8388632, 9437208,
10485784, 11534360, 12582936, 13631512, 14680088, 15728664, 16777240, 18874392, 20971544,
23068696, 25165848, 27263000, 29360152, 31457304, 33554456, 37748760, 41943064,
46137368, 50331672, 54525976, 58720280, 62914584, 67108888, 75497496, 83886104,
92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968, 167772184,
184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912, 335544344,
369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800, 671088664,
738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576, 1342177304,
1476395032, 1610612760, 1744830488, 1879048216, 2013265944
];
#[cfg(test)]
mod tests {
use super::{fieldnorm_to_id, id_to_fieldnorm, FIELD_NORMS_TABLE};
#[test]
fn test_decode_code() {
assert_eq!(fieldnorm_to_id(0), 0);
assert_eq!(fieldnorm_to_id(1), 1);
for i in 0..41 {
assert_eq!(fieldnorm_to_id(i), i as u8);
}
assert_eq!(fieldnorm_to_id(41), 40);
assert_eq!(fieldnorm_to_id(42), 41);
for id in 43..256 {
let field_norm = FIELD_NORMS_TABLE[id];
assert_eq!(id_to_fieldnorm(id as u8), field_norm);
assert_eq!(fieldnorm_to_id(field_norm), id as u8);
assert_eq!(fieldnorm_to_id(field_norm - 1), id as u8 - 1);
assert_eq!(fieldnorm_to_id(field_norm + 1), id as u8);
}
}
#[test]
fn test_u32_max() {
assert_eq!(fieldnorm_to_id(u32::max_value()), u8::max_value());
}
#[test]
fn test_fieldnorm_byte() {
// const expressions are not really a thing
// yet... Therefore we do things the other way around.
// The array is defined as a const,
// and we check in the unit test that the const
// value matches the logic.
const IDENTITY_PART: u8 = 24u8;
fn decode_field_norm_exp_part(b: u8) -> u32 {
let bits = (b & 0b00000111) as u32;
let shift = b >> 3;
if shift == 0 {
bits
} else {
(bits | 8u32) << ((shift - 1u8) as u32)
}
}
fn decode_fieldnorm_byte(b: u8) -> u32 {
if b < IDENTITY_PART {
b as u32
} else {
(IDENTITY_PART as u32) + decode_field_norm_exp_part(b - IDENTITY_PART)
}
}
for i in 0..256 {
assert_eq!(FIELD_NORMS_TABLE[i], decode_fieldnorm_byte(i as u8));
}
}
}
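The floor behavior asserted by `test_decode_code` comes straight from how `binary_search` reports misses: `Err(idx)` is the insertion point, so `idx - 1` indexes the largest table entry that does not exceed the query (well-defined here because the table starts at 0). A tiny standalone check of that mechanism, with a stand-in for the head of `FIELD_NORMS_TABLE`:

fn floor_id(table: &[u32], fieldnorm: u32) -> u8 {
    table
        .binary_search(&fieldnorm)
        .unwrap_or_else(|idx| idx - 1) as u8
}

fn main() {
    // identity up to 40, then steps of 2, mirroring the table above
    let table: Vec<u32> = (0u32..40).chain((0..8).map(|i| 40 + 2 * i)).collect();
    assert_eq!(floor_id(&table, 41), 40); // 41 is not representable: floor to 40
    assert_eq!(floor_id(&table, 42), 41); // exact hit
}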

src/fieldnorm/mod.rs Normal file

@@ -0,0 +1,29 @@
//! The fieldnorm represents the length associated with
//! a given Field of a given document.
//!
//! This metric is important to compute the score of a
//! document: a document having a query word in one of its short fields
//! (e.g. title) is likely to be more relevant than in one of its longer fields
//! (e.g. body).
//!
//! Tantivy encodes the `fieldnorm` on one byte with some precision loss,
//! using the exact same scheme as Lucene. Each value is placed on a log-scale
//! that takes values from `0` to `255`.
//!
//! A value on this scale is identified by a `fieldnorm_id`.
//! Apart from compression, this scale also makes it possible to
//! precompute computationally expensive functions of the fieldnorm
//! in a very short array.
//!
//! This trick is used by the [BM25 similarity]().
mod code;
mod serializer;
mod writer;
mod reader;
pub use self::reader::FieldNormReader;
pub use self::writer::FieldNormsWriter;
pub use self::serializer::FieldNormsSerializer;
use self::code::{fieldnorm_to_id, id_to_fieldnorm};
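The precomputation trick mentioned above deserves a concrete shape: because a `fieldnorm_id` fits in one byte, any expensive per-fieldnorm function can be evaluated once for all 256 ids and fetched later with a single array lookup per document. A sketch with assumed `K1`/`B` constants and a stand-in `id_to_fieldnorm` (the real one is the table lookup in `code.rs`):

const K1: f32 = 1.2;
const B: f32 = 0.75;

fn id_to_fieldnorm(id: u8) -> u32 {
    u32::from(id) // stand-in for the `FIELD_NORMS_TABLE` lookup
}

// Precompute the BM25 length-normalization term for each of the 256
// possible fieldnorm ids, so scoring costs one lookup per document.
fn bm25_norm_cache(average_fieldnorm: f32) -> [f32; 256] {
    let mut cache = [0.0f32; 256];
    for (id, slot) in cache.iter_mut().enumerate() {
        let fieldnorm = id_to_fieldnorm(id as u8) as f32;
        *slot = K1 * (1.0 - B + B * fieldnorm / average_fieldnorm);
    }
    cache
}

fn main() {
    let cache = bm25_norm_cache(10.0);
    assert!(cache[0] < cache[255]); // longer fields get a larger normalizer
}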

src/fieldnorm/reader.rs Normal file

@@ -0,0 +1,82 @@
use super::{id_to_fieldnorm, fieldnorm_to_id};
use directory::ReadOnlySource;
use DocId;
/// Reads the fieldnorm associated with a document.
/// The fieldnorm represents the length associated with
/// a given Field of a given document.
///
/// This metric is important to compute the score of a
/// document: a document having a query word in one of its short fields
/// (e.g. title) is likely to be more relevant than in one of its longer fields
/// (e.g. body).
///
/// tantivy encodes the `fieldnorm` on one byte with some precision loss,
/// using the same scheme as Lucene. Each value is placed on a log-scale
/// that takes values from `0` to `255`.
///
/// A value on this scale is identified by a `fieldnorm_id`.
/// Apart from compression, this scale also makes it possible to
/// precompute computationally expensive functions of the fieldnorm
/// in a very short array.
pub struct FieldNormReader {
data: ReadOnlySource
}
impl FieldNormReader {
/// Opens a field norm reader given its data source.
pub fn open(data: ReadOnlySource) -> Self {
FieldNormReader {
data
}
}
/// Returns the `fieldnorm` associated with a doc id.
/// The fieldnorm is a value approximating the number
/// of tokens in a given field of the `doc_id`.
///
/// It is imprecise, and always lower than or equal to
/// the actual number of tokens.
///
/// The fieldnorm is effectively decoded from the
/// `fieldnorm_id` by doing a simple table lookup.
pub fn fieldnorm(&self, doc_id: DocId) -> u32 {
let fieldnorm_id = self.fieldnorm_id(doc_id);
id_to_fieldnorm(fieldnorm_id)
}
/// Returns the `fieldnorm_id` associated to a document.
#[inline(always)]
pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
let fieldnorms_data = self.data.as_slice();
fieldnorms_data[doc_id as usize]
}
/// Converts a `fieldnorm_id` into a fieldnorm.
#[inline(always)]
pub fn id_to_fieldnorm(id: u8) -> u32 {
id_to_fieldnorm(id)
}
/// Converts a `fieldnorm` into a `fieldnorm_id`.
/// (This function is not injective).
#[inline(always)]
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
fieldnorm_to_id(fieldnorm)
}
}
#[cfg(test)]
impl From<Vec<u32>> for FieldNormReader {
fn from(field_norms: Vec<u32>) -> FieldNormReader {
let field_norms_id = field_norms.into_iter()
.map(FieldNormReader::fieldnorm_to_id)
.collect::<Vec<u8>>();
let field_norms_data = ReadOnlySource::from(field_norms_id);
FieldNormReader {
data: field_norms_data
}
}
}


@@ -0,0 +1,37 @@
use directory::WritePtr;
use std::io;
use common::CompositeWrite;
use schema::Field;
use std::io::Write;
pub struct FieldNormsSerializer {
composite_write: CompositeWrite,
}
impl FieldNormsSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<FieldNormsSerializer> {
let composite_write = CompositeWrite::wrap(write);
Ok(FieldNormsSerializer {
composite_write
})
}
pub fn serialize_field(&mut self, field: Field, fieldnorms_data: &[u8]) -> io::Result<()> {
let write = self.composite_write.for_field(field);
write.write_all(fieldnorms_data)?;
write.flush()?;
Ok(())
}
pub fn close(self) -> io::Result<()> {
self.composite_write.close()?;
Ok(())
}
}

src/fieldnorm/writer.rs Normal file

@@ -0,0 +1,65 @@
use DocId;
use schema::Field;
use super::FieldNormsSerializer;
use std::io;
use schema::Schema;
use super::fieldnorm_to_id;
pub struct FieldNormsWriter {
fields: Vec<Field>,
fieldnorms_buffer: Vec<Vec<u8>>
}
impl FieldNormsWriter {
pub fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| {
field_entry.is_indexed()
})
.map(|(field, _)| Field(field as u32))
.collect::<Vec<Field>>()
}
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
let max_field = fields
.iter()
.map(|field| field.0)
.max()
.map(|max_field_id| max_field_id as usize + 1)
.unwrap_or(0);
FieldNormsWriter {
fields,
fieldnorms_buffer: (0..max_field)
.map(|_| Vec::new())
.collect::<Vec<_>>()
}
}
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
for &field in self.fields.iter() {
self.fieldnorms_buffer[field.0 as usize].resize(max_doc as usize, 0u8);
}
}
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.0 as usize];
assert!(fieldnorm_buffer.len() <= doc as usize, "Cannot register a given fieldnorm twice");
// We fill intermediate `DocId`s with a fieldnorm of 0.
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
}
pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
for &field in self.fields.iter() {
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.0 as usize][..];
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
}
Ok(())
}
}

src/functional_test.rs Normal file

@@ -0,0 +1,59 @@
use std::collections::HashSet;
use rand::thread_rng;
use schema::*;
use Index;
use Searcher;
use rand::distributions::{IndependentSample, Range};
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
assert!(searcher.segment_readers().len() < 20);
assert_eq!(searcher.num_docs() as usize, vals.len());
}
#[test]
#[ignore]
#[cfg(feature="mmap")]
fn test_indexing() {
let mut schema_builder = SchemaBuilder::default();
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
let universe = Range::new(0u64, 20u64);
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..200 {
let random_val = universe.ind_sample(&mut rng);
if random_val == 0 {
index_writer.commit().expect("Commit failed");
committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear();
index.load_searchers().unwrap();
let searcher = index.searcher();
// check that everything is correct.
check_index_content(&searcher, &committed_docs);
} else {
if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
index_writer.add_document(doc);
}
}
}
}

src/indexer/delete_queue.rs Normal file

@@ -0,0 +1,297 @@
use super::operation::DeleteOperation;
use std::sync::{Arc, RwLock};
use std::mem;
use std::ops::DerefMut;
// The DeleteQueue is conceptually similar to a
// multiple-consumer, single-producer broadcast channel.
//
// All consumers will receive all messages.
//
// Consumers of the delete queue hold a `DeleteCursor`,
// which points to a specific place in the `DeleteQueue`.
//
// New consumers can be created in two ways:
// - calling `delete_queue.cursor()` returns a cursor that
// will include all future delete operations (and no past operations).
// - cloning an existing cursor returns a new cursor that
// is at the exact same position, and can then advance independently
// of the original cursor.
#[derive(Default)]
struct InnerDeleteQueue {
writer: Vec<DeleteOperation>,
last_block: Option<Arc<Block>>,
}
#[derive(Clone, Default)]
pub struct DeleteQueue {
inner: Arc<RwLock<InnerDeleteQueue>>,
}
impl DeleteQueue {
// Creates a new delete queue.
pub fn new() -> DeleteQueue {
let delete_queue = DeleteQueue {
inner: Arc::default(),
};
let next_block = NextBlock::from(delete_queue.clone());
{
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
delete_queue_wlock.last_block = Some(Arc::new(Block {
operations: Arc::default(),
next: next_block,
}));
}
delete_queue
}
// Creates a new cursor that makes it possible to
// consume future delete operations.
//
// Past delete operations are not accessible.
pub fn cursor(&self) -> DeleteCursor {
let last_block = self.inner
.read()
.expect("Read lock poisoned when opening delete queue cursor")
.last_block
.clone()
.expect(
"Failed to unwrap last_block. This should never happen
as the Option<> is only here to make
initialization possible",
);
let operations_len = last_block.operations.len();
DeleteCursor {
block: last_block,
pos: operations_len,
}
}
// Appends a new delete operation.
pub fn push(&self, delete_operation: DeleteOperation) {
self.inner
.write()
.expect("Failed to acquire write lock on delete queue writer")
.writer
.push(delete_operation);
}
// DeleteQueue is a linked list of blocks of
// delete operations.
//
// Writing happens by simply appending to a vec.
// `.flush()` takes the pending delete operations vec,
// creates a new read-only block from it,
// and appends it to the linked list.
//
// `.flush()` happens when, for instance,
// a consumer reaches the end of the last read-only block.
// It then asks the delete queue whether there happen to
// be some unflushed operations.
//
fn flush(&self) -> Option<Arc<Block>> {
let mut self_wlock = self.inner
.write()
.expect("Failed to acquire write lock on delete queue writer");
let delete_operations;
{
let writer: &mut Vec<DeleteOperation> = &mut self_wlock.writer;
if writer.is_empty() {
return None;
}
delete_operations = mem::replace(writer, vec![]);
}
let next_block = NextBlock::from(self.clone());
{
self_wlock.last_block = Some(Arc::new(Block {
operations: Arc::new(delete_operations),
next: next_block,
}));
}
self_wlock.last_block.clone()
}
}
enum InnerNextBlock {
Writer(DeleteQueue),
Closed(Arc<Block>),
}
struct NextBlock(RwLock<InnerNextBlock>);
impl From<DeleteQueue> for NextBlock {
fn from(delete_queue: DeleteQueue) -> NextBlock {
NextBlock(RwLock::new(InnerNextBlock::Writer(delete_queue)))
}
}
impl NextBlock {
fn next_block(&self) -> Option<Arc<Block>> {
{
let next_read_lock = self.0
.read()
.expect("Failed to acquire write lock in delete queue");
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
return Some(Arc::clone(block));
}
}
let next_block;
{
let mut next_write_lock = self.0
.write()
.expect("Failed to acquire write lock in delete queue");
match *next_write_lock {
InnerNextBlock::Closed(ref block) => {
return Some(Arc::clone(block));
}
InnerNextBlock::Writer(ref writer) => match writer.flush() {
Some(flushed_next_block) => {
next_block = flushed_next_block;
}
None => {
return None;
}
},
}
*next_write_lock.deref_mut() = InnerNextBlock::Closed(Arc::clone(&next_block));
Some(next_block)
}
}
}
struct Block {
operations: Arc<Vec<DeleteOperation>>,
next: NextBlock,
}
#[derive(Clone)]
pub struct DeleteCursor {
block: Arc<Block>,
pos: usize,
}
impl DeleteCursor {
/// Skips operations and positions the cursor so that
/// - either all of the delete operations currently in the
/// queue are consumed and the next `get` will return `None`,
/// - or the next `get` will return the first operation with an
/// `opstamp >= target_opstamp`.
pub fn skip_to(&mut self, target_opstamp: u64) {
// TODO Can be optimized, as we work with blocks.
#[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))]
loop {
if let Some(operation) = self.get() {
if operation.opstamp >= target_opstamp {
break;
}
} else {
break;
}
self.advance();
}
}
/// If the current block has been entirely
/// consumed, try to load the next one.
///
/// Return `true`, if after this attempt,
/// the cursor is on a block that has not
/// been entirely consumed.
/// Return `false`, if we have reached the end of the queue.
fn load_block_if_required(&mut self) -> bool {
if self.pos >= self.block.operations.len() {
// we have consumed our operations entirely.
// let's ask our writer if it has more for us.
// self.go_next_block();
match self.block.next.next_block() {
Some(block) => {
self.block = block;
self.pos = 0;
true
}
None => false,
}
} else {
true
}
}
/// Advance to the next delete operation.
/// Returns true iff there is such an operation.
pub fn advance(&mut self) -> bool {
if self.load_block_if_required() {
self.pos += 1;
true
} else {
false
}
}
/// Get the current delete operation.
/// Calling `.get` does not advance the cursor.
pub fn get(&mut self) -> Option<&DeleteOperation> {
if self.load_block_if_required() {
Some(&self.block.operations[self.pos])
} else {
None
}
}
}
#[cfg(test)]
mod tests {
use super::{DeleteOperation, DeleteQueue};
use schema::{Field, Term};
#[test]
fn test_deletequeue() {
let delete_queue = DeleteQueue::new();
let make_op = |i: usize| {
let field = Field(1u32);
DeleteOperation {
opstamp: i as u64,
term: Term::from_field_u64(field, i as u64),
}
};
delete_queue.push(make_op(1));
delete_queue.push(make_op(2));
let snapshot = delete_queue.cursor();
{
let mut operations_it = snapshot.clone();
assert_eq!(operations_it.get().unwrap().opstamp, 1);
operations_it.advance();
assert_eq!(operations_it.get().unwrap().opstamp, 2);
operations_it.advance();
assert!(operations_it.get().is_none());
operations_it.advance();
let mut snapshot2 = delete_queue.cursor();
assert!(snapshot2.get().is_none());
delete_queue.push(make_op(3));
assert_eq!(snapshot2.get().unwrap().opstamp, 3);
assert_eq!(operations_it.get().unwrap().opstamp, 3);
assert_eq!(operations_it.get().unwrap().opstamp, 3);
operations_it.advance();
assert!(operations_it.get().is_none());
operations_it.advance();
}
{
let mut operations_it = snapshot.clone();
assert_eq!(operations_it.get().unwrap().opstamp, 1);
operations_it.advance();
assert_eq!(operations_it.get().unwrap().opstamp, 2);
operations_it.advance();
assert_eq!(operations_it.get().unwrap().opstamp, 3);
operations_it.advance();
assert!(operations_it.get().is_none());
}
}
}


@@ -1,9 +1,6 @@
use Directory;
use std::path::Path;
use directory::error::OpenWriteError;
pub const LOCKFILE_NAME: &'static str = ".tantivy-indexer.lock";
use core::LOCKFILE_FILEPATH;
/// The directory lock is a mechanism used to
/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
@@ -16,17 +13,15 @@ pub struct DirectoryLock {
impl DirectoryLock {
pub fn lock(mut directory: Box<Directory>) -> Result<DirectoryLock, OpenWriteError> {
let lockfile_path = Path::new(LOCKFILE_NAME);
try!(directory.open_write(lockfile_path));
Ok(DirectoryLock { directory: directory })
directory.open_write(&*LOCKFILE_FILEPATH)?;
Ok(DirectoryLock { directory })
}
}
impl Drop for DirectoryLock {
fn drop(&mut self) {
let lockfile_path = Path::new(LOCKFILE_NAME);
if let Err(e) = self.directory.delete(lockfile_path) {
if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) {
error!("Failed to remove the lock file. {:?}", e);
}
}
}
}
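The RAII pattern in this diff is worth restating standalone: taking the lock creates the lockfile (with `open_write` presumably failing if it already exists, which is what blocks a second `IndexWriter`), and dropping the guard deletes it. A std-only sketch with a hypothetical lockfile path:

use std::fs::{remove_file, OpenOptions};
use std::io;
use std::path::PathBuf;

struct LockGuard {
    path: PathBuf,
}

impl LockGuard {
    fn lock(path: PathBuf) -> io::Result<LockGuard> {
        // `create_new` errors if the file already exists, so a second
        // process (or writer) trying to take the lock fails here
        OpenOptions::new().write(true).create_new(true).open(&path)?;
        Ok(LockGuard { path })
    }
}

impl Drop for LockGuard {
    fn drop(&mut self) {
        if let Err(e) = remove_file(&self.path) {
            eprintln!("Failed to remove the lock file. {:?}", e);
        }
    }
}

fn main() -> io::Result<()> {
    let _guard = LockGuard::lock(PathBuf::from(".example-indexer.lock"))?;
    // the lockfile exists for the lifetime of `_guard`
    // and is removed when it is dropped at the end of scope
    Ok(())
}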


@@ -0,0 +1,93 @@
use std::sync::Arc;
use DocId;
// The doc-to-opstamp mapping is used to identify which
// documents should be deleted.
//
// Since the docset matching the query of a delete operation
// is not computed right when the delete operation is received,
// we need a way to evaluate, for each document,
// whether the document was added before or after
// the delete operation. This anteriority is evaluated by comparing
// the opstamp of the document.
//
// The doc to opstamp mapping stores precisely an array
// indexed by doc id and storing the opstamp of the document.
//
// This mapping is (for the moment) strictly increasing
// because of the way document ids are allocated.
#[derive(Clone)]
pub enum DocToOpstampMapping {
WithMap(Arc<Vec<u64>>),
None,
}
impl From<Vec<u64>> for DocToOpstampMapping {
fn from(opstamps: Vec<u64>) -> DocToOpstampMapping {
DocToOpstampMapping::WithMap(Arc::new(opstamps))
}
}
impl DocToOpstampMapping {
/// Given an opstamp, returns the limit doc id L
/// such that, for all doc ids D,
/// D >= L iff opstamp(D) >= `target_opstamp`.
///
/// The edge case where `target_opstamp` equals some document's
/// opstamp is in practice never triggered.
pub fn compute_doc_limit(&self, target_opstamp: u64) -> DocId {
match *self {
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
match doc_opstamps.binary_search(&target_opstamp) {
Ok(doc_id) | Err(doc_id) => doc_id as DocId,
}
}
DocToOpstampMapping::None => DocId::max_value(),
}
}
}
#[cfg(test)]
mod tests {
use super::DocToOpstampMapping;
#[test]
fn test_doc_to_opstamp_mapping_none() {
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
assert_eq!(
doc_to_opstamp_mapping.compute_doc_limit(1),
u32::max_value()
);
}
#[test]
fn test_doc_to_opstamp_mapping_complex() {
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0);
}
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1);
}
{
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64, 12u64, 17u64, 23u64]);
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
for i in 2u64..13u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1);
}
for i in 13u64..18u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 2);
}
for i in 18u64..24u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 3);
}
for i in 24u64..30u64 {
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 4);
}
}
}
}

File diff suppressed because it is too large


@@ -1,5 +1,4 @@
extern crate itertools;
use super::merge_policy::{MergePolicy, MergeCandidate};
use super::merge_policy::{MergeCandidate, MergePolicy};
use core::SegmentMeta;
use std::cmp;
use std::f64;
@@ -8,8 +7,7 @@ const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_MERGE_SIZE: usize = 8;
/// LogMergePolicy tries tries to merge segments that have a similar number of
/// `LogMergePolicy` tries to merge segments that have a similar number of
/// documents.
#[derive(Debug, Clone)]
pub struct LogMergePolicy {
@@ -24,7 +22,7 @@ impl LogMergePolicy {
}
/// Set the minimum number of segments that may be merged together.
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
self.min_merge_size = min_merge_size;
}
@@ -48,20 +46,21 @@ impl LogMergePolicy {
impl MergePolicy for LogMergePolicy {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
if segments.is_empty() {
return Vec::new();
}
let mut size_sorted_tuples = segments.iter()
.map(|x| x.num_docs)
let mut size_sorted_tuples = segments
.iter()
.map(|x| x.num_docs())
.enumerate()
.collect::<Vec<(usize, u32)>>();
size_sorted_tuples.sort_by(|x, y| y.cmp(x));
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples.into_iter()
.map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2()))
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
.into_iter()
.map(|(ind, num_docs)| (ind, f64::from(self.clip_min_size(num_docs)).log2()))
.collect();
let (first_ind, first_score) = size_sorted_log_tuples[0];
@@ -75,18 +74,13 @@ impl MergePolicy for LogMergePolicy {
levels.last_mut().unwrap().push(ind);
}
let result = levels.iter()
levels
.iter()
.filter(|level| level.len() >= self.min_merge_size)
.map(|ind_vec| {
MergeCandidate(ind_vec.iter()
.map(|&ind| segments[ind].segment_id)
.collect())
})
.collect();
result
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
.collect()
}
fn box_clone(&self) -> Box<MergePolicy> {
box self.clone()
}
@@ -106,7 +100,7 @@ impl Default for LogMergePolicy {
mod tests {
use super::*;
use indexer::merge_policy::MergePolicy;
use core::{SegmentMeta, SegmentId};
use core::{SegmentId, SegmentMeta};
fn test_merge_policy() -> LogMergePolicy {
let mut log_merge_policy = LogMergePolicy::default();
@@ -122,11 +116,15 @@ mod tests {
assert!(result_list.is_empty());
}
fn seg_meta(num_docs: u32) -> SegmentMeta {
let mut segment_metas = SegmentMeta::new(SegmentId::generate_random());
segment_metas.set_max_doc(num_docs);
segment_metas
}
#[test]
fn test_log_merge_policy_pair() {
let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 10)];
let test_input = vec![seg_meta(10), seg_meta(10), seg_meta(10)];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);
}
@@ -134,12 +132,23 @@ mod tests {
#[test]
fn test_log_merge_policy_levels() {
// multiple levels all get merged correctly
let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 1000),
SegmentMeta::new(SegmentId::generate_random(), 1000),
SegmentMeta::new(SegmentId::generate_random(), 1000)];
// 2 MergeCandidates expected:
// * one with the 6 * 10-docs segments
// * one with the 3 * 1000-docs segments
// no MergeCandidate expected for the 2 * 10_000-docs segments as min_merge_size=3
let test_input = vec![
seg_meta(10),
seg_meta(10),
seg_meta(10),
seg_meta(1000),
seg_meta(1000),
seg_meta(1000),
seg_meta(10000),
seg_meta(10000),
seg_meta(10),
seg_meta(10),
seg_meta(10),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
@@ -147,24 +156,28 @@ mod tests {
#[test]
fn test_log_merge_policy_within_levels() {
// multiple levels all get merged correctly
let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 10),
SegmentMeta::new(SegmentId::generate_random(), 11),
SegmentMeta::new(SegmentId::generate_random(), 12),
SegmentMeta::new(SegmentId::generate_random(), 1000),
SegmentMeta::new(SegmentId::generate_random(), 1000),
SegmentMeta::new(SegmentId::generate_random(), 1000)];
let test_input = vec![
seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
seg_meta(11), // log2(11) = ~3.46
seg_meta(12), // log2(12) = ~3.58
seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
seg_meta(1000), // log2(1000) = ~9.97
seg_meta(1000),
]; // log2(1000) = ~9.97
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 2);
}
#[test]
fn test_log_merge_policy_small_segments() {
// multiple levels all get merged correctly
let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 1),
SegmentMeta::new(SegmentId::generate_random(), 1),
SegmentMeta::new(SegmentId::generate_random(), 1),
SegmentMeta::new(SegmentId::generate_random(), 2),
SegmentMeta::new(SegmentId::generate_random(), 2),
SegmentMeta::new(SegmentId::generate_random(), 2)];
// segments under min_layer_size are merged together
let test_input = vec![
seg_meta(1),
seg_meta(1),
seg_meta(1),
seg_meta(2),
seg_meta(2),
seg_meta(2),
];
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
assert_eq!(result_list.len(), 1);
}
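The grouping these tests exercise can be sketched in isolation: sort segments by size (largest first), map sizes into log2-space, and open a new level whenever a segment falls more than `level_log_size` below the head of its level. This is a simplified reading of the policy, ignoring `clip_min_size` and the `min_merge_size` filter applied afterwards:

fn levels(mut num_docs: Vec<u32>, level_log_size: f64) -> Vec<Vec<u32>> {
    num_docs.sort_unstable_by(|a, b| b.cmp(a)); // biggest first
    let mut levels: Vec<Vec<u32>> = Vec::new();
    let mut level_head_log = f64::INFINITY;
    for n in num_docs {
        let log = f64::from(n).log2();
        if log <= level_head_log - level_log_size {
            // too small for the current level: open a new one
            levels.push(Vec::new());
            level_head_log = log;
        }
        levels.last_mut().unwrap().push(n);
    }
    levels
}

fn main() {
    // with level_log_size = 0.75 (the default above), 10/11/12 share a
    // level and 800/1000/1000 form another, as the test comments state
    let grouped = levels(vec![10, 11, 12, 800, 1000, 1000], 0.75);
    assert_eq!(grouped.len(), 2);
}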


@@ -3,27 +3,25 @@ use core::SegmentMeta;
use std::marker;
use std::fmt::Debug;
/// Set of segment suggested for a merge.
/// Set of segments suggested for a merge.
#[derive(Debug, Clone)]
pub struct MergeCandidate(pub Vec<SegmentId>);
/// The Merge policy defines which segments should be merged.
///
/// The `MergePolicy` defines which segments should be merged.
///
/// Every time the list of segments changes, the segment updater
/// asks the merge policy if some segments should be merged.
pub trait MergePolicy: marker::Send + Debug {
/// Given the list of segment metas, returns the list of merge candidates.
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
/// Given the list of segment metas, returns the list of merge candidates.
///
/// This call happens on the segment updater thread, and will block
/// other segment updates, so all implementations should happen rapidly.
/// This call happens on the segment updater thread, and will block
/// other segment updates, so all implementations should happen rapidly.
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
/// Returns a boxed clone of the MergePolicy.
fn box_clone(&self) -> Box<MergePolicy>;
}
/// Never merge segments.
/// Never merge segments.
#[derive(Debug)]
pub struct NoMergePolicy;
@@ -37,9 +35,41 @@ impl MergePolicy for NoMergePolicy {
fn compute_merge_candidates(&self, _segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
Vec::new()
}
fn box_clone(&self) -> Box<MergePolicy> {
box NoMergePolicy
}
}
#[cfg(test)]
pub mod tests {
use super::*;
use core::SegmentId;
use core::SegmentMeta;
/// `MergePolicy` useful for test purposes.
///
/// Every time there is more than one segment,
/// it will suggest merging them.
#[derive(Debug)]
pub struct MergeWheneverPossible;
impl MergePolicy for MergeWheneverPossible {
fn compute_merge_candidates(&self, segment_metas: &[SegmentMeta]) -> Vec<MergeCandidate> {
let segment_ids = segment_metas
.iter()
.map(|segment_meta| segment_meta.id())
.collect::<Vec<SegmentId>>();
if segment_ids.len() > 1 {
vec![MergeCandidate(segment_ids)]
} else {
vec![]
}
}
fn box_clone(&self) -> Box<MergePolicy> {
box MergeWheneverPossible
}
}
}

File diff suppressed because it is too large


@@ -1,22 +1,29 @@
mod index_writer;
pub mod index_writer;
pub mod segment_serializer;
pub mod merger;
mod merge_policy;
pub mod merge_policy;
mod log_merge_policy;
mod segment_register;
mod segment_writer;
mod segment_manager;
pub mod delete_queue;
pub mod segment_updater;
mod directory_lock;
mod segment_entry;
mod doc_opstamp_mapping;
pub mod operation;
mod stamper;
mod prepared_commit;
pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::{SegmentEntry, SegmentState};
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_writer::SegmentWriter;
pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_policy::{NoMergePolicy, MergeCandidate, MergePolicy};
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::segment_manager::SegmentManager;
pub(crate) use self::directory_lock::DirectoryLock;
/// Alias for the default merge policy, which is the LogMergePolicy.
/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;

src/indexer/operation.rs Normal file

@@ -0,0 +1,16 @@
use schema::Document;
use schema::Term;
/// Timestamped Delete operation.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct DeleteOperation {
pub opstamp: u64,
pub term: Term,
}
/// Timestamped Add operation.
#[derive(Eq, PartialEq, Debug)]
pub struct AddOperation {
pub opstamp: u64,
pub document: Document,
}


@@ -0,0 +1,39 @@
use Result;
use super::IndexWriter;
/// A prepared commit
pub struct PreparedCommit<'a> {
index_writer: &'a mut IndexWriter,
payload: Option<String>,
opstamp: u64,
}
impl<'a> PreparedCommit<'a> {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
PreparedCommit {
index_writer,
payload: None,
opstamp
}
}
pub fn opstamp(&self) -> u64 {
self.opstamp
}
pub fn set_payload(&mut self, payload: &str) {
self.payload = Some(payload.to_string())
}
pub fn abort(self) -> Result<()> {
self.index_writer.rollback()
}
pub fn commit(self) -> Result<u64> {
info!("committing {}", self.opstamp);
self.index_writer
.segment_updater()
.commit(self.opstamp, self.payload)?;
Ok(self.opstamp)
}
}


@@ -0,0 +1,121 @@
use core::SegmentMeta;
use bit_set::BitSet;
use indexer::delete_queue::DeleteCursor;
use core::SegmentId;
use std::fmt;
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum SegmentState {
Ready,
InMerge,
}
impl SegmentState {
pub fn letter_code(&self) -> char {
match *self {
SegmentState::InMerge => 'M',
SegmentState::Ready => 'R',
}
}
}
/// A segment entry describes the state of
/// a given segment, at a given instant.
///
/// In addition to segment `meta`,
/// it contains a few transient states
/// - `state` expresses whether the segment is already in the
/// middle of a merge
/// - `delete_bitset` is a bitset describing
/// documents that were deleted during the commit
/// itself.
/// - `delete_cursor` is the position in the delete queue.
/// Deletes happening before the cursor are reflected either
/// in the .del file or in the `delete_bitset`.
#[derive(Clone)]
pub struct SegmentEntry {
meta: SegmentMeta,
state: SegmentState,
delete_bitset: Option<BitSet>,
delete_cursor: DeleteCursor,
}
impl SegmentEntry {
/// Create a new `SegmentEntry`
pub fn new(
segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>,
) -> SegmentEntry {
SegmentEntry {
meta: segment_meta,
state: SegmentState::Ready,
delete_bitset,
delete_cursor,
}
}
/// Return a reference to the segment entry deleted bitset.
///
/// `DocId`s in this bitset are flagged as deleted.
pub fn delete_bitset(&self) -> Option<&BitSet> {
self.delete_bitset.as_ref()
}
/// Set the `SegmentMeta` for this segment.
pub fn set_meta(&mut self, segment_meta: SegmentMeta) {
self.meta = segment_meta;
}
/// Return a reference to the segment_entry's delete cursor
pub fn delete_cursor(&mut self) -> &mut DeleteCursor {
&mut self.delete_cursor
}
/// Return the state of the `SegmentEntry`.
///
/// The state describes whether the segment is available for
/// a merge or not.
pub fn state(&self) -> SegmentState {
self.state
}
/// Returns the segment id.
pub fn segment_id(&self) -> SegmentId {
self.meta.id()
}
/// Accessor to the `SegmentMeta`
pub fn meta(&self) -> &SegmentMeta {
&self.meta
}
/// Mark the `SegmentEntry` as in merge.
///
/// Only segments that are not already
/// in a merge are eligible for a future merge.
pub fn start_merge(&mut self) {
self.state = SegmentState::InMerge;
}
/// Cancel a merge
///
/// If a merge fails, it is important to switch
/// the segment back to an idle state, so that it
/// may be eligible for future merges.
pub fn cancel_merge(&mut self) {
self.state = SegmentState::Ready;
}
/// Returns true iff a segment should
/// be considered for a merge.
pub fn is_ready(&self) -> bool {
self.state == SegmentState::Ready
}
}
impl fmt::Debug for SegmentEntry {
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
write!(formatter, "SegmentEntry({:?}, {:?})", self.meta, self.state)
}
}


@@ -1,180 +1,227 @@
use super::segment_register::SegmentRegister;
use std::sync::RwLock;
use core::SegmentMeta;
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
use core::SegmentId;
use indexer::SegmentEntry;
use std::path::PathBuf;
use std::collections::hash_set::HashSet;
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
use std::fmt::{self, Debug, Formatter};
use std::sync::atomic::{AtomicUsize, Ordering};
use indexer::delete_queue::DeleteCursor;
#[derive(Default)]
struct SegmentRegisters {
docstamp: u64,
uncommitted: SegmentRegister,
committed: SegmentRegister,
writing: HashSet<SegmentId>,
}
#[derive(Eq, PartialEq)]
pub enum CommitState {
Committed,
Uncommitted,
Missing,
}
impl Default for SegmentRegisters {
fn default() -> SegmentRegisters {
SegmentRegisters {
docstamp: 0u64,
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::default()
}
}
}
/// The segment manager stores the list of segments
/// as well as their state.
///
/// It guarantees the atomicity of the
/// It guarantees the atomicity of the
/// changes (merges especially)
#[derive(Default)]
pub struct SegmentManager {
registers: RwLock<SegmentRegisters>,
// generation is an ever increasing counter that
// is incremented whenever we modify
// the segment manager. It can be useful for debugging
// purposes, and it also acts as a "dirty" marker,
// to detect when the `meta.json` should be written.
generation: AtomicUsize,
}
impl Debug for SegmentManager {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
let lock = self.read();
write!(f, "{{ uncommitted: {:?}, committed: {:?} }}", lock.uncommitted, lock.committed)
write!(
f,
"{{ uncommitted: {:?}, committed: {:?} }}",
lock.uncommitted, lock.committed
)
}
}
/// Returns the `SegmentMeta`s for (committed segment, uncommitted segments).
/// The result is consistent with other transactions.
///
/// For instance, a segment will not appear in both committed and uncommitted
/// segments
pub fn get_segment_ready_for_commit(segment_manager: &SegmentManager,) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
pub fn get_mergeable_segments(
segment_manager: &SegmentManager,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read();
(registers_lock.committed.get_segment_ready_for_commit(),
registers_lock.uncommitted.get_segment_ready_for_commit())
(
registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments(),
)
}
impl SegmentManager {
pub fn from_segments(
segment_metas: Vec<SegmentMeta>,
delete_cursor: &DeleteCursor,
) -> SegmentManager {
SegmentManager {
registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas, delete_cursor),
writing: HashSet::new(),
}),
}
}
/// Returns all of the segment entries (committed or uncommitted)
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
let mut segment_entries = self.read().uncommitted.segment_entries();
segment_entries.extend(self.read().committed.segment_entries());
segment_entries
}
/// Returns the overall number of segments in the `SegmentManager`
pub fn num_segments(&self) -> usize {
let registers_lock = self.read();
registers_lock.committed.len() + registers_lock.uncommitted.len()
}
pub fn list_files(&self) -> HashSet<PathBuf> {
let registers_lock = self.read();
let mut files = HashSet::new();
files.insert(META_FILEPATH.clone());
files.insert(LOCKFILE_FILEPATH.clone());
let segment_metas: Vec<SegmentMeta> = registers_lock
.committed
.get_all_segments()
.into_iter()
.chain(registers_lock.uncommitted.get_all_segments().into_iter())
.chain(registers_lock.writing.iter().cloned().map(SegmentMeta::new))
.collect();
for segment_meta in segment_metas {
files.extend(segment_meta.list_files());
}
files
}
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
let registers = self.read();
registers
.committed
.segment_entry(segment_id)
.or_else(|| registers.uncommitted.segment_entry(segment_id))
}
// Lock poisoning should never happen:
// the lock is acquired and released within this class,
// and the operations cannot panic.
fn read(&self) -> RwLockReadGuard<SegmentRegisters> {
self.registers
.read()
.expect("Failed to acquire read lock on SegmentManager.")
}
fn write(&self) -> RwLockWriteGuard<SegmentRegisters> {
self.registers
.write()
.expect("Failed to acquire write lock on SegmentManager.")
}
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
let mut registers_lock = self.write();
registers_lock.committed.clear();
registers_lock.uncommitted.clear();
for segment_entry in segment_entries {
registers_lock.committed.add_segment_entry(segment_entry);
}
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) {
let mut registers_lock = self.write();
if registers_lock.uncommitted.contains_all(segment_ids) {
for segment_id in segment_ids {
registers_lock.uncommitted.start_merge(segment_id);
}
} else if registers_lock.committed.contains_all(segment_ids) {
for segment_id in segment_ids {
registers_lock.committed.start_merge(segment_id);
}
}
}
pub fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId,
) {
let mut registers_lock = self.write();
// we mark all of the segments as ready for merge again.
{
let target_segment_register: &mut SegmentRegister;
target_segment_register = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");
return;
}
};
for segment_id in before_merge_segment_ids {
target_segment_register.cancel_merge(segment_id);
}
}
// ... and we make sure the target segment entry
// can be garbage collected.
registers_lock.writing.remove(&after_merge_segment_id);
}
pub fn write_segment(&self, segment_id: SegmentId) {
let mut registers_lock = self.write();
registers_lock.writing.insert(segment_id);
}
pub fn add_segment(&self, segment_entry: SegmentEntry) {
let mut registers_lock = self.write();
registers_lock.writing.remove(&segment_entry.segment_id());
registers_lock.uncommitted.add_segment_entry(segment_entry);
}
pub fn end_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry,
) {
let mut registers_lock = self.write();
registers_lock
.writing
.remove(&after_merge_segment_entry.segment_id());
let target_register: &mut SegmentRegister = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");
return;
}
};
for segment_id in before_merge_segment_ids {
target_register.remove_segment(segment_id);
}
target_register.add_segment_entry(after_merge_segment_entry);
}
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
let registers_lock = self.read();
registers_lock.committed.segment_metas()
}
}
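Taken together, these methods form the merge lifecycle. The sketch below is illustrative only: `merge_segments` is a hypothetical stand-in for the real merge executor, and error handling is elided. The point is that each transition takes the write lock exactly once, so readers always observe a consistent register:

    // Hypothetical merge driver built on the methods above.
    manager.start_merge(&segment_ids); // mark the source segments InMerge
    manager.write_segment(merged_id); // track the target's files for GC purposes
    match merge_segments(&segment_ids, merged_id) {
        // On success, atomically replace the sources with the merged entry.
        Ok(merged_entry) => manager.end_merge(&segment_ids, merged_entry),
        // On failure, mark the sources Ready again and let the
        // half-written target segment be garbage collected.
        Err(_) => manager.cancel_merge(&segment_ids, merged_id),
    }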

View File

@@ -3,198 +3,188 @@ use std::collections::HashMap;
use core::SegmentMeta;
use std::fmt;
use std::fmt::{Debug, Formatter};
use indexer::segment_entry::SegmentEntry;
use indexer::delete_queue::DeleteCursor;
/// The segment register keeps track
/// of the list of segments, their sizes as well
/// as the state they are in.
///
/// It is consumed by indexes to get the list of
/// segments that are currently searchable,
/// and by the index merger to identify
/// merge candidates.
#[derive(Default)]
pub struct SegmentRegister {
segment_states: HashMap<SegmentId, SegmentEntry>,
}
impl Debug for SegmentRegister {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "SegmentRegister(")?;
for (k, v) in &self.segment_states {
write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code())?;
}
write!(f, ")")?;
Ok(())
}
}
impl SegmentRegister {
pub fn clear(&mut self) {
self.segment_states.clear();
}
pub fn len(&self) -> usize {
self.segment_states.len()
}
pub fn get_all_segments(&self) -> Vec<SegmentMeta> {
self.segment_states
.values()
.map(|segment_entry| segment_entry.meta().clone())
.collect()
}
pub fn get_mergeable_segments(&self) -> Vec<SegmentMeta> {
self.segment_states
.values()
.filter(|segment_entry| segment_entry.is_ready())
.map(|segment_entry| segment_entry.meta().clone())
.collect()
}
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
self.segment_states.values().cloned().collect()
}
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
.values()
.map(|segment_entry| segment_entry.meta().clone())
.collect();
segment_ids.sort_by_key(|meta| meta.id());
segment_ids
}
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
self.segment_states.get(segment_id).cloned()
}
pub fn contains(&self, segment_id: SegmentId) -> bool {
self.segment_states.contains_key(&segment_id)
}
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
segment_ids
.iter()
.all(|segment_id| self.segment_states.contains_key(segment_id))
}
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
let segment_id = segment_entry.segment_id();
self.segment_states.insert(segment_id, segment_entry);
}
pub fn remove_segment(&mut self, segment_id: &SegmentId) {
self.segment_states.remove(segment_id);
}
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.cancel_merge();
}
pub fn start_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.start_merge();
}
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
let mut segment_states = HashMap::new();
for segment_meta in segment_metas {
let segment_id = segment_meta.id();
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone(), None);
segment_states.insert(segment_id, segment_entry);
}
SegmentRegister { segment_states }
}
}
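Putting the register API together, the short sketch below (assuming the crate's `DeleteQueue`, `SegmentMeta`, and `SegmentId` are in scope, exactly as the tests below use them) builds a register from pre-existing metas and asks it for merge candidates:

    // Sketch: every freshly registered segment starts out Ready,
    // so both segments are reported as merge candidates.
    let delete_queue = DeleteQueue::new();
    let metas = vec![
        SegmentMeta::new(SegmentId::generate_random()),
        SegmentMeta::new(SegmentId::generate_random()),
    ];
    let register = SegmentRegister::new(metas, &delete_queue.cursor());
    assert_eq!(register.get_mergeable_segments().len(), 2);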
#[cfg(test)]
mod tests {
use indexer::SegmentState;
use core::SegmentId;
use core::SegmentMeta;
use indexer::delete_queue::*;
use super::*;
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
segment_register
.segment_metas()
.into_iter()
.map(|segment_meta| segment_meta.id())
.collect()
}
#[test]
fn test_segment_register() {
let delete_queue = DeleteQueue::new();
let mut segment_register = SegmentRegister::default();
let segment_id_a = SegmentId::generate_random();
let segment_id_b = SegmentId::generate_random();
let segment_id_merged = SegmentId::generate_random();
{
let segment_meta = SegmentMeta::new(segment_id_a);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready
);
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
{
let segment_meta = SegmentMeta::new(segment_id_b);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready
);
segment_register.start_merge(&segment_id_a);
segment_register.start_merge(&segment_id_b);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge
);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge
);
segment_register.remove_segment(&segment_id_a);
segment_register.remove_segment(&segment_id_b);
{
let segment_meta_merged = SegmentMeta::new(segment_id_merged);
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_ids(&segment_register), vec![segment_id_merged]);
}
}

Some files were not shown because too many files have changed in this diff.