Compare commits


81 Commits

Author SHA1 Message Date
Paul Masurel
d07e896a2f Exposed API to create a new Segment. 2017-05-13 15:15:35 +09:00
Paul Masurel
574feb8026 Merge branch 'issue/136' into tantivy-imhotep 2017-05-12 17:04:20 +09:00
Paul Masurel
ecbdd70c37 Removed the clunky linked list logic of the heap. 2017-05-12 14:01:52 +09:00
Paul Masurel
fb1b2be782 issue/136 Fix following CR 2017-05-12 13:51:09 +09:00
Paul Masurel
0bc047f8e1 blop 2017-05-12 13:39:04 +09:00
Paul Masurel
2c2aa3d66c Merge branch 'issue/136' into tantivy-imhotep
Conflicts:
	src/postings/mod.rs
2017-05-11 22:42:03 +09:00
Paul Masurel
4c4c28e2c4 Fix broken compile 2017-05-11 20:57:32 +09:00
Paul Masurel
9f9e588905 Merge branch 'master' into issue/136
Conflicts:
	src/postings/postings_writer.rs
2017-05-11 20:50:24 +09:00
Paul Masurel
6fd17e0ead Code cleaning 2017-05-11 20:47:30 +09:00
Paul Masurel
65dc5b0d83 Closes #145 2017-05-11 19:48:06 +09:00
Paul Masurel
15d15c01f8 Running examples in CI
Closes #143
2017-05-11 19:43:36 +09:00
Paul Masurel
106832a66a Make Term::with_capacity crate-public 2017-05-11 19:37:15 +09:00
Paul Masurel
477b9136b9 FIXED inconsistent Term's field serialization.
Also.

Cleaned up the code to make sure that the logic
is only in one place.
Removed allocate_vec

Closes #141
Closes #139
Closes #142
Closes #138
2017-05-11 19:37:15 +09:00
Paul Masurel
7852d097b8 CHANGELOG: 0.3.1 did not include the Field(u32) fix 2017-05-11 09:48:37 +09:00
Paul Masurel
da99bbcb9d Merge branch 'issue/indexing-refactoring' into tantivy-imhotep
Conflicts:
	src/common/mod.rs
2017-05-10 21:27:44 +09:00
Ashley Mannix
0bd56241bb pretty print meta.json 2017-05-10 20:13:53 +09:00
Paul Masurel
54ab897755 Added comment 2017-05-10 19:30:24 +09:00
Paul Masurel
1369d2d144 Quadratic probing. 2017-05-10 10:38:47 +09:00
Paul Masurel
d3f829dc8a Bugfix 2017-05-10 00:29:37 +09:00
Paul Masurel
e82ccf9627 Merge branch 'master' into issue/indexing-refactoring 2017-05-09 16:43:33 +09:00
Paul Masurel
d3d29f7f54 NOBUG Updated CHANGELOG with the serde change for 0.4.0 2017-05-09 16:42:25 +09:00
Paul Masurel
3566717979 Merge pull request #134 from tantivy-search/chore/serde-rebase
Replace rustc_serialize with serde (updated)
2017-05-09 16:38:42 +09:00
Paul Masurel
90bc3e3773 Added limitation on term dictionary saturation 2017-05-09 14:10:33 +09:00
Paul Masurel
ffb62b6835 working 2017-05-09 10:17:05 +09:00
Ashley Mannix
4f9ce91d6a update underflow test 2017-05-08 14:40:58 +10:00
Laurentiu Nicola
3c3a2fbfe8 Remove old serialization code 2017-05-08 07:36:15 +03:00
Laurentiu Nicola
0508571d1a Use the proper error type on u64 overflow 2017-05-08 07:35:33 +03:00
Laurentiu Nicola
7b733dd34f Fix i64 overflow check and merge NotJSON with NotJSONObject 2017-05-08 07:09:54 +03:00
Ashley Mannix
2c798e3147 Replace rustc_serialize with serde 2017-05-07 20:21:22 +03:00
Paul Masurel
33f9426dd6 Merge branch 'master' into tantivy-imhotep 2017-05-07 15:58:13 +09:00
Paul Masurel
2c13f210bc Bugfix on merging i64 fast fields 2017-05-07 15:57:29 +09:00
Paul Masurel
647c97fa3d Expose common 2017-05-07 14:31:56 +09:00
Paul Masurel
8029aea548 Exposing store 2017-05-07 14:24:36 +09:00
Paul Masurel
3b33484cf8 compatibility with tantivy-imhotep 2017-05-07 14:19:38 +09:00
Paul Masurel
2a909ddcc7 Merge branch 'master' into tantivy-imhotep
Conflicts:
	src/collector/mod.rs
	src/common/bitpacker.rs
	src/common/mod.rs
	src/core/segment_reader.rs
	src/fastfield/mod.rs
	src/fastfield/reader.rs
	src/fastfield/writer.rs
	src/functional_test.rs
	src/indexer/merger.rs
	src/indexer/segment_writer.rs
	src/lib.rs
	src/postings/serializer.rs
	src/query/query_parser/query_parser.rs
	src/schema/document.rs
	src/schema/field_entry.rs
	src/schema/field_type.rs
	src/schema/int_options.rs
	src/schema/mod.rs
	src/schema/named_field_document.rs
	src/schema/schema.rs
	src/schema/value.rs
2017-05-07 14:03:18 +09:00
Paul Masurel
0dad02791c issues/65 Added comments
Closes #65
Closes #132
2017-05-06 23:09:45 +09:00
Paul Masurel
2947364ae1 issues/65 Phrase queries for untokenized fields are not tokenized. 2017-05-06 22:14:26 +09:00
Paul Masurel
05111599b3 Removed several TODOs 2017-05-05 16:08:09 +08:00
Paul Masurel
83263eabbb issues/65 Updated changelog added some doc. 2017-05-04 17:13:14 +08:00
Paul Masurel
5cb5c9a8f2 issues/65 Added i64 fast fields 2017-05-04 16:46:14 +08:00
Paul Masurel
9ab92b7739 i64 fast field working 2017-05-04 16:46:14 +08:00
Paul Masurel
962bddfbbf Merge with panics. 2017-05-04 16:46:14 +08:00
Paul Masurel
26cfe2909f FastField with different types 2017-05-04 16:46:13 +08:00
Paul Masurel
afdfb1a69b Compiling... fastfield not implemented yet 2017-05-04 16:46:13 +08:00
Paul Masurel
b26ad1d57a Added int options 2017-05-04 16:46:13 +08:00
Paul Masurel
1dbd54edbb Renamed u64options 2017-05-04 16:46:13 +08:00
Paul Masurel
deb04eb090 issue/65 Switching to u64. 2017-05-04 16:46:13 +08:00
Paul Masurel
bed34bf502 Merge branch 'issues/122' 2017-04-23 16:14:40 +08:00
Paul Masurel
925b9063a7 Bugfix in the streamdictionary. Impl of Sync and Send for FastFieldReader 2017-04-23 10:50:14 +08:00
Paul Masurel
5e1ce381fe Merge branch 'issues/65' into tantivy-imhotep
Conflicts:
	src/core/segment_reader.rs
	src/fastfield/reader.rs
2017-04-21 09:53:14 +09:00
Paul Masurel
67381e448f Renamed u64options 2017-04-21 09:13:26 +09:00
Paul Masurel
19d535c28e issue/65 Switching to u64. 2017-04-20 13:32:59 +09:00
Paul Masurel
95bfb71901 NOBUG Remove 256 num fields limit 2017-04-19 22:37:34 +09:00
Paul Masurel
e00d6538fa NOBUG Improve interface 2017-04-19 22:35:52 +09:00
Paul Masurel
8d7445f08a removing the 255 fields limit 2017-04-19 20:07:38 +09:00
Paul Masurel
74e10843a7 issue/120 Disabled SIMD vbyte compression for msvc 2017-04-17 22:36:32 +09:00
Paul Masurel
1b922e6d23 issue 120. Using streamvbyte codec for the vbyte part of the encoding 2017-04-16 18:49:53 +09:00
Paul Masurel
a7c6c31538 Merge commit '9d071c8d4610aa61f4b1f7dd489210415a05cfc0' as 'cpp/streamvbyte' 2017-04-16 15:22:43 +09:00
Paul Masurel
9d071c8d46 Squashed 'cpp/streamvbyte/' content from commit f38aa6b
git-subtree-dir: cpp/streamvbyte
git-subtree-split: f38aa6b6ec4c5cee9d72c94ef305e6a79a108252
2017-04-16 15:22:43 +09:00
Paul Masurel
202e69b7e0 BUGFIX the thing observed on windows 2017-04-15 19:41:14 +09:00
Paul Masurel
a650969509 Merge branch 'master' into tantivy-imhotep 2017-04-15 13:11:58 +09:00
Paul Masurel
04074f7bcb Merge pull request #119 from tantivy-search/issue/118
Using u32 for field ids
2017-04-15 13:11:22 +09:00
Paul Masurel
8a28d1643d Using u32 for field ids 2017-04-15 13:04:33 +09:00
Paul Masurel
c8d06d63b9 Test on UTF-8 2017-04-15 09:43:12 +09:00
Paul Masurel
7eec9f038d Merge branch 'master' into tantivy-imhotep
Conflicts:
	src/common/mod.rs
	src/core/segment_reader.rs
	src/datastruct/fstmap.rs
	src/indexer/merger.rs
	src/postings/mod.rs
	src/postings/segment_postings.rs
	src/postings/serializer.rs
	src/query/boolean_query/mod.rs
	src/query/term_query/term_scorer.rs
	src/query/term_query/term_weight.rs
2017-04-15 00:21:56 +09:00
Paul Masurel
57870fdcef Added a stream builder. 2017-04-14 23:23:26 +09:00
Paul Masurel
c0f2055e32 Added dictionary optimized for streaming 2017-04-11 23:07:34 +09:00
Paul Masurel
9a8f06a523 bugfix on opening termquery when there is no termfreq 2017-02-24 19:08:14 +09:00
Paul Masurel
bb57bee099 committing random shit because of jason 2017-02-23 22:52:17 +09:00
Paul Masurel
bc2a1f00e6 send sync for u32fastfieldreader 2017-02-23 21:11:52 +09:00
Paul Masurel
391f258ff3 Making u32fastfield send/sync 2017-02-23 20:11:35 +09:00
Paul Masurel
673712a339 Added public method to schema. 2017-02-23 19:48:06 +09:00
Paul Masurel
29ad1d84e5 exposing fastfield as public 2017-02-23 19:28:51 +09:00
Paul Masurel
62d9236200 Bugfix 2017-02-23 17:24:22 +09:00
Paul Masurel
f5f8e130b0 Exposing fstmap 2017-02-23 13:15:23 +09:00
Paul Masurel
d5d9218093 made datastruct public to help generate doc. 2017-02-23 11:12:29 +09:00
Paul Masurel
a44f34c49d NOBUG cleanup 2017-02-22 19:40:01 +09:00
Paul Masurel
d8ea083177 Added block iterator for segment postings. 2017-02-22 18:38:58 +09:00
Paul Masurel
d32dff1da9 NOBUG added advance_block 2017-02-22 10:50:25 +09:00
Paul Masurel
f9ca0b16f1 NOBUG Try block iteration 2017-02-22 10:45:19 +09:00
Paul Masurel
a39fe90930 NOBUG Change the code for Box<Scorer> 2017-02-22 09:38:43 +09:00
85 changed files with 4656 additions and 1287 deletions

View File

@@ -28,6 +28,7 @@ script:
travis-cargo test &&
travis-cargo bench &&
travis-cargo doc
- cargo run --example simple_search
after_success:
- bash ./script/build-doc.sh
- travis-cargo doc-upload

View File

@@ -1,3 +1,20 @@
Tantivy 0.4.0
==========================
- Raised the limit on the number of fields (previously 256)
- Removed u32 fields; they are replaced by u64 and i64 fields (#65) (an order-preserving u64 mapping is sketched after this hunk)
- Replaced rustc_serialize with serde. Kudos to @KodrAus and @lnicola
- QueryParser:
  - An explicit error is returned when searching for a term that is not indexed
  - Searching for an int term via the query parser was broken (e.g. `age:1`)
  - Searching on a non-indexed field returns an explicit Error
  - Phrase queries for non-tokenized fields are not tokenized by the query parser.
Tantivy 0.3.1
==========================
- Expose a method to trigger files garbage collection
Tantivy 0.3
==========================
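The 0.4.0 entry above replaces the old u32 fields with u64 and i64 fields. A common way to reuse unsigned fast-field machinery for signed values is an order-preserving sign-flip mapping; whether tantivy 0.4 uses exactly this transform is an assumption, so the sketch below is illustrative only.

```rust
// Hedged sketch: an order-preserving i64 <-> u64 mapping (assumed, not taken from
// the tantivy sources shown in this diff). Flipping the sign bit maps i64::MIN to 0
// and i64::MAX to u64::MAX, so comparisons on the u64 form match the i64 order.
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

fn u64_to_i64(val: u64) -> i64 {
    (val ^ (1u64 << 63)) as i64
}

fn main() {
    let values = [-5i64, -1, 0, 1, 42];
    let mapped: Vec<u64> = values.iter().map(|&v| i64_to_u64(v)).collect();
    // The mapping preserves ordering, so range filters can run on the u64 form.
    assert!(mapped.windows(2).all(|w| w[0] < w[1]));
    assert_eq!(u64_to_i64(i64_to_u64(-5)), -5);
}
```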

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.3.1"
version = "0.4.0-alpha"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
license = "MIT"
@@ -20,18 +20,20 @@ regex = "0.2"
fst = "0.1.37"
atomicwrites = "0.1.3"
tempfile = "2.1"
rustc-serialize = "0.3"
log = "0.3.6"
combine = "2.2"
tempdir = "0.3"
bincode = "0.5"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
bincode = "0.7.0-alpha7"
libc = {version = "0.2.20", optional=true}
num_cpus = "1.2"
itertools = "0.5.9"
lz4 = "1.20"
bit-set = "0.4.0"
time = "0.1"
uuid = { version = "0.4", features = ["v4", "rustc-serialize"] }
uuid = { version = "0.5", features = ["v4", "serde"] }
chan = "0.1"
version = "2"
crossbeam = "0.2"
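The dependency changes above swap rustc-serialize for serde 1.0 (plus serde_derive and serde_json) and move uuid onto its serde feature. Below is a minimal sketch of what that migration looks like for a serializable struct; the SegmentMeta struct is hypothetical and only stands in for tantivy's own serializable types.

```rust
// Minimal serde sketch (hypothetical struct, not one of tantivy's own types).
#[macro_use]
extern crate serde_derive;
extern crate serde_json;

#[derive(Serialize, Deserialize, Debug)]
struct SegmentMeta {
    segment_id: String,
    max_doc: u32,
}

fn main() {
    let meta = SegmentMeta {
        segment_id: "000-example".to_string(),
        max_doc: 1_000,
    };
    // Pretty-printing here echoes the "pretty print meta.json" commit above.
    let json = serde_json::to_string_pretty(&meta).unwrap();
    let restored: SegmentMeta = serde_json::from_str(&json).unwrap();
    assert_eq!(restored.max_doc, 1_000);
}
```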

View File

@@ -21,4 +21,5 @@ install:
build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
- REM SET RUST_LOG=tantivy,test & cargo run --example simple_search

View File

@@ -4,7 +4,8 @@ mod build {
pub fn build() {
let mut config = gcc::Config::new();
config.include("./cpp/simdcomp/include")
config
.include("./cpp/simdcomp/include")
.file("cpp/simdcomp/src/avxbitpacking.c")
.file("cpp/simdcomp/src/simdintegratedbitpacking.c")
.file("cpp/simdcomp/src/simdbitpacking.c")
@@ -18,18 +19,26 @@ mod build {
config.opt_level(3);
if cfg!(target_env = "msvc") {
config.define("NDEBUG", None)
config
.define("NDEBUG", None)
.flag("/Gm-")
.flag("/GS-")
.flag("/Gy")
.flag("/Oi")
.flag("/GL");
} else {
config.flag("-msse4.1")
.flag("-march=native");
}
}
if !cfg!(target_env = "msvc") {
config
.include("./cpp/streamvbyte/include")
.file("cpp/streamvbyte/src/streamvbyte.c")
.file("cpp/streamvbyte/src/streamvbytedelta.c")
.flag("-msse4.1")
.flag("-march=native")
.flag("-std=c99");
}
config.compile("libsimdcomp.a");
// Workaround for linking static libraries built with /GL
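The build-script hunk above only compiles the streamvbyte sources off MSVC, matching the "Disabled SIMD vbyte compression for msvc" commit. How the Rust side reacts to that is not shown in this diff; the sketch below is a hypothetical illustration of gating on the same target_env check.

```rust
// Hypothetical sketch: mirror the build script's platform check on the Rust side.
// The constant and the fallback mentioned below are assumptions, not tantivy code.
#[cfg(not(target_env = "msvc"))]
const SIMD_VBYTE_AVAILABLE: bool = true;

#[cfg(target_env = "msvc")]
const SIMD_VBYTE_AVAILABLE: bool = false;

fn main() {
    if SIMD_VBYTE_AVAILABLE {
        println!("vbyte blocks use the SIMD streamvbyte codec");
    } else {
        println!("vbyte blocks fall back to a portable scalar codec");
    }
}
```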

cpp/streamvbyte/.gitignore (vendored, new file, 32 lines)
View File

@@ -0,0 +1,32 @@
# Object files
*.o
*.ko
*.obj
*.elf
# Precompiled Headers
*.gch
*.pch
# Libraries
*.lib
*.a
*.la
*.lo
# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib
# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex
# Debug files
*.dSYM/

View File

@@ -0,0 +1,7 @@
language: c
sudo: false
compiler:
- gcc
- clang
script: make && ./unit

cpp/streamvbyte/LICENSE (new file, 202 lines)
View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

cpp/streamvbyte/README.md (new file, 60 lines)
View File

@@ -0,0 +1,60 @@
streamvbyte
===========
[![Build Status](https://travis-ci.org/lemire/streamvbyte.png)](https://travis-ci.org/lemire/streamvbyte)
StreamVByte is a new integer compression technique that applies SIMD instructions (vectorization) to
Google's Group Varint approach. The net result is faster than other byte-oriented compression
techniques.
The approach is patent-free, and the code is available under the Apache License.
It includes fast differential coding.
It assumes a recent Intel processor (e.g., Haswell or better).
The code should build using most standard-compliant C99 compilers. The provided makefile
expects a Linux-like system.
Usage:
make
./unit
See example.c for an example.
Short code sample:
```C
// suppose that datain is an array of uint32_t integers
size_t compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding
// here the result is stored in compressedbuffer using compsize bytes
streamvbyte_decode(compressedbuffer, recovdata, N); // decoding (fast)
```
If the values are sorted, then it might be preferable to use differential coding:
```C
// suppose that datain is an array of uint32_t integers
size_t compsize = streamvbyte_delta_encode(datain, N, compressedbuffer,0); // encoding
// here the result is stored in compressedbuffer using compsize bytes
streamvbyte_delta_decode(compressedbuffer, recovdata, N,0); // decoding (fast)
```
You have to know how many integers were coded when you decompress. You can store this
information along with the compressed stream.
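One simple framing (a sketch only, not part of streamvbyte or tantivy) is to prefix the compressed payload with the element count as four little-endian bytes:

```rust
// Hypothetical framing helper: store the element count in front of the payload.
fn frame(count: u32, payload: &[u8]) -> Vec<u8> {
    let mut framed = Vec::with_capacity(4 + payload.len());
    framed.extend_from_slice(&[
        count as u8,
        (count >> 8) as u8,
        (count >> 16) as u8,
        (count >> 24) as u8,
    ]);
    framed.extend_from_slice(payload);
    framed
}

fn read_count(framed: &[u8]) -> u32 {
    (framed[0] as u32)
        | ((framed[1] as u32) << 8)
        | ((framed[2] as u32) << 16)
        | ((framed[3] as u32) << 24)
}

fn main() {
    let framed = frame(5000, &[1, 2, 3]);
    assert_eq!(read_count(&framed), 5000);
    assert_eq!(&framed[4..], &[1, 2, 3]);
}
```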
See also
--------
* SIMDCompressionAndIntersection: A C++ library to compress and intersect sorted lists of integers using SIMD instructions https://github.com/lemire/SIMDCompressionAndIntersect
* The FastPFOR C++ library : Fast integer compression https://github.com/lemire/FastPFor
* High-performance dictionary coding https://github.com/lemire/dictionary
* LittleIntPacker: C library to pack and unpack short arrays of integers as fast as possible https://github.com/lemire/LittleIntPacker
* The SIMDComp library: A simple C library for compressing lists of integers using binary packing https://github.com/lemire/simdcomp
* MaskedVByte: Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
* CSharpFastPFOR: A C# integer compression library https://github.com/Genbox/CSharpFastPFOR
* JavaFastPFOR: A java integer compression library https://github.com/lemire/JavaFastPFOR
* Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding
* FrameOfReference is a C++ library dedicated to frame-of-reference (FOR) compression: https://github.com/lemire/FrameOfReference
* libvbyte: A fast implementation for varbyte 32bit/64bit integer compression https://github.com/cruppstahl/libvbyte
* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch

cpp/streamvbyte/example.c (new file, 24 lines)
View File

@@ -0,0 +1,24 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "streamvbyte.h"
int main() {
int N = 5000;
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint8_t * compressedbuffer = malloc(N * sizeof(uint32_t));
uint32_t * recovdata = malloc(N * sizeof(uint32_t));
for (int k = 0; k < N; ++k)
datain[k] = 120;
size_t compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding
// here the result is stored in compressedbuffer using compsize bytes
size_t compsize2 = streamvbyte_decode(compressedbuffer, recovdata,
N); // decoding (fast)
assert(compsize == compsize2);
free(datain);
free(compressedbuffer);
free(recovdata);
printf("Compressed %d integers down to %d bytes.\n",N,(int) compsize);
return 0;
}

View File

@@ -0,0 +1,19 @@
#ifndef VARINTDECODE_H_
#define VARINTDECODE_H_
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <stdint.h>// please use a C99-compatible compiler
#include <stddef.h>
// Encode an array of a given length read from in to out in varint format.
// Returns the number of bytes written.
size_t streamvbyte_encode(const uint32_t *in, uint32_t length, uint8_t *out);
// Read "length" 32-bit integers in varint format from in, storing the result in out.
// Returns the number of bytes read.
size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t length);
#endif /* VARINTDECODE_H_ */
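For the Rust side, the two functions declared in this header are all that is needed to call the vendored codec through FFI. The extern declarations below simply mirror the header above (size_t mapped to usize); the safe wrapper and its buffer-size bound (one key byte per four values plus at most four data bytes per value) are a sketch, not code taken from tantivy.

```rust
// FFI declarations mirroring cpp/streamvbyte/include/streamvbyte.h.
// The wrappers below are a hypothetical sketch.
extern "C" {
    fn streamvbyte_encode(input: *const u32, length: u32, output: *mut u8) -> usize;
    fn streamvbyte_decode(input: *const u8, output: *mut u32, length: u32) -> usize;
}

/// Worst-case output size: ceil(n / 4) key bytes plus 4 data bytes per value.
fn max_compressed_len(n: usize) -> usize {
    (n + 3) / 4 + 4 * n
}

fn compress(values: &[u32]) -> Vec<u8> {
    let mut out = vec![0u8; max_compressed_len(values.len())];
    let written =
        unsafe { streamvbyte_encode(values.as_ptr(), values.len() as u32, out.as_mut_ptr()) };
    out.truncate(written);
    out
}

fn decompress(bytes: &[u8], count: usize) -> Vec<u32> {
    // The caller must know `count`; see the README note about storing it alongside the stream.
    let mut out = vec![0u32; count];
    unsafe { streamvbyte_decode(bytes.as_ptr(), out.as_mut_ptr(), count as u32) };
    out
}
```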

View File

@@ -0,0 +1,24 @@
/*
* streamvbytedelta.h
*
* Created on: Apr 14, 2016
* Author: lemire
*/
#ifndef INCLUDE_STREAMVBYTEDELTA_H_
#define INCLUDE_STREAMVBYTEDELTA_H_
// Encode an array of a given length read from in to out in StreamVByte format.
// Returns the number of bytes written.
// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t length, uint8_t *out, uint32_t prev);
// Read "length" 32-bit integers in StreamVByte format from in, storing the result in out.
// Returns the number of bytes read.
// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out, uint32_t length, uint32_t prev);
#endif /* INCLUDE_STREAMVBYTEDELTA_H_ */
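The delta variants above are the natural fit for sorted lists such as doc ids, which is presumably why they matter for the "Using streamvbyte codec for the vbyte part of the encoding" commit. A hypothetical sketch of calling them from Rust:

```rust
// FFI declarations mirroring cpp/streamvbyte/include/streamvbytedelta.h; the wrapper
// is a hypothetical sketch for a sorted (non-decreasing) doc-id list.
extern "C" {
    fn streamvbyte_delta_encode(input: *const u32, length: u32, output: *mut u8, prev: u32) -> usize;
    fn streamvbyte_delta_decode(input: *const u8, output: *mut u32, length: u32, prev: u32) -> usize;
}

fn compress_sorted_doc_ids(doc_ids: &[u32]) -> Vec<u8> {
    // Same worst-case bound as the non-delta codec: key bytes + up to 4 data bytes per value.
    let mut out = vec![0u8; (doc_ids.len() + 3) / 4 + 4 * doc_ids.len()];
    let written = unsafe {
        // prev = 0: the first element is stored as a delta against zero.
        streamvbyte_delta_encode(doc_ids.as_ptr(), doc_ids.len() as u32, out.as_mut_ptr(), 0)
    };
    out.truncate(written);
    out
}
```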

cpp/streamvbyte/makefile (new file, 58 lines)
View File

@@ -0,0 +1,58 @@
# minimalist makefile
.SUFFIXES:
#
.SUFFIXES: .cpp .o .c .h
CFLAGS = -fPIC -march=native -std=c99 -O3 -Wall -Wextra -pedantic -Wshadow
LDFLAGS = -shared
LIBNAME=libstreamvbyte.so.0.0.1
all: unit $(LIBNAME)
test:
	./unit
install: $(OBJECTS)
	cp $(LIBNAME) /usr/local/lib
	ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libstreamvbyte.so
	ldconfig
	cp $(HEADERS) /usr/local/include
HEADERS=./include/streamvbyte.h ./include/streamvbytedelta.h
uninstall:
	for h in $(HEADERS) ; do rm /usr/local/$$h; done
	rm /usr/local/lib/$(LIBNAME)
	rm /usr/local/lib/libstreamvbyte.so
	ldconfig
OBJECTS= streamvbyte.o streamvbytedelta.o
streamvbytedelta.o: ./src/streamvbytedelta.c $(HEADERS)
	$(CC) $(CFLAGS) -c ./src/streamvbytedelta.c -Iinclude
streamvbyte.o: ./src/streamvbyte.c $(HEADERS)
	$(CC) $(CFLAGS) -c ./src/streamvbyte.c -Iinclude
$(LIBNAME): $(OBJECTS)
	$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS)
example: ./example.c $(HEADERS) $(OBJECTS)
	$(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS)
unit: ./tests/unit.c $(HEADERS) $(OBJECTS)
	$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude $(OBJECTS)
dynunit: ./tests/unit.c $(HEADERS) $(LIBNAME)
	$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude -lstreamvbyte
clean:
	rm -f unit *.o $(LIBNAME) example

View File

@@ -0,0 +1,495 @@
#include "streamvbyte.h"
#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__GNUC__) && defined(__ARM_NEON__)
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif
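// lengthTable[key]: total number of data bytes (4..16) consumed by the four values
// whose 2-bit length codes are packed into the key byte `key` (code c => c + 1 bytes).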
static uint8_t lengthTable[256] = { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,
10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10,
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12,
13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8,
9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12,
10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15,
13, 14, 15, 16 };
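// shuffleTable[key]: pshufb mask that scatters the 1-4 packed data bytes of each of the
// four values into separate 32-bit output lanes; -1 entries (0xFF once stored as uint8_t)
// zero the corresponding destination byte.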
static uint8_t shuffleTable[256][16] = { { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1,
-1, -1, 3, -1, -1, -1 }, // 1111
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 2111
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 3111
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 4111
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 1211
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 2211
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 3211
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 4211
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 1311
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 2311
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 3311
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1 }, // 4311
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 }, // 1411
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 }, // 2411
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1 }, // 3411
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 }, // 4411
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 }, // 1121
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 2121
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 3121
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 4121
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 1221
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 2221
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 3221
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 4221
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 1321
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 2321
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 3321
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1 }, // 4321
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 }, // 1421
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 }, // 2421
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1 }, // 3421
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 }, // 4421
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1 }, // 1131
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 2131
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 3131
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 4131
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 1231
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 2231
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 3231
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 4231
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 1331
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 2331
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 3331
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1 }, // 4331
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1 }, // 1431
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1 }, // 2431
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1 }, // 3431
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1 }, // 4431
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 }, // 1141
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 2141
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 3141
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 4141
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 1241
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 2241
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 3241
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 4241
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 1341
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 2341
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 3341
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 }, // 4341
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 }, // 1441
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 }, // 2441
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 }, // 3441
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 }, // 4441
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 }, // 1112
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 2112
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 3112
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 4112
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 1212
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 2212
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 3212
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 4212
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 1312
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 2312
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 3312
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 }, // 4312
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 }, // 1412
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 }, // 2412
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 }, // 3412
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 }, // 4412
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 }, // 1122
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 2122
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 3122
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 4122
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 1222
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 2222
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 3222
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 4222
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 1322
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 2322
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 3322
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 }, // 4322
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 }, // 1422
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 }, // 2422
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 }, // 3422
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 }, // 4422
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1 }, // 1132
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 2132
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 3132
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 4132
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 1232
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 2232
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 3232
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 4232
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 1332
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 2332
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 3332
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 }, // 4332
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 }, // 1432
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 }, // 2432
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 }, // 3432
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 }, // 4432
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 }, // 1142
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 2142
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 3142
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 4142
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 1242
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 2242
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 3242
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 4242
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 1342
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 2342
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 3342
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 }, // 4342
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 }, // 1442
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 }, // 2442
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 }, // 3442
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 }, // 4442
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 }, // 1113
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 2113
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 3113
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 4113
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 1213
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 2213
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 3213
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 4213
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 1313
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 2313
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 3313
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 }, // 4313
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 }, // 1413
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 }, // 2413
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 }, // 3413
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 }, // 4413
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 }, // 1123
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 2123
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 3123
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 4123
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 1223
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 2223
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 3223
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 4223
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 1323
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 2323
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 3323
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 }, // 4323
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 }, // 1423
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 }, // 2423
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 }, // 3423
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 }, // 4423
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 }, // 1133
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 2133
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 3133
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 4133
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 1233
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 2233
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 3233
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 4233
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 1333
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 2333
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 3333
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 }, // 4333
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 }, // 1433
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 }, // 2433
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 }, // 3433
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 }, // 4433
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 }, // 1143
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 2143
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 3143
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 4143
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 1243
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 2243
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 3243
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 4243
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 1343
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 2343
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 3343
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 }, // 4343
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 }, // 1443
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 }, // 2443
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 }, // 3443
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 }, // 4443
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 }, // 1114
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 2114
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 3114
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 4114
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 1214
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 2214
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 3214
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 4214
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 1314
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 2314
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 3314
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 }, // 4314
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 }, // 1414
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 }, // 2414
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 }, // 3414
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 }, // 4414
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 }, // 1124
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 2124
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 3124
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 4124
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 1224
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 2224
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 3224
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 4224
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 1324
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 2324
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 3324
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 }, // 4324
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 }, // 1424
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 }, // 2424
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 }, // 3424
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 }, // 4424
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8 }, // 1134
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 2134
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 3134
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 4134
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 1234
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 2234
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 3234
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 4234
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 1334
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 2334
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 3334
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 }, // 4334
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 }, // 1434
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 }, // 2434
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 }, // 3434
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 }, // 4434
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 }, // 1144
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 2144
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 3144
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 4144
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 1244
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 2244
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 3244
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 4244
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 1344
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 2344
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 3344
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 }, // 4344
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, // 1444
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, // 2444
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, // 3444
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } // 4444
};
static uint8_t _encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
uint8_t *dataPtr = *dataPtrPtr;
uint8_t code;
if (val < (1 << 8)) { // 1 byte
*dataPtr = (uint8_t)(val);
*dataPtrPtr += 1;
code = 0;
} else if (val < (1 << 16)) { // 2 bytes
*(uint16_t *) dataPtr = (uint16_t)(val);
*dataPtrPtr += 2;
code = 1;
} else if (val < (1 << 24)) { // 3 bytes
*(uint16_t *) dataPtr = (uint16_t)(val);
*(dataPtr + 2) = (uint8_t)(val >> 16);
*dataPtrPtr += 3;
code = 2;
} else { // 4 bytes
*(uint32_t *) dataPtr = val;
*dataPtrPtr += 4;
code = 3;
}
return code;
}
static uint8_t *svb_encode_scalar(const uint32_t *in,
uint8_t *__restrict__ keyPtr, uint8_t *__restrict__ dataPtr,
uint32_t count) {
if (count == 0)
return dataPtr; // exit immediately if no data
uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ...
uint8_t key = 0;
for (uint32_t c = 0; c < count; c++) {
if (shift == 8) {
shift = 0;
*keyPtr++ = key;
key = 0;
}
uint32_t val = in[c];
uint8_t code = _encode_data(val, &dataPtr);
key |= code << shift;
shift += 2;
}
*keyPtr = key; // write last key (no increment needed)
return dataPtr; // pointer to first unused data byte
}
// Encode an array of a given length read from in to bout in streamvbyte format.
// Returns the number of bytes written.
size_t streamvbyte_encode(const uint32_t *in, uint32_t count, uint8_t *out) {
uint8_t *keyPtr = out;
uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte
uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
return svb_encode_scalar(in, keyPtr, dataPtr, count) - out;
}
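// Worked example: encoding {3, 300, 70000, 500000000} yields the 2-bit codes
// 0, 1, 2, 3 (1, 2, 3 and 4 data bytes), i.e. one key byte 0b11100100 followed by
// 10 data bytes: 11 bytes in total for four integers, versus 16 bytes uncompressed.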
static inline __m128i _decode_avx(uint32_t key,
const uint8_t *__restrict__ *dataPtrPtr) {
uint8_t len = lengthTable[key];
__m128i Data = _mm_loadu_si128((__m128i *) *dataPtrPtr);
__m128i Shuf = *(__m128i *) &shuffleTable[key];
Data = _mm_shuffle_epi8(Data, Shuf);
*dataPtrPtr += len;
return Data;
}
static inline void _write_avx(uint32_t *out, __m128i Vec) {
_mm_storeu_si128((__m128i *) out, Vec);
}
static inline uint32_t _decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
const uint8_t *dataPtr = *dataPtrPtr;
uint32_t val;
if (code == 0) { // 1 byte
val = (uint32_t) * dataPtr;
dataPtr += 1;
} else if (code == 1) { // 2 bytes
val = (uint32_t) * (uint16_t *) dataPtr;
dataPtr += 2;
} else if (code == 2) { // 3 bytes
val = (uint32_t) * (uint16_t *) dataPtr;
val |= *(dataPtr + 2) << 16;
dataPtr += 3;
} else { // code == 3
val = *(uint32_t *) dataPtr; // 4 bytes
dataPtr += 4;
}
*dataPtrPtr = dataPtr;
return val;
}
static const uint8_t *svb_decode_scalar(uint32_t *outPtr, const uint8_t *keyPtr,
const uint8_t *dataPtr, uint32_t count) {
if (count == 0)
return dataPtr; // no reads or writes if no data
uint8_t shift = 0;
uint32_t key = *keyPtr++;
for (uint32_t c = 0; c < count; c++) {
if (shift == 8) {
shift = 0;
key = *keyPtr++;
}
uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3);
*outPtr++ = val;
shift += 2;
}
return dataPtr; // pointer to first unused byte after end
}
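// svb_decode_avx_simple: decodes 8 key bytes (32 values) per loop iteration using the
// shuffle tables above, then hands the remaining count % 32 values to svb_decode_scalar.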
const uint8_t *svb_decode_avx_simple(uint32_t *out,
const uint8_t *__restrict__ keyPtr, const uint8_t *__restrict__ dataPtr,
uint64_t count) {
uint64_t keybytes = count / 4; // number of key bytes
__m128i Data;
if (keybytes >= 8) {
int64_t Offset = -(int64_t) keybytes / 8 + 1;
const uint64_t *keyPtr64 = (const uint64_t *) keyPtr - Offset;
uint64_t nextkeys = keyPtr64[Offset];
for (; Offset != 0; ++Offset) {
uint64_t keys = nextkeys;
nextkeys = keyPtr64[Offset + 1];
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 4, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 8, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 12, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 16, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 20, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 24, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 28, Data);
out += 32;
}
{
uint64_t keys = nextkeys;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 4, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 8, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 12, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 16, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 20, Data);
keys >>= 16;
Data = _decode_avx((keys & 0xFF), &dataPtr);
_write_avx(out + 24, Data);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
_write_avx(out + 28, Data);
out += 32;
}
}
uint64_t consumedkeys = keybytes - (keybytes & 7);
return svb_decode_scalar(out, keyPtr + consumedkeys, dataPtr, count & 31);
}
// Read count 32-bit integers in streamvbyte format from in, storing the result in out. Returns the number of bytes read.
size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t count) {
if (count == 0)
return 0;
const uint8_t *keyPtr = in; // full list of keys is next
uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up)
const uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys
return svb_decode_avx_simple(out, keyPtr, dataPtr, count) - in;
}

View File

@@ -0,0 +1,575 @@
#include "streamvbyte.h"
#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__GNUC__) && defined(__ARM_NEON__)
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif
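// lengthTable and shuffleTable below are the same lookup tables as in streamvbyte.c above.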
static uint8_t lengthTable[256] = { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,
10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10,
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12,
13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8,
9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12,
10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15,
13, 14, 15, 16 };
static uint8_t shuffleTable[256][16] = { { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1,
-1, -1, 3, -1, -1, -1 }, // 1111
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 2111
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 3111
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 4111
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 1211
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 2211
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 3211
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 4211
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 1311
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 2311
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 3311
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1 }, // 4311
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 }, // 1411
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 }, // 2411
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1 }, // 3411
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 }, // 4411
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 }, // 1121
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 2121
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 3121
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 4121
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 1221
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 2221
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 3221
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 4221
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 1321
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 2321
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 3321
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1 }, // 4321
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 }, // 1421
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 }, // 2421
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1 }, // 3421
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 }, // 4421
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1 }, // 1131
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 2131
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 3131
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 4131
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 1231
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 2231
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 3231
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 4231
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 1331
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 2331
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 3331
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1 }, // 4331
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1 }, // 1431
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1 }, // 2431
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1 }, // 3431
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1 }, // 4431
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 }, // 1141
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 2141
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 3141
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 4141
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 1241
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 2241
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 3241
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 4241
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 1341
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 2341
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 3341
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 }, // 4341
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 }, // 1441
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 }, // 2441
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 }, // 3441
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 }, // 4441
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 }, // 1112
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 2112
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 3112
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 4112
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 1212
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 2212
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 3212
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 4212
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 1312
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 2312
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 3312
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 }, // 4312
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 }, // 1412
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 }, // 2412
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 }, // 3412
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 }, // 4412
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 }, // 1122
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 2122
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 3122
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 4122
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 1222
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 2222
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 3222
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 4222
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 1322
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 2322
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 3322
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 }, // 4322
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 }, // 1422
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 }, // 2422
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 }, // 3422
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 }, // 4422
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1 }, // 1132
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 2132
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 3132
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 4132
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 1232
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 2232
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 3232
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 4232
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 1332
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 2332
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 3332
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 }, // 4332
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 }, // 1432
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 }, // 2432
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 }, // 3432
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 }, // 4432
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 }, // 1142
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 2142
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 3142
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 4142
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 1242
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 2242
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 3242
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 4242
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 1342
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 2342
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 3342
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 }, // 4342
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 }, // 1442
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 }, // 2442
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 }, // 3442
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 }, // 4442
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 }, // 1113
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 2113
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 3113
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 4113
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 1213
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 2213
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 3213
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 4213
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 1313
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 2313
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 3313
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 }, // 4313
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 }, // 1413
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 }, // 2413
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 }, // 3413
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 }, // 4413
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 }, // 1123
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 2123
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 3123
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 4123
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 1223
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 2223
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 3223
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 4223
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 1323
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 2323
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 3323
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 }, // 4323
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 }, // 1423
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 }, // 2423
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 }, // 3423
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 }, // 4423
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 }, // 1133
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 2133
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 3133
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 4133
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 1233
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 2233
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 3233
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 4233
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 1333
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 2333
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 3333
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 }, // 4333
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 }, // 1433
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 }, // 2433
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 }, // 3433
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 }, // 4433
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 }, // 1143
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 2143
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 3143
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 4143
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 1243
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 2243
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 3243
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 4243
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 1343
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 2343
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 3343
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 }, // 4343
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 }, // 1443
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 }, // 2443
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 }, // 3443
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 }, // 4443
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 }, // 1114
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 2114
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 3114
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 4114
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 1214
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 2214
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 3214
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 4214
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 1314
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 2314
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 3314
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 }, // 4314
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 }, // 1414
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 }, // 2414
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 }, // 3414
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 }, // 4414
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 }, // 1124
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 2124
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 3124
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 4124
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 1224
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 2224
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 3224
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 4224
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 1324
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 2324
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 3324
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 }, // 4324
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 }, // 1424
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 }, // 2424
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 }, // 3424
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 }, // 4424
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8 }, // 1134
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 2134
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 3134
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 4134
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 1234
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 2234
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 3234
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 4234
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 1334
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 2334
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 3334
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 }, // 4334
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 }, // 1434
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 }, // 2434
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 }, // 3434
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 }, // 4434
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 }, // 1144
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 2144
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 3144
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 4144
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 1244
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 2244
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 3244
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 4244
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 1344
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 2344
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 3344
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 }, // 4344
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, // 1444
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, // 2444
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, // 3444
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } // 4444
};
static uint8_t _encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
uint8_t *dataPtr = *dataPtrPtr;
uint8_t code;
if (val < (1 << 8)) { // 1 byte
*dataPtr = (uint8_t)(val);
*dataPtrPtr += 1;
code = 0;
} else if (val < (1 << 16)) { // 2 bytes
*(uint16_t *) dataPtr = (uint16_t)(val);
*dataPtrPtr += 2;
code = 1;
} else if (val < (1 << 24)) { // 3 bytes
*(uint16_t *) dataPtr = (uint16_t)(val);
*(dataPtr + 2) = (uint8_t)(val >> 16);
*dataPtrPtr += 3;
code = 2;
} else { // 4 bytes
*(uint32_t *) dataPtr = val;
*dataPtrPtr += 4;
code = 3;
}
return code;
}
static uint8_t *svb_encode_scalar_d1_init(const uint32_t *in,
uint8_t *__restrict__ keyPtr, uint8_t *__restrict__ dataPtr,
uint32_t count, uint32_t prev) {
if (count == 0)
return dataPtr; // exit immediately if no data
uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ...
uint8_t key = 0;
for (uint32_t c = 0; c < count; c++) {
if (shift == 8) {
shift = 0;
*keyPtr++ = key;
key = 0;
}
uint32_t val = in[c] - prev;
prev = in[c];
uint8_t code = _encode_data(val, &dataPtr);
key |= code << shift;
shift += 2;
}
*keyPtr = key; // write last key (no increment needed)
return dataPtr; // pointer to first unused data byte
}
size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t count, uint8_t *out,
uint32_t prev) {
uint8_t *keyPtr = out; // keys are written at the start of the output buffer (no count header)
uint32_t keyLen = (count + 3) / 4; // one 2-bit code per integer, rounded up to whole key bytes
uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
return svb_encode_scalar_d1_init(in, keyPtr, dataPtr, count, prev) - out;
}
static inline __m128i _decode_avx(uint32_t key, const uint8_t *__restrict__ *dataPtrPtr) {
uint8_t len = lengthTable[key];
__m128i Data = _mm_loadu_si128((__m128i *) *dataPtrPtr);
__m128i Shuf = *(__m128i *) &shuffleTable[key];
Data = _mm_shuffle_epi8(Data, Shuf);
*dataPtrPtr += len;
return Data;
}
#define BroadcastLastXMM 0xFF // bits 0-7 all set to choose highest element
static inline void _write_avx(uint32_t *out, __m128i Vec) {
_mm_storeu_si128((__m128i *) out, Vec);
}
static __m128i _write_avx_d1(uint32_t *out, __m128i Vec, __m128i Prev) {
__m128i Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done)
Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P]
Vec = _mm_add_epi32(Vec, Add); // Cycle 2: [A AB BC CD]
Add = _mm_slli_si128(Vec, 8); // Cycle 3: [- - A AB]
Vec = _mm_add_epi32(Vec, Prev); // Cycle 3: [PA PAB PBC PCD]
Vec = _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD]
_write_avx(out, Vec);
return Vec;
}
#ifndef _MSC_VER
static __m128i High16To32 = {0xFFFF0B0AFFFF0908, 0xFFFF0F0EFFFF0D0C};
#else
static __m128i High16To32 = {8, 9, -1, -1, 10, 11, -1, -1,
12, 13, -1, -1, 14, 15, -1, -1};
#endif
static inline __m128i _write_16bit_avx_d1(uint32_t *out, __m128i Vec, __m128i Prev) {
// vec == [A B C D E F G H] (16 bit values)
__m128i Add = _mm_slli_si128(Vec, 2); // [- A B C D E F G]
Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit)
Vec = _mm_add_epi32(Vec, Add); // [A AB BC CD DE EF FG GH]
Add = _mm_slli_si128(Vec, 4); // [- - A AB BC CD DE EF]
Vec = _mm_add_epi32(Vec, Add); // [A AB ABC ABCD BCDE CDEF DEFG EFGH]
__m128i V1 = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit)
V1 = _mm_add_epi32(V1, Prev); // [PA PAB PABC PABCD] (32-bit)
__m128i V2 =
_mm_shuffle_epi8(Vec, High16To32); // [BCDE CDEF DEFG EFGH] (32-bit)
V2 = _mm_add_epi32(V1, V2); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit)
_write_avx(out, V1);
_write_avx(out + 4, V2);
return V2;
}
static inline uint32_t _decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
const uint8_t *dataPtr = *dataPtrPtr;
uint32_t val;
if (code == 0) { // 1 byte
val = (uint32_t) * dataPtr;
dataPtr += 1;
} else if (code == 1) { // 2 bytes
val = (uint32_t) * (uint16_t *) dataPtr;
dataPtr += 2;
} else if (code == 2) { // 3 bytes
val = (uint32_t) * (uint16_t *) dataPtr;
val |= *(dataPtr + 2) << 16;
dataPtr += 3;
} else { // code == 3
val = *(uint32_t *) dataPtr; // 4 bytes
dataPtr += 4;
}
*dataPtrPtr = dataPtr;
return val;
}
const uint8_t *svb_decode_scalar_d1_init(uint32_t *outPtr, const uint8_t *keyPtr,
const uint8_t *dataPtr, uint32_t count,
uint32_t prev) {
if (count == 0)
return dataPtr; // no reads or writes if no data
uint8_t shift = 0;
uint32_t key = *keyPtr++;
for (uint32_t c = 0; c < count; c++) {
if (shift == 8) {
shift = 0;
key = *keyPtr++;
}
uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3);
val += prev;
*outPtr++ = val;
prev = val;
shift += 2;
}
return dataPtr; // pointer to first unused byte after end
}
const uint8_t *svb_decode_avx_d1_init(uint32_t *out, const uint8_t *__restrict__ keyPtr,
const uint8_t *__restrict__ dataPtr, uint64_t count, uint32_t prev) {
uint64_t keybytes = count / 4; // number of key bytes
if (keybytes >= 8) {
__m128i Prev = _mm_set1_epi32(prev);
__m128i Data;
int64_t Offset = -(int64_t) keybytes / 8 + 1;
const uint64_t *keyPtr64 = (const uint64_t *) keyPtr - Offset;
uint64_t nextkeys = keyPtr64[Offset];
for (; Offset != 0; ++Offset) {
uint64_t keys = nextkeys;
nextkeys = keyPtr64[Offset + 1];
// faster 16-bit delta since we only have 8-bit values
if (!keys) { // 32 1-byte ints in a row
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((__m128i *) (dataPtr)));
Prev = _write_16bit_avx_d1(out, Data, Prev);
Data = _mm_cvtepu8_epi16(
_mm_lddqu_si128((__m128i *) (dataPtr + 8)));
Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
Data = _mm_cvtepu8_epi16(
_mm_lddqu_si128((__m128i *) (dataPtr + 16)));
Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
Data = _mm_cvtepu8_epi16(
_mm_lddqu_si128((__m128i *) (dataPtr + 24)));
Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
out += 32;
dataPtr += 32;
continue;
}
Data = _decode_avx(keys & 0x00FF, &dataPtr);
Prev = _write_avx_d1(out, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 4, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 8, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 12, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 16, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 20, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 24, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 28, Data, Prev);
out += 32;
}
{
uint64_t keys = nextkeys;
// faster 16-bit delta since we only have 8-bit values
if (!keys) { // 32 1-byte ints in a row
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((__m128i *) (dataPtr)));
Prev = _write_16bit_avx_d1(out, Data, Prev);
Data = _mm_cvtepu8_epi16(
_mm_lddqu_si128((__m128i *) (dataPtr + 8)));
Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
Data = _mm_cvtepu8_epi16(
_mm_lddqu_si128((__m128i *) (dataPtr + 16)));
Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
Data = _mm_cvtepu8_epi16(
_mm_loadl_epi64((__m128i *) (dataPtr + 24)));
Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
out += 32;
dataPtr += 32;
} else {
Data = _decode_avx(keys & 0x00FF, &dataPtr);
Prev = _write_avx_d1(out, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 4, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 8, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 12, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 16, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 20, Data, Prev);
keys >>= 16;
Data = _decode_avx((keys & 0x00FF), &dataPtr);
Prev = _write_avx_d1(out + 24, Data, Prev);
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
Prev = _write_avx_d1(out + 28, Data, Prev);
out += 32;
}
}
prev = out[-1];
}
uint64_t consumedkeys = keybytes - (keybytes & 7);
return svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr,
count & 31, prev);
}
size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out,
uint32_t count, uint32_t prev) {
uint32_t keyLen = ((count + 3) / 4); // one 2-bit code per integer, rounded up to whole key bytes
const uint8_t *keyPtr = in;
const uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys
return svb_decode_avx_d1_init(out, keyPtr, dataPtr, count, prev) - in;
}
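
For reference, the layout produced by svb_encode_scalar_d1_init and streamvbyte_delta_encode above — one 2-bit length code per integer, four codes packed into each key byte, all key bytes first, then the variable-length delta bytes — can be reproduced with a short scalar sketch in Rust. This is illustrative only: svb_delta_encode_scalar is a hypothetical helper mirroring the scalar C path, not the SIMD decoder.

// Scalar sketch of the stream-vbyte delta layout (illustrative, hypothetical helper).
fn svb_delta_encode_scalar(input: &[u32], mut prev: u32) -> Vec<u8> {
    let key_len = (input.len() + 3) / 4; // one key byte per 4 integers
    let mut keys = vec![0u8; key_len];
    let mut data = Vec::with_capacity(input.len() * 4);
    for (i, &v) in input.iter().enumerate() {
        let delta = v.wrapping_sub(prev);
        prev = v;
        // 2-bit code: 0 => 1 byte, 1 => 2 bytes, 2 => 3 bytes, 3 => 4 bytes
        let code: u8 = if delta < (1 << 8) {
            0
        } else if delta < (1 << 16) {
            1
        } else if delta < (1 << 24) {
            2
        } else {
            3
        };
        data.extend_from_slice(&delta.to_le_bytes()[..code as usize + 1]);
        keys[i / 4] |= code << ((i % 4) * 2);
    }
    keys.extend_from_slice(&data);
    keys // key bytes first, then the variable-length data bytes
}

fn main() {
    // Deltas are 3, 4, 293 and 65536 => codes 0, 0, 1, 2 => key byte 0b10_01_00_00.
    let encoded = svb_delta_encode_scalar(&[3, 7, 300, 65836], 0);
    assert_eq!(encoded[0], 0b10_01_00_00);
    assert_eq!(encoded.len(), 1 + (1 + 1 + 2 + 3)); // one key byte + 7 data bytes
    println!("{:?}", encoded);
}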

View File

@@ -0,0 +1,73 @@
#include <stdio.h>
#include <stdlib.h>
#include "streamvbyte.h"
#include "streamvbytedelta.h"
int main() {
int N = 4096;
uint32_t * datain = malloc(N * sizeof(uint32_t));
uint8_t * compressedbuffer = malloc(2 * N * sizeof(uint32_t));
uint32_t * recovdata = malloc(N * sizeof(uint32_t));
for (int length = 0; length <= N;) {
printf("length = %d \n", length);
for (uint32_t gap = 1; gap <= 387420489; gap *= 3) {
for (int k = 0; k < length; ++k)
datain[k] = gap;
size_t compsize = streamvbyte_encode(datain, length,
compressedbuffer);
size_t usedbytes = streamvbyte_decode(compressedbuffer, recovdata,
length);
if (compsize != usedbytes) {
printf(
"[streamvbyte_decode] code is buggy gap = %d, size mismatch %d %d \n",
(int) gap, (int) compsize, (int) usedbytes);
return -1;
}
for (int k = 0; k < length; ++k) {
if (recovdata[k] != datain[k]) {
printf("[streamvbyte_decode] code is buggy gap = %d\n",
(int) gap);
return -1;
}
}
}
printf("Delta \n");
for (size_t gap = 1; gap <= 531441; gap *= 3) {
for (int k = 0; k < length; ++k)
datain[k] = gap * k;
size_t compsize = streamvbyte_delta_encode(datain, length,
compressedbuffer, 0);
size_t usedbytes = streamvbyte_delta_decode(compressedbuffer,
recovdata, length, 0);
if (compsize != usedbytes) {
printf(
"[streamvbyte_delta_decode] code is buggy gap = %d, size mismatch %d %d \n",
(int) gap, (int) compsize, (int) usedbytes);
return -1;
}
for (int k = 0; k < length; ++k) {
if (recovdata[k] != datain[k]) {
printf(
"[streamvbyte_delta_decode] code is buggy gap = %d\n",
(int) gap);
return -1;
}
}
}
if (length < 128)
++length;
else {
length *= 2;
}
}
free(datain);
free(compressedbuffer);
free(recovdata);
printf("Code looks good.\n");
return 0;
}

View File

@@ -1,4 +1,3 @@
extern crate rustc_serialize;
extern crate tantivy;
extern crate tempdir;

View File

@@ -74,7 +74,8 @@ pub mod tests {
use Score;
use core::SegmentReader;
use SegmentLocalId;
use fastfield::U32FastFieldReader;
use fastfield::U64FastFieldReader;
use fastfield::FastFieldReader;
use schema::Field;
/// Stores all of the doc ids.
@@ -125,9 +126,9 @@ pub mod tests {
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
vals: Vec<u32>,
vals: Vec<u64>,
field: Field,
ff_reader: Option<U32FastFieldReader>,
ff_reader: Option<U64FastFieldReader>,
}
impl FastFieldTestCollector {
@@ -139,14 +140,14 @@ pub mod tests {
}
}
pub fn vals(self,) -> Vec<u32> {
pub fn vals(self,) -> Vec<u64> {
self.vals
}
}
impl Collector for FastFieldTestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = reader.get_fast_field_reader(self.field);
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
Ok(())
}

View File

@@ -4,8 +4,38 @@ use common::serialize::BinarySerializable;
use std::mem;
pub fn compute_num_bits(amplitude: u32) -> u8 {
(32u32 - amplitude.leading_zeros()) as u8
/// Computes the number of bits that will be used for bitpacking.
///
/// In general the target is the minimum number of bits
/// required to express the amplitude given in argument.
///
/// e.g. If the amplitude is 10, we can store all ints using only 4 bits.
///
/// The logic is slightly more convoluted here because, for optimization
/// reasons, we want to ensure that a value spans at most 8
/// aligned bytes.
///
/// Spanning 9 bytes is possible, for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int starts on bit
/// 63 (which belongs to byte 7) and ends in byte 15;
/// hence 9 bytes (from byte 7 to byte 15 included).
///
/// To avoid this, we force the number of bits to 64
/// when the result is greater than `64 - 8 = 56` bits.
///
/// Note that this only affects the rare use cases spanning
/// a very large range of values. Even in this case, it results
/// in an extra cost of at most 12% compared to the optimal
/// number of bits.
pub fn compute_num_bits(amplitude: u64) -> u8 {
let amplitude = (64u32 - amplitude.leading_zeros()) as u8;
if amplitude <= 64 - 8 {
amplitude
}
else {
64
}
}
pub struct BitPacker {
@@ -15,7 +45,7 @@ pub struct BitPacker {
written_size: usize,
}
impl BitPacker {
pub fn new(num_bits: usize) -> BitPacker {
BitPacker {
@@ -26,7 +56,7 @@ impl BitPacker {
}
}
pub fn write<TWrite: Write>(&mut self, val: u32, output: &mut TWrite) -> io::Result<()> {
pub fn write<TWrite: Write>(&mut self, val: u64, output: &mut TWrite) -> io::Result<()> {
let val_u64 = val as u64;
if self.mini_buffer_written + self.num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
@@ -67,22 +97,29 @@ impl BitPacker {
pub struct BitUnpacker {
num_bits: usize,
mask: u32,
mask: u64,
data_ptr: *const u8,
data_len: usize,
}
impl BitUnpacker {
pub fn new(data: &[u8], num_bits: usize) -> BitUnpacker {
let mask: u64 =
if num_bits == 64 {
!0u64
}
else {
(1u64 << num_bits) - 1u64
};
BitUnpacker {
num_bits: num_bits,
mask: (1u32 << num_bits) - 1u32,
mask: mask,
data_ptr: data.as_ptr(),
data_len: data.len()
}
}
pub fn get(&self, idx: usize) -> u32 {
pub fn get(&self, idx: usize) -> u64 {
if self.num_bits == 0 {
return 0;
}
@@ -101,7 +138,7 @@ impl BitUnpacker {
}
val_unshifted_unmasked = unsafe { mem::transmute::<[u8; 8], u64>(arr) };
}
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32;
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
(val_shifted & self.mask)
}
@@ -123,13 +160,14 @@ mod test {
assert_eq!(compute_num_bits(4), 3u8);
assert_eq!(compute_num_bits(255), 8u8);
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
fn test_bitpacker_util(len: usize, num_bits: usize) {
let mut data = Vec::new();
let mut bitpacker = BitPacker::new(num_bits);
let max_val: u32 = (1 << num_bits) - 1;
let vals: Vec<u32> = (0u32..len as u32).map(|i| {
let max_val: u64 = (1 << num_bits) - 1;
let vals: Vec<u64> = (0u64..len as u64).map(|i| {
if max_val == 0 {
0
}
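
To make the rule above concrete, here is a self-contained copy of the new compute_num_bits with a few checks matching the comment and the tests; the asserted values are illustrative only.

// Standalone sketch of the bit-width rule: use the minimal number of bits for
// the amplitude, but jump to 64 bits once more than 56 bits would be needed,
// so that a single value never spans more than 8 aligned bytes.
fn compute_num_bits(amplitude: u64) -> u8 {
    let num_bits = (64u32 - amplitude.leading_zeros()) as u8;
    if num_bits <= 64 - 8 { num_bits } else { 64 }
}

fn main() {
    assert_eq!(compute_num_bits(10), 4);             // 10 fits in 4 bits
    assert_eq!(compute_num_bits(5_000_000_000), 33); // still below the 56-bit cutoff
    assert_eq!(compute_num_bits(1 << 60), 64);       // 61 bits > 56 => forced to 64 bits
    println!("ok");
}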

View File

@@ -0,0 +1,58 @@
use std::io::Write;
use std::io;
pub struct CountingWriter<W: Write> {
underlying: W,
written_bytes: usize,
}
impl<W: Write> CountingWriter<W> {
pub fn wrap(underlying: W) -> CountingWriter<W> {
CountingWriter {
underlying: underlying,
written_bytes: 0,
}
}
pub fn written_bytes(&self,) -> usize {
self.written_bytes
}
pub fn finish(mut self) -> io::Result<(W, usize)> {
self.flush()?;
Ok((self.underlying, self.written_bytes))
}
}
impl<W: Write> Write for CountingWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
let written_size = self.underlying.write(buf)?;
self.written_bytes += written_size;
Ok(written_size)
}
fn flush(&mut self) -> io::Result<()> {
self.underlying.flush()
}
}
#[cfg(test)]
mod test {
use super::CountingWriter;
use std::io::Write;
#[test]
fn test_counting_writer() {
let buffer: Vec<u8> = vec!();
let mut counting_writer = CountingWriter::wrap(buffer);
let bytes = (0u8..10u8).collect::<Vec<u8>>();
counting_writer.write_all(&bytes).unwrap();
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
assert_eq!(len, 10);
assert_eq!(w.len(), 10);
}
}

View File

@@ -2,13 +2,15 @@ mod serialize;
mod timer;
mod vint;
pub mod bitpacker;
mod counting_writer;
pub use self::serialize::BinarySerializable;
pub use self::timer::Timing;
pub use self::timer::TimerTree;
pub use self::timer::OpenTimer;
pub use self::vint::VInt;
pub use self::counting_writer::CountingWriter;
use std::io;
/// Create a default io error given a string.
@@ -28,15 +30,53 @@ pub trait HasLen {
}
}
const HIGHEST_BIT: u64 = 1 << 63;
/// Creates an uninitialized Vec of a given length.
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// `allocate_vec` does an unsafe call to `set_len`
/// as other solutions are extremely slow in debug mode.
pub fn allocate_vec<T>(capacity: usize) -> Vec<T> {
let mut v = Vec::with_capacity(capacity);
unsafe {
v.set_len(capacity);
/// This is better suited than simply casting (`val as u64`)
/// when the values are bitpacked afterwards.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// With a simple cast, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
#[inline(always)]
pub fn i64_to_u64(val: i64) -> u64 {
(val as u64) ^ HIGHEST_BIT
}
/// Reverse the mapping given by
/// `i64_to_u64`.
#[inline(always)]
pub fn u64_to_i64(val: u64) -> i64 {
(val ^ HIGHEST_BIT) as i64
}
#[cfg(test)]
mod test {
use super::{i64_to_u64, u64_to_i64};
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
}
v
}
#[test]
fn test_i64_converter() {
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
test_i64_converter_helper(0i64);
test_i64_converter_helper(i64::min_value());
test_i64_converter_helper(i64::max_value());
for i in -1000i64..1000i64 {
test_i64_converter_helper(i);
}
}
}
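
A small illustration of the mapping documented above; HIGHEST_BIT and i64_to_u64 are copied here so the snippet stands alone, and the numbers simply restate the -10..10 example from the comment.

// Why the xor-with-the-sign-bit mapping helps bitpacking: for values around
// zero, the mapped u64s stay close together, whereas a plain `as u64` cast
// sends negative values above 2^63.
const HIGHEST_BIT: u64 = 1 << 63;

fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ HIGHEST_BIT
}

fn main() {
    let vals: Vec<i64> = (-10..=10).collect();
    let mapped: Vec<u64> = vals.iter().map(|&v| i64_to_u64(v)).collect();
    // The mapping preserves order...
    assert!(mapped.windows(2).all(|w| w[0] < w[1]));
    // ...and the amplitude (max - min) stays at 20, i.e. ~5 bits per value,
    // while casting -10i64 directly would land far above 2^62.
    assert_eq!(mapped.last().unwrap() - mapped.first().unwrap(), 20);
    assert!((-10i64 as u64) > (1 << 62));
    println!("ok");
}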

View File

@@ -71,6 +71,16 @@ impl BinarySerializable for u64 {
}
}
impl BinarySerializable for i64 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
writer.write_i64::<Endianness>(*self)
.map(|_| 8)
}
fn deserialize(reader: &mut Read) -> io::Result<i64> {
reader.read_i64::<Endianness>()
}
}
impl BinarySerializable for u8 {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {

View File

@@ -33,7 +33,7 @@ impl<'a> Drop for OpenTimer<'a> {
}
/// Timing recording
#[derive(Debug, RustcEncodable)]
#[derive(Debug, Serialize)]
pub struct Timing {
name: &'static str,
duration: i64,
@@ -41,7 +41,7 @@ pub struct Timing {
}
/// Timer tree
#[derive(Debug, RustcEncodable)]
#[derive(Debug, Serialize)]
pub struct TimerTree {
timings: Vec<Timing>,
}

View File

@@ -110,7 +110,7 @@ pub mod tests {
let data = generate_array(10_000, 0.1);
let mut encoder = CompositeEncoder::new();
let compressed = encoder.compress_unsorted(&data);
assert_eq!(compressed.len(), 19_790);
assert!(compressed.len() <= 19_794);
let mut decoder = CompositeDecoder::new();
let result = decoder.uncompress_unsorted(&compressed, data.len());
for i in 0..data.len() {
@@ -123,7 +123,7 @@ pub mod tests {
let data = generate_array(10_000, 0.1);
let mut encoder = CompositeEncoder::new();
let compressed = encoder.compress_sorted(&data);
assert_eq!(compressed.len(), 7_822);
assert!(compressed.len() <= 7_826);
let mut decoder = CompositeDecoder::new();
let result = decoder.uncompress_sorted(&compressed, data.len());
for i in 0..data.len() {

View File

@@ -4,16 +4,32 @@
mod composite;
pub use self::composite::{CompositeEncoder, CompositeDecoder};
#[cfg(feature="simdcompression")]
mod compression_simd;
#[cfg(feature="simdcompression")]
pub use self::compression_simd::{BlockEncoder, BlockDecoder};
#[cfg(not(feature="simdcompression"))]
mod compression_nosimd;
#[cfg(not(feature="simdcompression"))]
pub use self::compression_nosimd::{BlockEncoder, BlockDecoder};
mod pack {
mod compression_pack_nosimd;
pub use self::compression_pack_nosimd::*;
}
#[cfg(feature="simdcompression")]
mod pack {
mod compression_pack_simd;
pub use self::compression_pack_simd::*;
}
pub use self::pack::{BlockEncoder, BlockDecoder};
#[cfg( any(not(feature="simdcompression"), target_env="msvc") )]
mod vint {
mod compression_vint_nosimd;
pub use self::compression_vint_nosimd::*;
}
#[cfg( all(feature="simdcompression", not(target_env="msvc")) )]
mod vint {
mod compression_vint_simd;
pub use self::compression_vint_simd::*;
}
pub trait VIntEncoder {
@@ -26,51 +42,16 @@ pub trait VIntDecoder {
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> &'a [u8];
}
impl VIntEncoder for BlockEncoder{
impl VIntEncoder for BlockEncoder {
fn compress_vint_sorted(&mut self, input: &[u32], mut offset: u32) -> &[u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v - offset;
offset = v;
loop {
let next_byte: u8 = (to_encode % 128u32) as u8;
to_encode /= 128u32;
if to_encode == 0u32 {
self.output[byte_written] = next_byte | 128u8;
byte_written += 1;
break;
}
else {
self.output[byte_written] = next_byte;
byte_written += 1;
}
}
}
&self.output[..byte_written]
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8] {
vint::compress_sorted(input, &mut self.output, offset)
}
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v;
loop {
let next_byte: u8 = (to_encode % 128u32) as u8;
to_encode /= 128u32;
if to_encode == 0u32 {
self.output[byte_written] = next_byte | 128u8;
byte_written += 1;
break;
}
else {
self.output[byte_written] = next_byte;
byte_written += 1;
}
}
}
&self.output[..byte_written]
vint::compress_unsorted(input, &mut self.output)
}
}
}
impl VIntDecoder for BlockDecoder {
@@ -79,52 +60,19 @@ impl VIntDecoder for BlockDecoder {
compressed_data: &'a [u8],
offset: u32,
num_els: usize) -> &'a [u8] {
let mut read_byte = 0;
let mut result = offset;
for i in 0..num_els {
let mut shift = 0u32;
loop {
let cur_byte = compressed_data[read_byte];
read_byte += 1;
result += ((cur_byte % 128u8) as u32) << shift;
if cur_byte & 128u8 != 0u8 {
break;
}
shift += 7;
}
self.output[i] = result;
}
self.output_len = num_els;
&compressed_data[read_byte..]
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted<'a>(
&mut self,
compressed_data: &'a [u8],
num_els: usize) -> &'a [u8] {
let mut read_byte = 0;
for i in 0..num_els {
let mut result = 0u32;
let mut shift = 0u32;
loop {
let cur_byte = compressed_data[read_byte];
read_byte += 1;
result += ((cur_byte % 128u8) as u32) << shift;
if cur_byte & 128u8 != 0u8 {
break;
}
shift += 7;
}
self.output[i] = result;
}
self.output_len = num_els;
&compressed_data[read_byte..]
}
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
}
pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize.
@@ -224,7 +172,7 @@ pub mod tests {
#[test]
fn test_encode_vint() {
{
let expected_length = 123;
let expected_length = 154;
let mut encoder = BlockEncoder::new();
let input: Vec<u32> = (0u32..123u32)
.map(|i| 4 + i * 7 / 2)
@@ -232,23 +180,13 @@ pub mod tests {
.collect();
for offset in &[0u32, 1u32, 2u32] {
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
assert_eq!(encoded_data.len(), expected_length);
assert!(encoded_data.len() <= expected_length);
let mut decoder = BlockDecoder::new();
let remaining_data = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
assert_eq!(0, remaining_data.len());
assert_eq!(input, decoder.output_array());
}
}
{
let mut encoder = BlockEncoder::new();
let input = vec!(3u32, 17u32, 187u32);
let encoded_data = encoder.compress_vint_sorted(&input, 0);
assert_eq!(encoded_data.len(), 4);
assert_eq!(encoded_data[0], 3u8 + 128u8);
assert_eq!(encoded_data[1], (17u8 - 3u8) + 128u8);
assert_eq!(encoded_data[2], (187u8 - 17u8 - 128u8));
assert_eq!(encoded_data[3], (1u8 + 128u8));
}
}
@@ -272,4 +210,27 @@ pub mod tests {
});
}
const NUM_INTS_BENCH_VINT: usize = 10;
#[bench]
fn bench_compress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
b.iter(|| {
encoder.compress_vint_sorted(&data, 0u32);
});
}
#[bench]
fn bench_uncompress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
});
}
}

View File

@@ -2,7 +2,7 @@ use common::bitpacker::compute_num_bits;
use common::bitpacker::{BitPacker, BitUnpacker};
use std::cmp;
use std::io::Write;
use super::NUM_DOCS_PER_BLOCK;
use super::super::NUM_DOCS_PER_BLOCK;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;

View File

@@ -1,4 +1,4 @@
use super::NUM_DOCS_PER_BLOCK;
use super::super::NUM_DOCS_PER_BLOCK;
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;

View File

@@ -0,0 +1,92 @@
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v - offset;
offset = v;
loop {
let next_byte: u8 = (to_encode % 128u32) as u8;
to_encode /= 128u32;
if to_encode == 0u32 {
output[byte_written] = next_byte | 128u8;
byte_written += 1;
break;
}
else {
output[byte_written] = next_byte;
byte_written += 1;
}
}
}
&output[..byte_written]
}
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
let mut byte_written = 0;
for &v in input {
let mut to_encode: u32 = v;
loop {
let next_byte: u8 = (to_encode % 128u32) as u8;
to_encode /= 128u32;
if to_encode == 0u32 {
output[byte_written] = next_byte | 128u8;
byte_written += 1;
break;
}
else {
output[byte_written] = next_byte;
byte_written += 1;
}
}
}
&output[..byte_written]
}
#[inline(always)]
pub fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32) -> &'a [u8] {
let mut read_byte = 0;
let mut result = offset;
let num_els = output.len();
for i in 0..num_els {
let mut shift = 0u32;
loop {
let cur_byte = compressed_data[read_byte];
read_byte += 1;
result += ((cur_byte % 128u8) as u32) << shift;
if cur_byte & 128u8 != 0u8 {
break;
}
shift += 7;
}
output[i] = result;
}
&compressed_data[read_byte..]
}
#[inline(always)]
pub fn uncompress_unsorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32]) -> &'a [u8] {
let mut read_byte = 0;
let num_els = output.len();
for i in 0..num_els {
let mut result = 0u32;
let mut shift = 0u32;
loop {
let cur_byte = compressed_data[read_byte];
read_byte += 1;
result += ((cur_byte % 128u8) as u32) << shift;
if cur_byte & 128u8 != 0u8 {
break;
}
shift += 7;
}
output[i] = result;
}
&compressed_data[read_byte..]
}
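
The byte-level format used by compress_sorted / uncompress_sorted can be sketched in a few lines. vint_compress_sorted below is a hypothetical standalone copy of the encoder, and the assert reproduces the byte-level expectations of the unit test removed earlier in this diff; note that the stop bit (128) is set on the last byte of each value, the opposite of the LEB128 convention.

// Scalar sketch of the variable-byte format: deltas against the previous
// value, 7 bits per byte starting with the least-significant group, and the
// high bit set on the *last* byte of each encoded value.
fn vint_compress_sorted(input: &[u32], mut offset: u32) -> Vec<u8> {
    let mut out = Vec::new();
    for &v in input {
        let mut to_encode = v - offset;
        offset = v;
        loop {
            let byte = (to_encode % 128) as u8;
            to_encode /= 128;
            if to_encode == 0 {
                out.push(byte | 128); // last byte of this value
                break;
            } else {
                out.push(byte);
            }
        }
    }
    out
}

fn main() {
    // Deltas are 3, 14 and 170; 170 needs two bytes (42, then 1 with the stop bit).
    let encoded = vint_compress_sorted(&[3, 17, 187], 0);
    assert_eq!(encoded, vec![3u8 + 128, 14 + 128, 42, 1 + 128]);
    println!("{:?}", encoded);
}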

View File

@@ -0,0 +1,82 @@
mod streamvbyte {
use libc::size_t;
extern {
pub fn streamvbyte_delta_encode(
data: *const u32,
num_els: u32,
output: *mut u8,
offset: u32) -> size_t;
pub fn streamvbyte_delta_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: u32,
offset: u32) -> size_t;
pub fn streamvbyte_encode(
data: *const u32,
num_els: u32,
output: *mut u8) -> size_t;
pub fn streamvbyte_decode(
compressed_data: *const u8,
output: *mut u32,
num_els: usize) -> size_t;
}
}
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
let compress_length = unsafe {
streamvbyte::streamvbyte_delta_encode(
input.as_ptr(),
input.len() as u32,
output.as_mut_ptr(),
offset)
};
&output[..compress_length]
}
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
let compress_length = unsafe {
streamvbyte::streamvbyte_encode(
input.as_ptr(),
input.len() as u32,
output.as_mut_ptr())
};
&output[..compress_length]
}
#[inline(always)]
pub fn uncompress_sorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32],
offset: u32) -> &'a [u8] {
let consumed_bytes = unsafe {
streamvbyte::streamvbyte_delta_decode(
compressed_data.as_ptr(),
output.as_mut_ptr(),
output.len() as u32,
offset)
};
&compressed_data[consumed_bytes..]
}
#[inline(always)]
pub fn uncompress_unsorted<'a>(
compressed_data: &'a [u8],
output: &mut [u32]) -> &'a [u8] {
let consumed_bytes = unsafe {
streamvbyte::streamvbyte_decode(
compressed_data.as_ptr(),
output.as_mut_ptr(),
output.len())
};
&compressed_data[consumed_bytes..]
}

View File

@@ -1,10 +1,10 @@
use Result;
use Error;
use serde_json;
use schema::Schema;
use std::sync::Arc;
use std::borrow::BorrowMut;
use std::fmt;
use rustc_serialize::json;
use core::SegmentId;
use directory::{Directory, MmapDirectory, RAMDirectory};
use indexer::index_writer::open_index_writer;
@@ -29,7 +29,7 @@ const NUM_SEARCHERS: usize = 12;
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
json::decode(&meta_string)
serde_json::from_str(&meta_string)
.map_err(|e| Error::CorruptedFile(META_FILEPATH.clone(), Box::new(e)))
}

View File

@@ -9,7 +9,7 @@ use core::SegmentMeta;
/// * the index docstamp
/// * the schema
///
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
#[derive(Clone,Debug,Serialize, Deserialize)]
pub struct IndexMeta {
pub segments: Vec<SegmentMeta>,
pub schema: Schema,

View File

@@ -68,8 +68,8 @@ impl Searcher {
}
/// Returns the segment_reader associated with the given segment_ordinal
pub fn segment_reader(&self, segment_ord: usize) -> &SegmentReader {
&self.segment_readers[segment_ord]
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
&self.segment_readers[segment_ord as usize]
}
/// Runs a query on the segment readers wrapped by the searcher
@@ -78,6 +78,7 @@ impl Searcher {
}
}
impl From<Vec<SegmentReader>> for Searcher {
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
Searcher {

View File

@@ -1,6 +1,5 @@
use uuid::Uuid;
use std::fmt;
use rustc_serialize::{Encoder, Decoder, Encodable, Decodable};
use std::cmp::{Ordering, Ord};
#[cfg(test)]
@@ -14,7 +13,7 @@ use std::sync::atomic;
///
/// In unit tests, for reproducibility, the `SegmentId`s are
/// simply generated in an auto-incrementing fashion.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SegmentId(Uuid);
@@ -65,18 +64,6 @@ impl SegmentId {
}
}
impl Encodable for SegmentId {
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
self.0.encode(s)
}
}
impl Decodable for SegmentId {
fn decode<D: Decoder>(d: &mut D) -> Result<Self, D::Error> {
Uuid::decode(d).map(SegmentId)
}
}
impl fmt::Debug for SegmentId {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Seg({:?})", self.short_uuid_string())

View File

@@ -3,7 +3,7 @@ use super::SegmentComponent;
use std::path::PathBuf;
use std::collections::HashSet;
#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
#[derive(Clone, Debug, Serialize, Deserialize)]
struct DeleteMeta {
num_deleted_docs: u32,
opstamp: u64,
@@ -13,7 +13,7 @@ struct DeleteMeta {
///
/// For instance the number of docs it contains,
/// how many are deleted, etc.
#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SegmentMeta {
segment_id: SegmentId,
max_doc: u32,

View File

@@ -5,25 +5,29 @@ use core::SegmentComponent;
use schema::Term;
use common::HasLen;
use core::SegmentMeta;
use fastfield::delete::DeleteBitSet;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::DeleteBitSet;
use postings::BlockSegmentPostings;
use store::StoreReader;
use schema::Document;
use directory::ReadOnlySource;
use DocId;
use std::str;
use postings::TermInfo;
use datastruct::FstMap;
use datastruct::TermDictionary;
use std::sync::Arc;
use std::fmt;
use schema::Field;
use postings::SegmentPostingsOption;
use postings::SegmentPostings;
use fastfield::{U32FastFieldsReader, U32FastFieldReader};
use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
use schema::Schema;
use schema::FieldType;
use postings::FreqHandler;
use schema::TextIndexingOptions;
/// Entry point to access all of the datastructures of the `Segment`
///
/// - term dictionary
@@ -39,11 +43,11 @@ use schema::TextIndexingOptions;
pub struct SegmentReader {
segment_id: SegmentId,
segment_meta: SegmentMeta,
term_infos: Arc<FstMap<TermInfo>>,
term_infos: Arc<TermDictionary<TermInfo>>,
postings_data: ReadOnlySource,
store_reader: StoreReader,
fast_fields_reader: Arc<U32FastFieldsReader>,
fieldnorms_reader: Arc<U32FastFieldsReader>,
fast_fields_reader: Arc<FastFieldsReader>,
fieldnorms_reader: Arc<FastFieldsReader>,
delete_bitset: DeleteBitSet,
positions_data: ReadOnlySource,
schema: Schema,
@@ -58,6 +62,11 @@ impl SegmentReader {
self.segment_meta.max_doc()
}
pub fn schema(&self) -> &Schema {
&self.schema
}
/// Returns the number of documents.
/// Deleted documents are not counted.
///
@@ -73,31 +82,32 @@ impl SegmentReader {
self.delete_bitset.len() as DocId
}
#[doc(hidden)]
pub fn fast_fields_reader(&self) -> &FastFieldsReader {
&*self.fast_fields_reader
}
/// Accessor to a segment's fast field reader given a field.
pub fn get_fast_field_reader(&self, field: Field) -> Option<U32FastFieldReader> {
/// Returns the u32 fast value reader if the field
/// is a u32 field indexed as "fast".
///
/// Return None if the field is not a u32 field
/// indexed with the fast option.
///
/// # Panics
/// May panic if the index is corrupted.
///
/// Returns the u64 fast value reader if the field
/// is a u64 field indexed as "fast".
///
/// Return a FastFieldNotAvailableError if the field is not
/// declared as a fast field in the schema.
///
/// # Panics
/// May panic if the index is corrupted.
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(&self, field: Field) -> fastfield::Result<TFastFieldReader> {
let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() {
&FieldType::Str(_) => {
warn!("Field <{}> is not a fast field. It is a text field, and fast text fields are not supported yet.", field_entry.name());
None
},
&FieldType::U32(ref u32_options) => {
if u32_options.is_fast() {
self.fast_fields_reader.get_field(field)
}
else {
warn!("Field <{}> is not defined as a fast field.", field_entry.name());
None
}
},
if !TFastFieldReader::is_enabled(field_entry.field_type()) {
Err(FastFieldNotAvailableError::new(field_entry))
}
else {
Ok(
self.fast_fields_reader
.open_reader(field)
.expect("Fast field file corrupted.")
)
}
}
@@ -108,8 +118,8 @@ impl SegmentReader {
///
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U32FastFieldReader> {
self.fieldnorms_reader.get_field(field)
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
self.fieldnorms_reader.open_reader(field)
}
/// Returns the number of documents containing the term.
@@ -129,15 +139,16 @@ impl SegmentReader {
pub fn open(segment: Segment) -> Result<SegmentReader> {
let source = try!(segment.open_read(SegmentComponent::TERMS));
let term_infos = try!(FstMap::from_source(source));
let term_infos = try!(TermDictionary::from_source(source));
let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE)));
let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS));
let fast_field_data = try!(segment.open_read(SegmentComponent::FASTFIELDS));
let fast_fields_reader = try!(U32FastFieldsReader::open(fast_field_data));
let fast_fields_reader = try!(FastFieldsReader::open(fast_field_data));
let fieldnorms_data = try!(segment.open_read(SegmentComponent::FIELDNORMS));
let fieldnorms_reader = try!(U32FastFieldsReader::open(fieldnorms_data));
let fieldnorms_reader = try!(FastFieldsReader::open(fieldnorms_data));
let positions_data = segment
.open_read(SegmentComponent::POSITIONS)
@@ -168,7 +179,7 @@ impl SegmentReader {
}
/// Return the term dictionary datastructure.
pub fn term_infos(&self) -> &FstMap<TermInfo> {
pub fn term_infos(&self) -> &TermDictionary<TermInfo> {
&self.term_infos
}
@@ -181,16 +192,29 @@ impl SegmentReader {
}
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with indexing options that cover
/// the requested options, the method does not fail
/// and returns a `SegmentPostings` with as much information as possible.
///
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
/// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
pub fn postings_data(&self, offset: usize) -> &[u8] {
&self.postings_data[offset..]
}
pub fn get_block_postings(&self) -> BlockSegmentPostings {
BlockSegmentPostings::from_data(0, &self.postings_data[..], FreqHandler::new_without_freq())
}
pub fn read_block_postings_from_terminfo(&self, term_info: &TermInfo, field_type: &FieldType) -> Option<BlockSegmentPostings> {
let offset = term_info.postings_offset as usize;
let postings_data = &self.postings_data[offset..];
let freq_handler = match *field_type {
FieldType::Str(_) => {
FreqHandler::new_without_freq()
}
_ => {
FreqHandler::new_without_freq()
}
};
Some(BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler))
}
pub fn read_block_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<BlockSegmentPostings> {
let field = term.field();
let field_entry = self.schema.get_field_entry(field);
let term_info = get!(self.get_term_info(&term));
@@ -230,7 +254,23 @@ impl SegmentReader {
FreqHandler::new_without_freq()
}
};
Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, &self.delete_bitset, freq_handler))
Some(BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler))
}
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
/// If the field was not indexed with indexing options that cover
/// the requested options, the method does not fail
/// and returns a `SegmentPostings` with as much information as possible.
///
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
/// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
self.read_block_postings(term, option)
.map(|block_postings| {
SegmentPostings::from_block_postings(block_postings, self.delete_bitset.clone())
})
}
@@ -249,7 +289,7 @@ impl SegmentReader {
_ => SegmentPostingsOption::NoFreq,
}
}
FieldType::U32(_) => SegmentPostingsOption::NoFreq
FieldType::U64(_) | FieldType::I64(_) => SegmentPostingsOption::NoFreq
};
self.read_postings(term, segment_posting_option)
}
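
A hedged usage sketch of the new generic fast field accessor, mirroring the collector change earlier in this diff: the accessor is now generic over the reader type and returns a Result instead of an Option. The get(0) call on the reader and the error conversion through `?` are assumptions carried over from the existing u32 readers and the collector code; first_fast_value is a hypothetical crate-internal helper.

use core::SegmentReader;
use fastfield::{FastFieldReader, U64FastFieldReader};
use schema::Field;
use Result;

// Reads the fast field value of doc 0, failing (rather than returning `None`)
// when `field` is not declared as a fast field in the schema.
fn first_fast_value(reader: &SegmentReader, field: Field) -> Result<u64> {
    let ff_reader: U64FastFieldReader = reader.get_fast_field_reader(field)?;
    Ok(ff_reader.get(0)) // assumed accessor, as in the previous u32 readers
}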

View File

@@ -1,7 +1,8 @@
use fst::Streamer;
use std::mem;
use std::collections::BinaryHeap;
use fst::map::Keys;
use postings::TermInfo;
use datastruct::TermDictionaryStreamer;
use schema::Field;
use schema::Term;
use core::SegmentReader;
@@ -34,7 +35,7 @@ impl Ord for HeapItem {
/// - a slice with the ordinals of the segments containing
/// the term.
pub struct TermIterator<'a> {
key_streams: Vec<Keys<'a>>,
key_streams: Vec<TermDictionaryStreamer<'a, TermInfo>>,
heap: BinaryHeap<HeapItem>,
// Buffer hosting the list of segment ordinals containing
// the current term.
@@ -43,7 +44,7 @@ pub struct TermIterator<'a> {
}
impl<'a> TermIterator<'a> {
fn new(key_streams: Vec<Keys<'a>>) -> TermIterator<'a> {
fn new(key_streams: Vec<TermDictionaryStreamer<'a, TermInfo>>) -> TermIterator<'a> {
let key_streams_len = key_streams.len();
TermIterator {
key_streams: key_streams,
@@ -98,7 +99,7 @@ impl<'a> TermIterator<'a> {
fn advance_segments(&mut self) {
for segment_ord in self.current_segment_ords.drain(..) {
if let Some(term) = self.key_streams[segment_ord].next() {
if let Some((term, _val)) = self.key_streams[segment_ord].next() {
self.heap.push(HeapItem {
term: Term::from_bytes(term),
segment_ord: segment_ord,
@@ -126,7 +127,7 @@ impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> {
TermIterator::new(
segment_readers
.iter()
.map(|reader| reader.term_infos().keys())
.map(|reader| reader.term_infos().stream())
.collect()
)
}
@@ -175,9 +176,7 @@ mod tests {
let mut term_it = searcher.terms();
let mut terms = String::new();
while let Some(term) = term_it.next() {
unsafe {
terms.push_str(term.text());
}
terms.push_str(term.text());
}
assert_eq!(terms, "abcdef");
}
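
The merge performed by TermIterator is a classic k-way merge of sorted streams driven by a binary heap; a simplified standalone sketch (hypothetical merge_sorted helper, plain &str terms instead of dictionary streamers, and no grouping of segments sharing a term) looks like this.

use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Each "segment" contributes a sorted stream of terms; the heap always yields
// the smallest pending term together with the ordinal of its segment.
fn merge_sorted<'a>(streams: Vec<Vec<&'a str>>) -> Vec<(&'a str, usize)> {
    let mut heap = BinaryHeap::new();
    let mut cursors = vec![0usize; streams.len()];
    for (ord, stream) in streams.iter().enumerate() {
        if let Some(&term) = stream.first() {
            heap.push(Reverse((term, ord)));
        }
    }
    let mut merged = Vec::new();
    while let Some(Reverse((term, ord))) = heap.pop() {
        merged.push((term, ord));
        cursors[ord] += 1;
        if let Some(&next) = streams[ord].get(cursors[ord]) {
            heap.push(Reverse((next, ord)));
        }
    }
    merged
}

fn main() {
    let merged = merge_sorted(vec![vec!["abc", "def"], vec!["abd", "def"]]);
    assert_eq!(merged, vec![("abc", 0), ("abd", 1), ("def", 0), ("def", 1)]);
    println!("{:?}", merged);
}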

View File

@@ -9,6 +9,7 @@ use directory::ReadOnlySource;
use common::BinarySerializable;
use std::marker::PhantomData;
fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
@@ -92,6 +93,10 @@ impl<V: BinarySerializable> FstMap<V> {
self.fst_index.keys()
}
pub fn fst_index(&self) -> &fst::Map {
&self.fst_index
}
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
let total_len = source.len();
let length_offset = total_len - 4;
@@ -107,8 +112,8 @@ impl<V: BinarySerializable> FstMap<V> {
_phantom_: PhantomData,
})
}
fn read_value(&self, offset: u64) -> V {
pub fn read_value(&self, offset: u64) -> V {
let buffer = self.values_mmap.as_slice();
let mut cursor = &buffer[(offset as usize)..];
V::deserialize(&mut cursor).expect("Data in FST is corrupted")

View File

@@ -1,7 +1,15 @@
mod fstmap;
mod skip;
pub mod stacker;
mod stream_dictionary;
//pub use self::fstmap::FstMapBuilder as TermDictionaryBuilder;
//pub use self::fstmap::FstMap as TermDictionary;
pub use self::stream_dictionary::StreamDictionaryBuilder as TermDictionaryBuilder;
pub use self::stream_dictionary::StreamDictionary as TermDictionary;
pub use self::stream_dictionary::StreamDictionaryStreamer as TermDictionaryStreamer;
pub use self::fstmap::FstMapBuilder;
pub use self::fstmap::FstMap;
pub use self::skip::{SkipListBuilder, SkipList};

View File

@@ -1,10 +1,6 @@
use std::iter;
use std::marker::PhantomData;
use super::heap::{Heap, HeapAllocable, BytesRef};
/// djb2 hash function
fn djb2(key: &[u8]) -> u64 {
let mut state: u64 = 5381;
@@ -57,17 +53,40 @@ pub enum Entry {
/// the computation of the hash of the key twice,
/// or copying the key as long as there is no insert.
///
pub struct HashMap<'a, V> where V: HeapAllocable {
pub struct HashMap<'a> {
table: Box<[KeyValue]>,
heap: &'a Heap,
_phantom: PhantomData<V>,
mask: usize,
occupied: Vec<usize>,
}
impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
struct QuadraticProbing {
hash: usize,
i: usize,
mask: usize,
}
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> {
impl QuadraticProbing {
fn compute(key: &[u8], mask: usize) -> QuadraticProbing {
let hash = djb2(key) as usize;
QuadraticProbing {
hash: hash,
i: 0,
mask: mask,
}
}
#[inline]
fn next(&mut self) -> usize {
self.i += 1;
(self.hash + self.i * self.i) & self.mask
}
}
impl<'a> HashMap<'a> {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a> {
let table_size = 1 << num_bucket_power_of_2;
let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
.take(table_size)
@@ -75,16 +94,17 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
HashMap {
table: table.into_boxed_slice(),
heap: heap,
_phantom: PhantomData,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
}
}
#[inline]
fn bucket(&self, key: &[u8]) -> usize {
let hash: u64 = djb2(key);
(hash as usize) & self.mask
fn probe(&self, key: &[u8]) -> QuadraticProbing {
QuadraticProbing::compute(key, self.mask)
}
pub fn is_saturated(&self) -> bool {
self.table.len() < self.occupied.len() * 5
}
fn get_key(&self, bytes_ref: BytesRef) -> &[u8] {
@@ -100,7 +120,7 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
addr
}
pub fn iter<'b: 'a>(&'b self,) -> impl Iterator<Item=(&'a [u8], (u32, &'a V))> + 'b {
pub fn iter<'b: 'a>(&'b self,) -> impl Iterator<Item=(&'a [u8], u32)> + 'b {
let heap: &'a Heap = self.heap;
let table: &'b [KeyValue] = &self.table;
self.occupied
@@ -109,23 +129,11 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
.map(move |bucket: usize| {
let kv = table[bucket];
let addr = kv.value_addr;
let v: &V = heap.get_mut_ref::<V>(addr);
(heap.get_slice(kv.key), (addr, v))
(heap.get_slice(kv.key), addr)
})
// .map(move |addr: u32| (heap.get_mut_ref::<V>(addr)) )
}
pub fn values_mut<'b: 'a>(&'b self,) -> impl Iterator<Item=&'a mut V> + 'b {
let heap: &'a Heap = self.heap;
let table: &'b [KeyValue] = &self.table;
self.occupied
.iter()
.cloned()
.map(move |bucket: usize| table[bucket].value_addr)
.map(move |addr: u32| heap.get_mut_ref::<V>(addr))
}
pub fn get_or_create<S: AsRef<[u8]>>(&mut self, key: S) -> &mut V {
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> &mut V {
let entry = self.lookup(key.as_ref());
match entry {
Entry::Occupied(addr) => {
@@ -141,8 +149,9 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
pub fn lookup<S: AsRef<[u8]>>(&self, key: S) -> Entry {
let key_bytes: &[u8] = key.as_ref();
let mut bucket = self.bucket(key_bytes);
let mut probe = self.probe(key_bytes);
loop {
let bucket = probe.next();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
return Entry::Vacant(bucket);
@@ -150,7 +159,6 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
if self.get_key(kv.key) == key_bytes {
return Entry::Occupied(kv.value_addr);
}
bucket = (bucket + 1) & self.mask;
}
}
}
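For reference, the probe sequence walked by `lookup` above can be reproduced with a few standalone lines: starting from `djb2(key)`, bucket `(hash + i*i) & mask` is inspected for i = 1, 2, 3, ... until an empty or matching slot is found. The helper below is purely illustrative (arbitrary hash, 16-bucket table), not the tantivy code:

fn probe_sequence(hash: usize, mask: usize, n: usize) -> Vec<usize> {
    // Quadratic probing: the i-th probe lands on (hash + i^2) & mask.
    (1..n + 1).map(|i| hash.wrapping_add(i * i) & mask).collect()
}

fn main() {
    // 16-bucket table => mask = 15. With hash = 7 the first probes are
    // (7+1)&15 = 8, (7+4)&15 = 11, (7+9)&15 = 0, (7+16)&15 = 7.
    assert_eq!(probe_sequence(7, 15, 4), vec![8, 11, 0, 7]);
}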
@@ -183,14 +191,11 @@ mod tests {
#[test]
fn test_hash_map() {
let heap = Heap::with_capacity(2_000_000);
let mut hash_map: HashMap<TestValue> = HashMap::new(18, &heap);
let mut hash_map: HashMap = HashMap::new(18, &heap);
{
{
let v: &mut TestValue = hash_map.get_or_create("abc");
assert_eq!(v.val, 0u32);
v.val = 3u32;
}
}
{
let v: &mut TestValue = hash_map.get_or_create("abcd");
@@ -205,10 +210,18 @@ mod tests {
let v: &mut TestValue = hash_map.get_or_create("abcd");
assert_eq!(v.val, 4u32);
}
let mut iter_values = hash_map.values_mut();
assert_eq!(iter_values.next().unwrap().val, 3u32);
assert_eq!(iter_values.next().unwrap().val, 4u32);
assert!(!iter_values.next().is_some());
let mut iter_values = hash_map.iter();
{
let (_, addr) = iter_values.next().unwrap();
let val: &TestValue = heap.get_ref(addr);
assert_eq!(val.val, 3u32);
}
{
let (_, addr) = iter_values.next().unwrap();
let val: &TestValue = heap.get_ref(addr);
assert_eq!(val.val, 4u32);
}
assert!(iter_values.next().is_none());
}
#[bench]

View File

@@ -1,6 +1,5 @@
use std::cell::UnsafeCell;
use std::mem;
use common::allocate_vec;
use std::ptr;
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
@@ -46,11 +45,6 @@ impl Heap {
pub fn capacity(&self,) -> u32 {
self.inner().capacity()
}
/// Return the amount of memory that has been allocated so far.
pub fn len(&self,) -> u32 {
self.inner().len()
}
/// Return amount of free space, in bytes.
pub fn num_free_bytes(&self,) -> u32 {
@@ -86,8 +80,14 @@ impl Heap {
pub fn set<Item>(&self, addr: u32, val: &Item) {
self.inner().set(addr, val);
}
/// Returns a mutable reference for an object at a given Item.
/// Returns a reference to an `Item` at a given `addr`.
#[cfg(test)]
pub fn get_ref<Item>(&self, addr: u32) -> &Item {
self.inner().get_mut_ref(addr)
}
/// Returns a mutable reference to an `Item` at a given `addr`.
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
self.inner().get_mut_ref(addr)
}
@@ -100,81 +100,58 @@ impl Heap {
struct InnerHeap {
buffer: Vec<u8>,
buffer_len: u32,
used: u32,
next_heap: Option<Box<InnerHeap>>,
has_been_resized: bool,
}
impl InnerHeap {
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
let buffer: Vec<u8> = allocate_vec(num_bytes);
let buffer: Vec<u8> = vec![0u8; num_bytes];
InnerHeap {
buffer: buffer,
buffer_len: num_bytes as u32,
next_heap: None,
used: 0u32,
has_been_resized: false,
}
}
pub fn clear(&mut self) {
self.used = 0u32;
self.next_heap = None;
}
pub fn capacity(&self,) -> u32 {
self.buffer.len() as u32
}
pub fn len(&self,) -> u32 {
self.used
}
// Returns the number of free bytes. If the buffer
// has reached its capacity and has been resized, returns 0.
pub fn num_free_bytes(&self,) -> u32 {
if self.next_heap.is_some() {
if self.has_been_resized {
0u32
}
else {
self.buffer_len - self.used
(self.buffer.len() as u32) - self.used
}
}
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
let addr = self.used;
self.used += num_bytes as u32;
if self.used <= self.buffer_len {
addr
let buffer_len = self.buffer.len();
if self.used > buffer_len as u32 {
self.buffer.resize(buffer_len * 2, 0u8);
self.has_been_resized = true
}
else {
if self.next_heap.is_none() {
warn!("Exceeded heap size. The margin was apparently unsufficient. The segment will be committed right after indexing this very last document.");
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
}
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
}
addr
}
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
if start >= self.buffer_len {
self.next_heap.as_ref().unwrap().get_slice(start - self.buffer_len, stop - self.buffer_len)
}
else {
&self.buffer[start as usize..stop as usize]
}
&self.buffer[start as usize..stop as usize]
}
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
if start >= self.buffer_len {
self.next_heap.as_mut().unwrap().get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
}
else {
&mut self.buffer[start as usize..stop as usize]
}
&mut self.buffer[start as usize..stop as usize]
}
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
@@ -188,40 +165,23 @@ impl InnerHeap {
}
fn get_mut(&mut self, addr: u32) -> *mut u8 {
if addr >= self.buffer_len {
self.next_heap.as_mut().unwrap().get_mut(addr - self.buffer_len)
}
else {
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
}
let addr_isize = addr as isize;
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
}
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
if addr >= self.buffer_len {
self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len)
}
else {
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
unsafe { &mut *v_ptr }
}
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
let v_ptr = v_ptr_u8 as *mut Item;
unsafe { &mut *v_ptr }
}
fn set<Item>(&mut self, addr: u32, val: &Item) {
if addr >= self.buffer_len {
self.next_heap.as_mut().unwrap().set(addr - self.buffer_len, val);
}
else {
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
unsafe {
let dest_ptr: *mut u8 = self.get_mut(addr);
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
}
let v_ptr: *const Item = val as *const Item;
let v_ptr_u8: *const u8 = v_ptr as *const u8;
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
unsafe {
let dest_ptr: *mut u8 = self.get_mut(addr);
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
}
}
}
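The `Heap`/`InnerHeap` pair above is essentially a bump allocator over a single byte buffer addressed by `u32` offsets, now growing by resizing the buffer instead of chaining a `next_heap`. A minimal standalone sketch of that idea (the `Arena` type is made up for illustration, and its resize is made overflow-safe):

struct Arena {
    buffer: Vec<u8>,
    used: u32,
}

impl Arena {
    fn with_capacity(num_bytes: usize) -> Arena {
        Arena { buffer: vec![0u8; num_bytes], used: 0 }
    }

    // Reserves `num_bytes` and returns the start address of the slot,
    // growing the buffer when it would overflow.
    fn allocate(&mut self, num_bytes: usize) -> u32 {
        let addr = self.used;
        self.used += num_bytes as u32;
        if self.used as usize > self.buffer.len() {
            let new_len = (self.buffer.len() * 2).max(self.used as usize);
            self.buffer.resize(new_len, 0u8);
        }
        addr
    }

    fn slice_mut(&mut self, start: u32, stop: u32) -> &mut [u8] {
        &mut self.buffer[start as usize..stop as usize]
    }
}

fn main() {
    let mut arena = Arena::with_capacity(16);
    let addr = arena.allocate(4);
    arena.slice_mut(addr, addr + 4).copy_from_slice(b"abcd");
    assert_eq!(&arena.buffer[addr as usize..(addr + 4) as usize], b"abcd");
}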

View File

@@ -18,10 +18,10 @@ fn test_unrolled_linked_list() {
ks.push(2);
ks.push(3);
for k in (1..5).map(|k| k * 100) {
let mut hashmap: HashMap<ExpUnrolledLinkedList> = HashMap::new(10, &heap);
let mut hashmap: HashMap = HashMap::new(10, &heap);
for j in 0..k {
for i in 0..500 {
let mut list = hashmap.get_or_create(i.to_string());
let mut list: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string());
list.push(i*j, &heap);
}
}

View File

@@ -0,0 +1,465 @@
#![allow(should_implement_trait)]
use std::cmp::max;
use std::io;
use std::io::Write;
use std::io::Read;
use fst;
use fst::raw::Fst;
use common::VInt;
use directory::ReadOnlySource;
use common::BinarySerializable;
use std::marker::PhantomData;
use common::CountingWriter;
use std::cmp::Ordering;
use fst::{IntoStreamer, Streamer};
use std::str;
use fst::raw::Node;
use fst::raw::CompiledAddr;
const BLOCK_SIZE: usize = 1024;
fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
pub struct StreamDictionaryBuilder<W: Write, V: BinarySerializable + Clone + Default> {
write: CountingWriter<W>,
block_index: fst::MapBuilder<Vec<u8>>,
last_key: Vec<u8>,
len: usize,
_phantom_: PhantomData<V>,
}
fn common_prefix_length(left: &[u8], right: &[u8]) -> usize {
left.iter().cloned()
.zip(right.iter().cloned())
.take_while(|&(b1, b2)| b1 == b2)
.count()
}
fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec<u8>) {
loop {
if let Some(transition) = node.transitions().last() {
buffer.push(transition.inp);
node = fst.node(transition.addr);
}
else {
break;
}
}
}
fn strictly_previous_key<B: AsRef<[u8]>>(fst_map: &fst::Map, key_as_ref: B) -> (Vec<u8>, u64) {
let key = key_as_ref.as_ref();
let fst = fst_map.as_fst();
let mut node = fst.root();
let mut node_stack: Vec<Node> = vec!(node.clone());
// first check the longest prefix.
for &b in &key[..key.len() - 1] {
node = match node.find_input(b) {
None => {
break;
},
Some(i) => {
fst.node(node.transition_addr(i))
},
};
node_stack.push(node);
}
let len_node_stack = node_stack.len();
for i in (1..len_node_stack).rev() {
let cur_node = &node_stack[i];
let b: u8 = key[i];
let last_transition_opt = cur_node
.transitions()
.take_while(|transition| transition.inp < b)
.last();
if let Some(last_transition) = last_transition_opt {
let mut result = Vec::from(&key[..i]);
result.push(last_transition.inp);
let fork_node = fst.node(last_transition.addr);
fill_last(fst, fork_node, &mut result);
let val = fst_map.get(&result).unwrap();
return (result, val);
}
else if cur_node.is_final() {
// the previous key is a prefix
let result_buffer = Vec::from(&key[..i]);
let val = fst_map.get(&result_buffer).unwrap();
return (result_buffer, val);
}
}
return (vec!(), 0);
}
impl<W: Write, V: BinarySerializable + Clone + Default> StreamDictionaryBuilder<W, V> {
pub fn new(write: W) -> io::Result<StreamDictionaryBuilder<W, V>> {
let buffer: Vec<u8> = vec!();
Ok(StreamDictionaryBuilder {
write: CountingWriter::wrap(write),
block_index: fst::MapBuilder::new(buffer)
.expect("This cannot fail"),
last_key: Vec::with_capacity(128),
len: 0,
_phantom_: PhantomData,
})
}
fn add_index_entry(&mut self) {
self.block_index.insert(&self.last_key, self.write.written_bytes() as u64).unwrap();
}
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()>{
self.insert_key(key)?;
self.insert_value(value)
}
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()>{
if self.len % BLOCK_SIZE == 0 {
self.add_index_entry();
}
self.len += 1;
let common_len = common_prefix_length(key, &self.last_key);
VInt(common_len as u64).serialize(&mut self.write)?;
self.last_key.truncate(common_len);
self.last_key.extend_from_slice(&key[common_len..]);
VInt((key.len() - common_len) as u64).serialize(&mut self.write)?;
self.write.write_all(&key[common_len..])?;
Ok(())
}
pub fn insert_value(&mut self, value: &V) -> io::Result<()>{
value.serialize(&mut self.write)?;
Ok(())
}
pub fn finish(mut self) -> io::Result<W> {
self.add_index_entry();
let (mut w, split_len) = self.write.finish()?;
let fst_write = self.block_index
.into_inner()
.map_err(convert_fst_error)?;
w.write_all(&fst_write)?;
(split_len as u64).serialize(&mut w)?;
w.flush()?;
Ok(w)
}
}
fn stream_before<'a, V: 'a + Clone + Default + BinarySerializable>(stream_dictionary: &'a StreamDictionary<V>, target_key: &[u8]) -> StreamDictionaryStreamer<'a, V> {
let (prev_key, offset) = strictly_previous_key(&stream_dictionary.fst_index, target_key.as_ref());
let offset: usize = offset as usize;
StreamDictionaryStreamer {
cursor: &stream_dictionary.stream_data.as_slice()[offset..],
current_key: Vec::from(prev_key),
current_value: V::default(),
}
}
pub struct StreamDictionary<V> where V:BinarySerializable + Default + Clone {
stream_data: ReadOnlySource,
fst_index: fst::Map,
_phantom_: PhantomData<V>,
}
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
Ok(fst::Map::from(match source {
ReadOnlySource::Anonymous(data) => try!(Fst::from_shared_bytes(data.data, data.start, data.len).map_err(convert_fst_error)),
ReadOnlySource::Mmap(mmap_readonly) => try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)),
}))
}
impl<V: BinarySerializable + Clone + Default> StreamDictionary<V> {
pub fn from_source(source: ReadOnlySource) -> io::Result<StreamDictionary<V>> {
let total_len = source.len();
let length_offset = total_len - 8;
let split_len: usize = {
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
u64::deserialize(&mut split_len_buffer)? as usize
};
let stream_data = source.slice(0, split_len);
let fst_data = source.slice(split_len, length_offset);
let fst_index = open_fst_index(fst_data)?;
Ok(StreamDictionary {
stream_data: stream_data,
fst_index: fst_index,
_phantom_: PhantomData
})
}
pub fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<V> {
let mut streamer = stream_before(self, target_key.as_ref());
while let Some((iter_key, iter_val)) = streamer.next() {
match iter_key.cmp(target_key.as_ref()) {
Ordering::Less => {}
Ordering::Equal => {
let val: V = (*iter_val).clone();
return Some(val);
}
Ordering::Greater => {
return None;
}
}
}
return None;
}
pub fn range(&self) -> StreamDictionaryStreamerBuilder<V> {
let data: &[u8] = &self.stream_data;
StreamDictionaryStreamerBuilder {
stream_dictionary: &self,
offset_from: data.as_ptr() as usize,
offset_to: (data.as_ptr() as usize) + data.len(),
current_key: vec!(),
}
}
pub fn stream(&self) -> StreamDictionaryStreamer<V> {
StreamDictionaryStreamer {
cursor: &*self.stream_data,
current_key: Vec::with_capacity(128),
current_value: V::default(),
}
}
}
pub struct StreamDictionaryStreamerBuilder<'a, V: 'a + BinarySerializable + Clone + Default> {
stream_dictionary: &'a StreamDictionary<V>,
offset_from: usize,
offset_to: usize,
current_key: Vec<u8>,
}
/// Returns offset information for the first
/// key in the stream that does not match the given predicate.
///
/// Returns `(start offset, previous key needed to decode the entry)`.
fn get_offset<'a, V, P: Fn(&[u8])->bool>(predicate: P, mut streamer: StreamDictionaryStreamer<V>) -> (usize, Vec<u8>)
where V: 'a + BinarySerializable + Clone + Default {
let mut prev: &[u8] = streamer.cursor;
let mut prev_data: Vec<u8> = streamer.current_key.clone();
while let Some((iter_key, _)) = streamer.next() {
if !predicate(iter_key) {
return (prev.as_ptr() as usize, prev_data);
}
prev = streamer.cursor;
prev_data.clear();
prev_data.extend_from_slice(iter_key);
}
return (prev.as_ptr() as usize, prev_data);
}
impl<'a, V: 'a + BinarySerializable + Clone + Default> StreamDictionaryStreamerBuilder<'a, V> {
pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> StreamDictionaryStreamerBuilder<'a, V> {
let target_key = bound.as_ref();
let streamer = stream_before(&self.stream_dictionary, target_key.as_ref());
let smaller_than = |k: &[u8]| { k.lt(target_key) };
let (offset_before, current_key) = get_offset(smaller_than, streamer);
self.current_key = current_key;
self.offset_from = offset_before;
self
}
pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamDictionaryStreamerBuilder<'a, V> {
let target_key = bound.as_ref();
let streamer = stream_before(self.stream_dictionary, target_key.as_ref());
let smaller_than = |k: &[u8]| { k.le(target_key) };
let (offset_before, current_key) = get_offset(smaller_than, streamer);
self.current_key = current_key;
self.offset_from = offset_before;
self
}
pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamDictionaryStreamerBuilder<'a, V> {
let target_key = bound.as_ref();
let streamer = stream_before(self.stream_dictionary, target_key.as_ref());
let smaller_than = |k: &[u8]| { k.le(target_key) };
let (offset_before, _) = get_offset(smaller_than, streamer);
self.offset_to = offset_before;
self
}
pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> StreamDictionaryStreamerBuilder<'a, V> {
let target_key = bound.as_ref();
let streamer = stream_before(self.stream_dictionary, target_key.as_ref());
let smaller_than = |k: &[u8]| { k.lt(target_key) };
let (offset_before, _) = get_offset(smaller_than, streamer);
self.offset_to = offset_before;
self
}
pub fn into_stream(self) -> StreamDictionaryStreamer<'a, V> {
let data: &[u8] = &self.stream_dictionary.stream_data.as_slice()[..];
let origin = data.as_ptr() as usize;
let start = self.offset_from - origin;
let stop = max(self.offset_to - origin, start);
StreamDictionaryStreamer {
cursor: &data[start..stop],
current_key: self.current_key,
current_value: V::default(),
}
}
}
pub struct StreamDictionaryStreamer<'a, V: BinarySerializable> {
cursor: &'a [u8],
current_key: Vec<u8>,
current_value: V,
}
impl<'a, V: BinarySerializable> StreamDictionaryStreamer<'a, V> {
pub fn next(&mut self) -> Option<(&[u8], &V)> {
if self.cursor.len() == 0 {
return None;
}
let common_length: usize = VInt::deserialize(&mut self.cursor).unwrap().0 as usize;
let new_length: usize = common_length + VInt::deserialize(&mut self.cursor).unwrap().0 as usize;
self.current_key.reserve(new_length);
unsafe {
self.current_key.set_len(new_length);
}
self.cursor.read_exact(&mut self.current_key[common_length..new_length]).unwrap();
self.current_value = V::deserialize(&mut self.cursor).unwrap();
Some((&self.current_key, &self.current_value))
}
pub fn key(&self) -> &[u8] {
&self.current_key
}
pub fn value(&self) -> &V {
&self.current_value
}
}
#[cfg(test)]
mod test {
use std::str;
use directory::ReadOnlySource;
use super::CountingWriter;
use std::io::Write;
use super::{BLOCK_SIZE, StreamDictionary, StreamDictionaryBuilder};
#[test]
fn test_stream_dictionary() {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.collect();
let buffer: Vec<u8> = {
let mut stream_dictionary_builder = StreamDictionaryBuilder::new(vec!()).unwrap();
for &(ref id, ref i) in &ids {
stream_dictionary_builder.insert(id.as_bytes(), i).unwrap();
}
stream_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let stream_dictionary: StreamDictionary<u32> = StreamDictionary::from_source(source).unwrap();
{
let mut streamer = stream_dictionary.stream();
let mut i = 0;
while let Some((streamer_k, streamer_v)) = streamer.next() {
let &(ref key, ref v) = &ids[i];
assert_eq!(streamer_k, key.as_bytes());
assert_eq!(streamer_v, v);
i += 1;
}
}
let &(ref key, ref _v) = &ids[2047];
stream_dictionary.get(key.as_bytes());
}
#[test]
fn test_stream_range() {
let ids: Vec<_> = (0u32..10_000u32)
.map(|i| (format!("doc{:0>6}", i), i))
.collect();
let buffer: Vec<u8> = {
let mut stream_dictionary_builder = StreamDictionaryBuilder::new(vec!()).unwrap();
for &(ref id, ref i) in &ids {
stream_dictionary_builder.insert(id.as_bytes(), i).unwrap();
}
stream_dictionary_builder.finish().unwrap()
};
let source = ReadOnlySource::from(buffer);
let stream_dictionary: StreamDictionary<u32> = StreamDictionary::from_source(source).unwrap();
{
for i in (0..20).chain((BLOCK_SIZE - 10..BLOCK_SIZE + 10)) {
let &(ref target_key, _) = &ids[i];
let mut streamer = stream_dictionary
.range()
.ge(target_key.as_bytes())
.into_stream();
for j in 0..3 {
let (streamer_k, streamer_v) = streamer.next().unwrap();
let &(ref key, ref v) = &ids[i + j];
assert_eq!(str::from_utf8(streamer_k).unwrap(), key);
assert_eq!(streamer_v, v);
}
}
}
{
for i in (0..20).chain((BLOCK_SIZE - 10..BLOCK_SIZE + 10)) {
let &(ref target_key, _) = &ids[i];
let mut streamer = stream_dictionary
.range()
.gt(target_key.as_bytes())
.into_stream();
for j in 0..3 {
let (streamer_k, streamer_v) = streamer.next().unwrap();
let &(ref key, ref v) = &ids[i + j + 1];
assert_eq!(streamer_k, key.as_bytes());
assert_eq!(streamer_v, v);
}
}
}
{
for i in (0..20).chain((BLOCK_SIZE - 10..BLOCK_SIZE + 10)) {
for j in 0..3 {
let &(ref fst_key, _) = &ids[i];
let &(ref last_key, _) = &ids[i + 3];
let mut streamer = stream_dictionary
.range()
.ge(fst_key.as_bytes())
.lt(last_key.as_bytes())
.into_stream();
for _ in 0..(j + 1) {
assert!(streamer.next().is_some());
}
assert!(streamer.next().is_some());
}
}
}
}
}
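The stream dictionary's on-disk layout is front coding: keys arrive in sorted order, so each entry stores only the length of the prefix shared with the previous key plus its suffix (followed by the serialized value), and an fst block index every `BLOCK_SIZE` keys gives a nearby starting point for sequential decoding. A standalone sketch of the encoding step (the `front_code` helper is illustrative only, mirroring `insert_key`):

fn common_prefix_length(left: &[u8], right: &[u8]) -> usize {
    left.iter().zip(right.iter()).take_while(|&(a, b)| a == b).count()
}

// Encodes sorted keys as (shared_prefix_len, suffix) pairs.
fn front_code(keys: &[&str]) -> Vec<(usize, Vec<u8>)> {
    let mut last: Vec<u8> = Vec::new();
    let mut out = Vec::new();
    for key in keys {
        let bytes = key.as_bytes();
        let common = common_prefix_length(bytes, &last);
        out.push((common, bytes[common..].to_vec()));
        last.truncate(common);
        last.extend_from_slice(&bytes[common..]);
    }
    out
}

fn main() {
    // "doc000000", "doc000001", "doc000010" encode as
    // (0, "doc000000"), (8, "1"), (7, "10").
    let encoded = front_code(&["doc000000", "doc000001", "doc000010"]);
    assert_eq!(encoded[1], (8, b"1".to_vec()));
    assert_eq!(encoded[2], (7, b"10".to_vec()));
}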

View File

@@ -1,4 +1,5 @@
use std::path::{Path, PathBuf};
use serde_json;
use directory::error::{OpenReadError, DeleteError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use std::result;
@@ -7,7 +8,6 @@ use Directory;
use std::sync::{Arc, RwLock};
use std::collections::HashSet;
use std::io::Write;
use rustc_serialize::json;
use core::MANAGED_FILEPATH;
use std::collections::HashMap;
use std::fmt;
@@ -74,7 +74,7 @@ impl ManagedDirectory {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = json::decode(&managed_files_json)
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?;
Ok(ManagedDirectory {
directory: box directory,
@@ -204,8 +204,8 @@ impl ManagedDirectory {
.expect("Managed file lock poisoned");
managed_paths = meta_informations_rlock.managed_paths.clone();
}
let mut w = vec!();
try!(write!(&mut w, "{}\n", json::as_pretty_json(&managed_paths)));
let mut w = try!(serde_json::to_vec(&managed_paths));
try!(write!(&mut w, "\n"));
self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
Ok(())
}
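With rustc_serialize gone, persisting the managed file list is a plain serde_json call followed by a newline, as in the new code above. A minimal sketch of that step (the `encode_managed_paths` helper name is made up for illustration; `serde_json::to_vec_pretty` would give the pretty-printed variant):

use std::collections::HashSet;
use std::path::PathBuf;

fn encode_managed_paths(paths: &HashSet<PathBuf>) -> serde_json::Result<Vec<u8>> {
    // Serialize the set of managed paths as a JSON array, then append '\n'.
    let mut w = serde_json::to_vec(paths)?;
    w.push(b'\n');
    Ok(w)
}

fn main() {
    let mut paths = HashSet::new();
    paths.insert(PathBuf::from("meta.json"));
    let bytes = encode_managed_paths(&paths).unwrap();
    assert!(bytes.ends_with(b"\n"));
}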

View File

@@ -53,7 +53,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadE
}
#[derive(Default,Clone,Debug,RustcDecodable,RustcEncodable)]
#[derive(Default,Clone,Debug,Serialize,Deserialize)]
pub struct CacheCounters {
// Number of time the cache prevents to call `mmap`
pub hit: usize,
@@ -65,7 +65,7 @@ pub struct CacheCounters {
pub miss_weak: usize,
}
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
#[derive(Clone,Debug,Serialize,Deserialize)]
pub struct CacheInfo {
pub counters: CacheCounters,
pub mmapped: Vec<PathBuf>,

View File

@@ -79,3 +79,10 @@ impl Clone for ReadOnlySource {
self.slice(0, self.len())
}
}
impl From<Vec<u8>> for ReadOnlySource {
fn from(data: Vec<u8>) -> ReadOnlySource {
let shared_data = SharedVecSlice::from(data);
ReadOnlySource::Anonymous(shared_data)
}
}

View File

@@ -35,3 +35,9 @@ impl SharedVecSlice {
}
}
}
impl From<Vec<u8>> for SharedVecSlice {
fn from(data: Vec<u8>) -> SharedVecSlice {
SharedVecSlice::new(Arc::new(data))
}
}

View File

@@ -10,9 +10,8 @@ use std::sync::PoisonError;
use directory::error::{OpenReadError, OpenWriteError, OpenDirectoryError};
use query;
use schema;
use fastfield::FastFieldNotAvailableError;
use serde_json;
/// Generic tantivy error.
@@ -38,7 +37,14 @@ pub enum Error {
ErrorInThread(String),
/// An Error appeared related to the lack of a field.
SchemaError(String),
/// Tried to access a fastfield reader for a field not configured accordingly.
FastFieldError(FastFieldNotAvailableError)
}
impl From<FastFieldNotAvailableError> for Error {
fn from(fastfield_error: FastFieldNotAvailableError) -> Error {
Error::FastFieldError(fastfield_error)
}
}
impl From<io::Error> for Error {
@@ -95,3 +101,9 @@ impl From<OpenDirectoryError> for Error {
}
}
}
impl From<serde_json::Error> for Error {
fn from(error: serde_json::Error) -> Error {
Error::IOError(error.into())
}
}
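These `From` implementations exist so that the `?` operator can lift `io::Error`, `serde_json::Error`, or `FastFieldNotAvailableError` into the crate-level `Error` automatically. A generic standalone illustration of that mechanism (the `MyError`/`read_meta` names are stand-ins, not tantivy's):

use std::fs;
use std::io;

#[derive(Debug)]
enum MyError {
    Io(io::Error),
}

impl From<io::Error> for MyError {
    fn from(e: io::Error) -> MyError {
        MyError::Io(e)
    }
}

fn read_meta(path: &str) -> Result<String, MyError> {
    // `?` calls `From<io::Error> for MyError` behind the scenes.
    let data = fs::read_to_string(path)?;
    Ok(data)
}

fn main() {
    // Reading a file that does not exist surfaces as MyError::Io.
    assert!(read_meta("no-such-file.json").is_err());
}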

View File

@@ -6,6 +6,9 @@ use directory::ReadOnlySource;
use DocId;
use common::HasLen;
/// Writes a delete `BitSet`,
///
/// where `delete_bitset` is the set of deleted `DocId`s.
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
let max_doc = delete_bitset.capacity();
let mut byte = 0u8;
@@ -29,14 +32,16 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io:
writer.flush()
}
/// Set of deleted `DocId`s.
#[derive(Clone)]
pub struct DeleteBitSet {
data: ReadOnlySource,
len: usize,
}
impl DeleteBitSet {
impl DeleteBitSet {
/// Opens a delete bitset given its data source.
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
let num_deleted: usize = data
.as_slice()
@@ -49,6 +54,7 @@ impl DeleteBitSet {
}
}
/// Returns an empty delete bit set.
pub fn empty() -> DeleteBitSet {
DeleteBitSet {
data: ReadOnlySource::empty(),
@@ -56,10 +62,12 @@ impl DeleteBitSet {
}
}
/// Returns true iff the segment has some deleted documents.
pub fn has_deletes(&self) -> bool {
self.len() > 0
}
/// Returns true iff the document is deleted.
pub fn is_deleted(&self, doc: DocId) -> bool {
if self.len == 0 {
false

src/fastfield/error.rs (new file)
View File

@@ -0,0 +1,26 @@
use std::result;
use schema::FieldEntry;
/// `FastFieldNotAvailableError` is returned when the
/// user requests a fast field reader for a field that was not
/// defined in the schema as a fast field.
#[derive(Debug)]
pub struct FastFieldNotAvailableError {
field_name: String,
}
impl FastFieldNotAvailableError {
/// Creates a `FastFieldNotAvailableError`.
/// `field_entry` is the configuration of the field
/// for which fast fields are not available.
pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError {
FastFieldNotAvailableError {
field_name: field_entry.name().to_string(),
}
}
}
/// Result when trying to access a fast field reader.
pub type Result<R> = result::Result<R, FastFieldNotAvailableError>;

View File

@@ -1,23 +1,39 @@
/// Fast field module
///
/// Fast fields are the equivalent of `DocValues` in `Lucene`.
/// Fast fields are stored in column-oriented fashion and allow fast
/// random access given a `DocId`.
///
/// Their performance is comparable to that of an array lookup.
/// They are useful when a field is required for all or most of
/// the `DocSet`: for instance for scoring, grouping, filtering, or faceting.
///
/// Currently only u32 fast fields are supported.
//! # Fast fields
//!
//! Fast fields are the equivalent of `DocValues` in `Lucene`.
//! Fast fields are `tantivy`'s non-compressed, column-oriented storage.
//!
//! They are designed for fast random access to a document field
//! given a document id.
//!
//! `FastField`s are useful when a field is required for all or most of
//! the `DocSet`: for instance for scoring, grouping, filtering, or faceting.
//!
//! Fields have to be declared as `FAST` in the schema.
//! Currently only 64-bit integers (signed or unsigned) are supported.
//!
//! They are stored in a bitpacked fashion so that their
//! memory usage grows linearly with the amplitude of the stored values.
//!
//! Read access performance is comparable to that of an array lookup.
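Concretely, "linear with the amplitude" means each value is stored as a delta from the column minimum, using just enough bits to cover `max - min`. The real bit-width computation lives in `common::bitpacker::compute_num_bits`; the standalone helper below is only an illustration of the rule:

// Number of bits needed to represent any delta in 0..=amplitude.
fn num_bits_for_amplitude(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    // Example from the test below: values 13, 14, 2 -> min = 2,
    // amplitude = 12, so 4 bits per value.
    assert_eq!(num_bits_for_amplitude(12), 4);
    // A constant column (amplitude 0) needs 0 bits per value.
    assert_eq!(num_bits_for_amplitude(0), 0);
}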
mod reader;
mod writer;
mod serializer;
pub mod delete;
mod error;
mod delete;
pub use self::writer::{U32FastFieldsWriter, U32FastFieldWriter};
pub use self::reader::{U32FastFieldsReader, U32FastFieldReader};
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
pub use self::reader::{FastFieldsReader, U64FastFieldReader, I64FastFieldReader};
pub use self::reader::FastFieldReader;
pub use self::serializer::FastFieldSerializer;
pub use self::error::{Result, FastFieldNotAvailableError};
#[cfg(test)]
mod tests {
@@ -30,6 +46,7 @@ mod tests {
use schema::FAST;
use test::Bencher;
use test;
use fastfield::FastFieldReader;
use rand::Rng;
use rand::SeedableRng;
use rand::XorShiftRng;
@@ -37,7 +54,7 @@ mod tests {
lazy_static! {
static ref SCHEMA: Schema = {
let mut schema_builder = SchemaBuilder::default();
schema_builder.add_u32_field("field", FAST);
schema_builder.add_u64_field("field", FAST);
schema_builder.build()
};
static ref FIELD: Field = {
@@ -45,15 +62,15 @@ mod tests {
};
}
fn add_single_field_doc(fast_field_writers: &mut U32FastFieldsWriter, field: Field, value: u32) {
fn add_single_field_doc(fast_field_writers: &mut FastFieldsWriter, field: Field, value: u64) {
let mut doc = Document::default();
doc.add_u32(field, value);
doc.add_u64(field, value);
fast_field_writers.add_document(&doc);
}
#[test]
pub fn test_fastfield() {
let test_fastfield = U32FastFieldReader::from(vec!(100,200,300));
let test_fastfield = U64FastFieldReader::from(vec!(100,200,300));
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);
@@ -66,23 +83,23 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 2u32);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 2u64);
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 20 as usize);
assert_eq!(source.len(), 31 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
assert_eq!(fast_field_reader.get(0), 13u32);
assert_eq!(fast_field_reader.get(1), 14u32);
assert_eq!(fast_field_reader.get(2), 2u32);
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
}
}
@@ -93,35 +110,35 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 777u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 215u32);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 777u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u64);
add_single_field_doc(&mut fast_field_writers, *FIELD, 215u64);
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 45 as usize);
assert_eq!(source.len(), 56 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
assert_eq!(fast_field_reader.get(0), 4u32);
assert_eq!(fast_field_reader.get(1), 14_082_001u32);
assert_eq!(fast_field_reader.get(2), 3_052u32);
assert_eq!(fast_field_reader.get(3), 9002u32);
assert_eq!(fast_field_reader.get(4), 15_001u32);
assert_eq!(fast_field_reader.get(5), 777u32);
assert_eq!(fast_field_reader.get(6), 1_002u32);
assert_eq!(fast_field_reader.get(7), 1_501u32);
assert_eq!(fast_field_reader.get(8), 215u32);
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
assert_eq!(fast_field_reader.get(3), 9002u64);
assert_eq!(fast_field_reader.get(4), 15_001u64);
assert_eq!(fast_field_reader.get(5), 777u64);
assert_eq!(fast_field_reader.get(6), 1_002u64);
assert_eq!(fast_field_reader.get(7), 1_501u64);
assert_eq!(fast_field_reader.get(8), 215u64);
}
}
@@ -134,30 +151,123 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for _ in 0..10_000 {
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u32);
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64);
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 18 as usize);
assert_eq!(source.len(), 29 as usize);
}
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u32);
assert_eq!(fast_field_reader.get(doc), 100_000u64);
}
}
}
fn generate_permutation() -> Vec<u32> {
#[test]
fn test_intfastfield_large_numbers() {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
// forcing the amplitude to be high
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
for i in 0u64..10_000u64 {
add_single_field_doc(&mut fast_field_writers, *FIELD, 5_000_000_000_000_000_000u64 + i);
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 80037 as usize);
}
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(fast_field_reader.get(doc), 5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
}
}
}
#[test]
fn test_signed_intfastfield() {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
let mut schema_builder = SchemaBuilder::new();
let i64_field = schema_builder.add_i64_field("field", FAST);
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for i in -100i64..10_000i64 {
let mut doc = Document::default();
doc.add_i64(i64_field, i);
fast_field_writers.add_document(&doc);
}
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
assert_eq!(source.len(), 17704 as usize);
}
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: I64FastFieldReader = fast_field_readers.open_reader(i64_field).unwrap();
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
for (doc, i) in (-100i64..10_000i64).enumerate() {
assert_eq!(fast_field_reader.get(doc as u32), i);
}
}
}
#[test]
fn test_signed_intfastfield_default_val() {
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
let mut schema_builder = SchemaBuilder::new();
let i64_field = schema_builder.add_i64_field("field", FAST);
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let doc = Document::default();
fast_field_writers.add_document(&doc);
fast_field_writers.serialize(&mut serializer).unwrap();
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: I64FastFieldReader = fast_field_readers.open_reader(i64_field).unwrap();
assert_eq!(fast_field_reader.get(0u32), 0i64);
}
}
fn generate_permutation() -> Vec<u64> {
let seed: &[u32; 4] = &[1, 2, 3, 4];
let mut rng = XorShiftRng::from_seed(*seed);
let mut permutation: Vec<u32> = (0u32..1_000_000u32).collect();
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
rng.shuffle(&mut permutation);
permutation
}
@@ -171,7 +281,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
}
@@ -180,10 +290,11 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
let mut a = 0u32;
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
let mut a = 0u64;
for _ in 0..n {
println!("i {}=> {} {}", a, fast_field_reader.get(a as u32), permutation[a as usize]);
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
a = fast_field_reader.get(a as u32);
}
@@ -195,7 +306,7 @@ mod tests {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u32;
let mut a = 0u64;
for i in (0u32..n).step_by(7) {
a ^= permutation[i as usize];
}
@@ -208,7 +319,7 @@ mod tests {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;
let mut a = 0u64;
for _ in 0u32..n {
a = permutation[a as usize];
}
@@ -224,7 +335,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
}
@@ -233,11 +344,11 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u32;
let mut a = 0u64;
for i in (0u32..n).step_by(7) {
a ^= fast_field_reader.get(i);
}
@@ -254,7 +365,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for x in &permutation {
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
}
@@ -263,13 +374,13 @@ mod tests {
}
let source = directory.open_read(&path).unwrap();
{
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
let fast_field_readers = FastFieldsReader::open(source).unwrap();
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
b.iter(|| {
let n = test::black_box(1000u32);
let mut a = 0u32;
for _ in 0u32..n {
a = fast_field_reader.get(a);
a = fast_field_reader.get(a) as u32;
}
a
});

View File

@@ -1,7 +1,5 @@
use std::io;
use std::collections::HashMap;
use std::ops::Deref;
use directory::ReadOnlySource;
use common::BinarySerializable;
use DocId;
@@ -10,83 +8,123 @@ use std::path::Path;
use schema::FAST;
use directory::{WritePtr, RAMDirectory, Directory};
use fastfield::FastFieldSerializer;
use fastfield::U32FastFieldsWriter;
use fastfield::FastFieldsWriter;
use common::bitpacker::compute_num_bits;
use common::bitpacker::BitUnpacker;
use schema::FieldType;
use common;
/// Trait for accessing a fastfield.
///
/// Depending on the field type, a different
/// fast field reader is required.
pub trait FastFieldReader: Sized {
/// Type of the value stored in the fastfield.
type ValueType;
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
fn get(&self, doc: DocId) -> Self::ValueType;
lazy_static! {
static ref U32_FAST_FIELD_EMPTY: ReadOnlySource = {
let u32_fast_field = U32FastFieldReader::from(Vec::new());
u32_fast_field._data.clone()
};
/// Opens a fast field given a source.
fn open(source: ReadOnlySource) -> Self;
/// Returns true iff the given field_type makes
/// it possible to access the field values via a
/// fastfield.
fn is_enabled(field_type: &FieldType) -> bool;
}
pub struct U32FastFieldReader {
/// FastFieldReader for unsigned 64-bits integers.
pub struct U64FastFieldReader {
_data: ReadOnlySource,
bit_unpacker: BitUnpacker,
min_val: u32,
max_val: u32,
min_value: u64,
max_value: u64,
}
impl U32FastFieldReader {
unsafe impl Send for U64FastFieldReader {}
unsafe impl Sync for U64FastFieldReader {}
pub fn empty() -> U32FastFieldReader {
U32FastFieldReader::open(U32_FAST_FIELD_EMPTY.clone())
impl U64FastFieldReader {
/// Returns the minimum value for this fast field.
///
/// The min value does not take deleted documents into account,
/// and should be considered a lower bound
/// on the actual minimum value.
pub fn min_value(&self,) -> u64 {
self.min_value
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take deleted documents into account,
/// and should be considered an upper bound
/// on the actual maximum value.
pub fn max_value(&self,) -> u64 {
self.max_value
}
}
impl FastFieldReader for U64FastFieldReader {
type ValueType = u64;
fn get(&self, doc: DocId) -> u64 {
self.min_value + self.bit_unpacker.get(doc as usize)
}
pub fn min_val(&self,) -> u32 {
self.min_val
}
pub fn max_val(&self,) -> u32 {
self.max_val
fn is_enabled(field_type: &FieldType) -> bool {
match field_type {
&FieldType::U64(ref integer_options) =>
integer_options.is_fast(),
_ => false,
}
}
/// Opens a new fast field reader given a read only source.
///
/// # Panics
/// Panics if the data is corrupted.
pub fn open(data: ReadOnlySource) -> U32FastFieldReader {
let min_val;
let amplitude;
let max_val;
fn open(data: ReadOnlySource) -> U64FastFieldReader {
let min_value: u64;
let max_value: u64;
let bit_unpacker: BitUnpacker;
{
let mut cursor = data.as_slice();
min_val = u32::deserialize(&mut cursor).unwrap();
amplitude = u32::deserialize(&mut cursor).unwrap();
max_val = min_val + amplitude;
let mut cursor: &[u8] = data.as_slice();
min_value = u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
let amplitude = u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
bit_unpacker = BitUnpacker::new(cursor, num_bits as usize)
}
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = {
let data_arr = &(data.deref()[8..]);
BitUnpacker::new(data_arr, num_bits as usize)
};
U32FastFieldReader {
U64FastFieldReader {
_data: data,
bit_unpacker: bit_unpacker,
min_val: min_val,
max_val: max_val,
min_value: min_value,
max_value: max_value,
}
}
pub fn get(&self, doc: DocId) -> u32 {
self.min_val + self.bit_unpacker.get(doc as usize)
}
}
impl From<Vec<u32>> for U32FastFieldReader {
fn from(vals: Vec<u32>) -> U32FastFieldReader {
impl From<Vec<u64>> for U64FastFieldReader {
fn from(vals: Vec<u64>) -> U64FastFieldReader {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u32_field("field", FAST);
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
let path = Path::new("test");
let mut directory: RAMDirectory = RAMDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::new(write).unwrap();
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&schema);
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for val in vals {
let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
fast_field_writer.add_val(val);
@@ -95,29 +133,103 @@ impl From<Vec<u32>> for U32FastFieldReader {
serializer.close().unwrap();
}
let source = directory.open_read(&path).unwrap();
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
fast_field_readers.get_field(field).unwrap()
let fast_field_readers = FastFieldsReader::open(source).unwrap();
fast_field_readers.open_reader(field).unwrap()
}
}
pub struct U32FastFieldsReader {
/// FastFieldReader for signed 64-bits integers.
pub struct I64FastFieldReader {
underlying: U64FastFieldReader,
}
unsafe impl Send for I64FastFieldReader {}
unsafe impl Sync for I64FastFieldReader {}
impl I64FastFieldReader {
/// Returns the minimum value for this fast field.
///
/// The min value does not take deleted documents into account,
/// and should be considered a lower bound
/// on the actual minimum value.
pub fn min_value(&self,) -> i64 {
common::u64_to_i64(self.underlying.min_value())
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take deleted documents into account,
/// and should be considered an upper bound
/// on the actual maximum value.
pub fn max_value(&self,) -> i64 {
common::u64_to_i64(self.underlying.max_value())
}
}
impl FastFieldReader for I64FastFieldReader {
type ValueType = i64;
fn get(&self, doc: DocId) -> i64 {
common::u64_to_i64(self.underlying.get(doc))
}
/// Opens a new fast field reader given a read only source.
///
/// # Panics
/// Panics if the data is corrupted.
fn open(data: ReadOnlySource) -> I64FastFieldReader {
I64FastFieldReader {
underlying: U64FastFieldReader::open(data)
}
}
fn is_enabled(field_type: &FieldType) -> bool {
match field_type {
&FieldType::I64(ref integer_options) =>
integer_options.is_fast(),
_ => false,
}
}
}
/// The `FastFieldsReader` is the data structure containing
/// all of the fast fields' data.
///
/// It contains a mapping that associates each field with
/// the proper slice in the fast field file.
pub struct FastFieldsReader {
source: ReadOnlySource,
field_offsets: HashMap<Field, (u32, u32)>,
}
impl U32FastFieldsReader {
pub fn open(source: ReadOnlySource) -> io::Result<U32FastFieldsReader> {
impl FastFieldsReader {
/// Opens the `FastFieldsReader` file.
///
/// When opening the fast field reader,
/// the list of offsets is read (as a footer of the
/// data file).
pub fn open(source: ReadOnlySource) -> io::Result<FastFieldsReader> {
let header_offset;
let field_offsets: Vec<(Field, u32)>;
{
let buffer = source.as_slice();
{
let mut cursor = buffer;
header_offset = try!(u32::deserialize(&mut cursor));
header_offset = u32::deserialize(&mut cursor)?;
}
{
let mut cursor = &buffer[header_offset as usize..];
field_offsets = try!(Vec::deserialize(&mut cursor));
field_offsets = Vec::deserialize(&mut cursor)?;
}
}
let mut end_offsets: Vec<u32> = field_offsets
@@ -130,26 +242,26 @@ impl U32FastFieldsReader {
let (field, start_offset) = *field_start_offsets;
field_offsets_map.insert(field, (start_offset, *stop_offset));
}
Ok(U32FastFieldsReader {
Ok(FastFieldsReader {
field_offsets: field_offsets_map,
source: source,
})
}
/// Returns the u32 fast value reader if the field
/// is a u32 field indexed as "fast".
/// Returns the u64 fast value reader if the field
/// is a u64 field indexed as "fast".
///
/// Return None if the field is not a u32 field
/// Return None if the field is not a u64 field
/// indexed with the fast option.
///
/// # Panics
/// May panic if the index is corrupted.
pub fn get_field(&self, field: Field) -> Option<U32FastFieldReader> {
pub fn open_reader<FFReader: FastFieldReader>(&self, field: Field) -> Option<FFReader> {
self.field_offsets
.get(&field)
.map(|&(start, stop)| {
let field_source = self.source.slice(start as usize, stop as usize);
U32FastFieldReader::open(field_source)
FFReader::open(field_source)
})
}
}
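Reading a value is therefore just `min_value + bitpacked_delta`, as in `U64FastFieldReader::get` above. A standalone sketch with the `BitUnpacker` replaced by a plain vector of deltas (the `DeltaColumn` type is illustrative only):

struct DeltaColumn {
    min_value: u64,
    deltas: Vec<u64>, // stands in for the bitpacked block
}

impl DeltaColumn {
    fn get(&self, doc: usize) -> u64 {
        // Stored values are offsets from the column minimum.
        self.min_value + self.deltas[doc]
    }
}

fn main() {
    // Values 13, 14, 2 stored as min_value = 2 and deltas [11, 12, 0].
    let col = DeltaColumn { min_value: 2, deltas: vec![11, 12, 0] };
    assert_eq!((col.get(0), col.get(1), col.get(2)), (13, 14, 2));
}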

View File

@@ -14,13 +14,13 @@ use std::io::{self, Write, Seek, SeekFrom};
/// the serializer.
/// The serializer expects to receive the following calls.
///
/// * `new_u32_fast_field(...)`
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `new_u32_fast_field(...)`
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
@@ -29,7 +29,7 @@ pub struct FastFieldSerializer {
write: WritePtr,
written_size: usize,
fields: Vec<(Field, u32)>,
min_value: u32,
min_value: u64,
field_open: bool,
bit_packer: BitPacker,
}
@@ -50,8 +50,8 @@ impl FastFieldSerializer {
})
}
/// Start serializing a new u32 fast field
pub fn new_u32_fast_field(&mut self, field: Field, min_value: u32, max_value: u32) -> io::Result<()> {
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(&mut self, field: Field, min_value: u64, max_value: u64) -> io::Result<()> {
if self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
}
@@ -68,14 +68,14 @@ impl FastFieldSerializer {
}
/// Pushes a new value to the currently open u32 fast field.
pub fn add_val(&mut self, val: u32) -> io::Result<()> {
let val_to_write: u32 = val - self.min_value;
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer.write(val_to_write, &mut self.write)?;
Ok(())
}
/// Close the u32 fast field.
/// Close the u64 fast field.
pub fn close_field(&mut self,) -> io::Result<()> {
if !self.field_open {
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));

View File

@@ -3,47 +3,85 @@ use fastfield::FastFieldSerializer;
use std::io;
use schema::Value;
use DocId;
use common;
use schema::FieldType;
pub struct U32FastFieldsWriter {
field_writers: Vec<U32FastFieldWriter>,
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
field_writers: Vec<IntFastFieldWriter>,
}
impl U32FastFieldsWriter {
impl FastFieldsWriter {
pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter {
let u32_fields: Vec<Field> = schema.fields()
/// Creates all of the fast field writers required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let field_writers: Vec<IntFastFieldWriter> = schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_u32_fast())
.map(|(field_id, _)| Field(field_id as u8))
.flat_map(|(field_id, field_entry)| {
let field = Field(field_id as u32);
match field_entry.field_type() {
&FieldType::I64(ref int_options) => {
if int_options.is_fast() {
let mut fast_field_writer = IntFastFieldWriter::new(field);
fast_field_writer.set_val_if_missing(common::i64_to_u64(0i64));
Some(fast_field_writer)
}
else {
None
}
}
&FieldType::U64(ref int_options) => {
if int_options.is_fast() {
Some(IntFastFieldWriter::new(field))
}
else {
None
}
}
_ => None
}
})
.collect();
U32FastFieldsWriter::new(u32_fields)
FastFieldsWriter {
field_writers: field_writers,
}
}
pub fn new(fields: Vec<Field>) -> U32FastFieldsWriter {
U32FastFieldsWriter {
/// Returns a `FastFieldsWriter`
/// with an `IntFastFieldWriter` for each
/// of the fields given in argument.
pub fn new(fields: Vec<Field>) -> FastFieldsWriter {
FastFieldsWriter {
field_writers: fields
.into_iter()
.map(U32FastFieldWriter::new)
.map(IntFastFieldWriter::new)
.collect(),
}
}
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut U32FastFieldWriter> {
/// Gets the `IntFastFieldWriter` associated with a field.
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
self.field_writers
.iter_mut()
.find(|field_writer| field_writer.field == field)
}
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) {
for field_writer in &mut self.field_writers {
field_writer.add_document(doc);
}
}
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
for field_writer in &self.field_writers {
try!(field_writer.serialize(serializer));
field_writer.serialize(serializer)?;
}
Ok(())
}
@@ -56,23 +94,49 @@ impl U32FastFieldsWriter {
for field_writer in &mut self.field_writers {
field_writer.fill_val_up_to(doc);
}
}
}
pub struct U32FastFieldWriter {
/// Fast field writer for ints.
/// The fast field writer just keeps the values in memory.
///
/// Only when the segment writer is closed and
/// persisted on disk is the fast field writer
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier, as the values are
/// bitpacked and the number of bits required for bitpacking
/// can only be known once we have seen all of the values.
///
/// Both u64 and i64 use the same writer.
/// i64 values are simply remapped to `0..2^64 - 1`
/// using `common::i64_to_u64`.
pub struct IntFastFieldWriter {
field: Field,
vals: Vec<u32>,
vals: Vec<u64>,
val_if_missing: u64,
}
impl U32FastFieldWriter {
pub fn new(field: Field) -> U32FastFieldWriter {
U32FastFieldWriter {
impl IntFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub fn new(field: Field) -> IntFastFieldWriter {
IntFastFieldWriter {
field: field,
vals: Vec::new(),
val_if_missing: 0u64,
}
}
/// Sets the default value.
///
/// This default value is recorded for a document
/// if it does not have any value for the field.
fn set_val_if_missing(&mut self, val_if_missing: u64) {
self.val_if_missing = val_if_missing;
}
/// Ensures the fast field writer has
/// reached `doc` (inclusive).
///
@@ -80,42 +144,68 @@ impl U32FastFieldWriter {
fn fill_val_up_to(&mut self, doc: DocId) {
let target = doc as usize + 1;
debug_assert!(self.vals.len() <= target);
let val_if_missing = self.val_if_missing;
while self.vals.len() < target {
self.add_val(0u32)
self.add_val(val_if_missing);
}
}
pub fn add_val(&mut self, val: u32) {
/// Records a new value.
///
/// The n-th value being recorded is implicitly
/// associated with the document of `DocId` n.
/// (Well, `n-1` actually, because of 0-indexing.)
pub fn add_val(&mut self, val: u64) {
self.vals.push(val);
}
fn extract_val(&self, doc: &Document) -> u32 {
/// Extracts the value associated with the fast field for
/// this document.
///
/// i64 values are remapped to u64 using the logic
/// in `common::i64_to_u64`.
///
/// If the value is missing, then the default value is used
/// instead.
/// If the document has more than one value for the given field,
/// only the first one is taken into account.
fn extract_val(&self, doc: &Document) -> u64 {
match doc.get_first(self.field) {
Some(v) => {
match *v {
Value::U32(ref val) => { *val }
_ => { panic!("Expected a u32field, got {:?} ", v) }
Value::U64(ref val) => { *val },
Value::I64(ref val) => common::i64_to_u64(*val),
_ => { panic!("Expected a u64field, got {:?} ", v) }
}
},
None => {
0u32
self.val_if_missing
}
}
}
/// Extracts the fast field value from the document
/// (or uses the default value) and records it.
pub fn add_document(&mut self, doc: &Document) {
let val = self.extract_val(doc);
self.add_val(val);
}
/// Pushes the fast field values to the `FastFieldSerializer`.
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
let zero = 0;
let min = *self.vals.iter().min().unwrap_or(&zero);
let max = *self.vals.iter().max().unwrap_or(&min);
try!(serializer.new_u32_fast_field(self.field, min, max));
serializer.new_u64_fast_field(self.field, min, max)?;
for &val in &self.vals {
try!(serializer.add_val(val));
serializer.add_val(val)?;
}
serializer.close_field()
}
}
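The comment on `IntFastFieldWriter` above explains why serialization has to wait until every value has been seen: the bitpacking width depends on the full value range. A hypothetical sketch (names invented here) of how the amplitude `max - min` translates into a number of bits per value:

    // Hypothetical sketch: bits per value needed to bitpack a column
    // once its min and max are known. A constant column needs 0 bits.
    fn num_bits_sketch(min: u64, max: u64) -> u8 {
        let amplitude = max - min;
        (64 - amplitude.leading_zeros()) as u8
    }

    // num_bits_sketch(3, 3) == 0, num_bits_sketch(0, 255) == 8,
    // num_bits_sketch(1_000, 1_255) == 8 (only the offset from `min` is stored).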

View File

@@ -6,7 +6,7 @@ use Index;
use Searcher;
use rand::distributions::{IndependentSample, Range};
fn check_index_content(searcher: &Searcher, vals: &HashSet<u32>) {
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
assert!(searcher.segment_readers().len() < 20);
assert_eq!(searcher.num_docs() as usize, vals.len());
}
@@ -17,19 +17,19 @@ fn test_indexing() {
let mut schema_builder = SchemaBuilder::default();
let id_field = schema_builder.add_u32_field("id", U32_INDEXED);
let multiples_field = schema_builder.add_u32_field("multiples", U32_INDEXED);
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
let universe = Range::new(0u32, 20u32);
let universe = Range::new(0u64, 20u64);
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
let mut committed_docs: HashSet<u32> = HashSet::new();
let mut uncommitted_docs: HashSet<u32> = HashSet::new();
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..200 {
let random_val = universe.ind_sample(&mut rng);
@@ -45,15 +45,15 @@ fn test_indexing() {
else {
if committed_docs.remove(&random_val) ||
uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u32(id_field, random_val);
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
}
else {
uncommitted_docs.insert(random_val);
let mut doc = Document::new();
doc.add_u32(id_field, random_val);
for i in 1u32..10u32 {
doc.add_u32(multiples_field, random_val * i);
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
index_writer.add_document(doc);
}

View File

@@ -170,7 +170,7 @@ impl NextBlock {
}
}
}
*next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone()); // TODO fix
*next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone());
return Some(next_block)
}
}
@@ -280,10 +280,10 @@ mod tests {
let delete_queue = DeleteQueue::new();
let make_op = |i: usize| {
let field = Field(1u8);
let field = Field(1u32);
DeleteOperation {
opstamp: i as u64,
term: Term::from_field_u32(field, i as u32)
term: Term::from_field_u64(field, i as u64)
}
};

View File

@@ -11,7 +11,7 @@ use datastruct::stacker::Heap;
use directory::FileProtection;
use Error;
use Directory;
use fastfield::delete::write_delete_bitset;
use fastfield::write_delete_bitset;
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
use futures::Canceled;
use futures::Future;
@@ -36,9 +36,9 @@ use std::thread;
// Size of the margin for the heap. A segment is closed when the remaining memory
// in the heap goes below MARGIN_IN_BYTES.
pub const MARGIN_IN_BYTES: u32 = 10_000_000u32;
pub const MARGIN_IN_BYTES: u32 = 1_000_000u32;
// We impose the memory per thread to be at least 30 MB.
// We impose the memory per thread to be at least 3 MB.
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
// Add document will block if the number of docs waiting in the queue to be indexed reaches PIPELINE_MAX_SIZE_IN_DOCS
@@ -120,7 +120,9 @@ pub fn open_index_writer(
let delete_queue = DeleteQueue::new();
let stamper = Stamper::new(index.opstamp());
let current_opstamp = index.opstamp();
let stamper = Stamper::new(current_opstamp);
let segment_updater = SegmentUpdater::new(index.clone(),
stamper.clone(),
@@ -143,7 +145,7 @@ pub fn open_index_writer(
delete_queue: delete_queue,
committed_opstamp: index.opstamp(),
committed_opstamp: current_opstamp,
stamper: stamper,
generation: 0,
@@ -196,10 +198,6 @@ pub fn compute_deleted_bitset(
Ok(might_have_changed)
}
// TODO skip delete operation before teh
// last delete opstamp
/// Advance deletes for the given segment up
/// to the target opstamp.
pub fn advance_deletes(
@@ -268,11 +266,24 @@ fn index_documents(heap: &mut Heap,
let mut segment_writer = SegmentWriter::for_segment(heap, segment.clone(), &schema)?;
for doc in document_iterator {
try!(segment_writer.add_document(&doc, &schema));
// There are two possible conditions for closing the segment.
// The first one is that the memory arena dedicated to the
// segment is getting full.
if segment_writer.is_buffer_full() {
info!("Buffer limit reached, flushing segment with maxdoc={}.",
segment_writer.max_doc());
break;
}
// The second one is that the term dictionary hash table
// is reaching saturation.
//
// Tantivy does not resize its hash table. When it reaches
// capacity, we simply stop adding documents to this segment.
if segment_writer.is_termdic_saturated() {
info!("Term dic saturated, flushing segment with maxdoc={}.",
segment_writer.max_doc());
break;
}
}
let num_docs = segment_writer.max_doc();
@@ -345,6 +356,11 @@ impl IndexWriter {
result
}
#[doc(hidden)]
pub fn new_segment(&self) -> Segment {
self.segment_updater.new_segment()
}
/// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline.
///
@@ -418,6 +434,12 @@ impl IndexWriter {
Ok(())
}
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
let delete_cursor = self.delete_queue.cursor();
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
self.segment_updater.add_segment(self.generation, segment_entry);
}
/// Detects and removes the files that
/// are not used by the index anymore.
pub fn garbage_collect_files(&mut self) -> Result<()> {

View File

@@ -6,17 +6,17 @@ use core::SerializableSegment;
use schema::FieldValue;
use indexer::SegmentSerializer;
use postings::PostingsSerializer;
use fastfield::U32FastFieldReader;
use fastfield::U64FastFieldReader;
use itertools::Itertools;
use postings::Postings;
use postings::DocSet;
use core::TermIterator;
use fastfield::delete::DeleteBitSet;
use fastfield::DeleteBitSet;
use schema::{Schema, Field};
use fastfield::FastFieldSerializer;
use fastfield::FastFieldReader;
use store::StoreWriter;
use std::cmp::{min, max};
use common::allocate_vec;
pub struct IndexMerger {
schema: Schema,
@@ -32,7 +32,7 @@ struct DeltaPositionComputer {
impl DeltaPositionComputer {
fn new() -> DeltaPositionComputer {
DeltaPositionComputer {
buffer: allocate_vec(512)
buffer: vec![0u32; 512]
}
}
@@ -50,32 +50,34 @@ impl DeltaPositionComputer {
}
fn compute_min_max_val(u32_reader: &U32FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u32, u32)> {
fn compute_min_max_val(u64_reader: &U64FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u64, u64)> {
if max_doc == 0 {
None
}
else if !delete_bitset.has_deletes() {
// no deleted documents,
// we can use the previous min_val, max_val.
Some((u32_reader.min_val(), u32_reader.max_val()))
Some((u64_reader.min_value(), u64_reader.max_value()))
}
else {
// some deleted documents,
// we need to recompute the max / min
(0..max_doc)
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
.map(|doc_id| u32_reader.get(doc_id))
.map(|doc_id| u64_reader.get(doc_id))
.minmax()
.into_option()
}
}
fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option<U32FastFieldReader> {
fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option<U64FastFieldReader> {
segment_reader.get_fieldnorms_reader(field)
}
fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option<U32FastFieldReader> {
segment_reader.get_fast_field_reader(field)
fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option<U64FastFieldReader> {
segment_reader
.fast_fields_reader()
.open_reader(field)
}
impl IndexMerger {
@@ -103,7 +105,7 @@ impl IndexMerger {
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u8))
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fieldnorm_fastfields, &extract_fieldnorm_reader, fast_field_serializer)
}
@@ -113,37 +115,37 @@ impl IndexMerger {
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_u32_fast())
.map(|(field_id, _)| Field(field_id as u8))
.filter(|&(_, field_entry)| field_entry.is_int_fast())
.map(|(field_id, _)| Field(field_id as u32))
.collect();
self.generic_write_fast_field(fast_fields, &extract_fast_field_reader, fast_field_serializer)
}
// used both to merge field norms and regular u32 fast fields.
// used both to merge field norms and regular u64 fast fields.
fn generic_write_fast_field(&self,
fields: Vec<Field>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U32FastFieldReader>,
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U64FastFieldReader>,
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
for field in fields {
let mut u32_readers = vec!();
let mut min_val = u32::max_value();
let mut max_val = u32::min_value();
let mut u64_readers = vec!();
let mut min_val = u64::max_value();
let mut max_val = u64::min_value();
for reader in &self.readers {
match field_reader_extractor(reader, field) {
Some(u32_reader) => {
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u32_reader, reader.max_doc(), reader.delete_bitset()) {
Some(u64_reader) => {
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset()) {
// the segment has some non-deleted documents
min_val = min(min_val, seg_min_val);
max_val = max(max_val, seg_max_val);
u32_readers.push((reader.max_doc(), u32_reader, reader.delete_bitset()));
u64_readers.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
}
}
None => {
let error_msg = format!("Failed to find a u32_reader for field {:?}", field);
let error_msg = format!("Failed to find a u64_reader for field {:?}", field);
error!("{}", error_msg);
return Err(Error::SchemaError(error_msg))
}
@@ -151,7 +153,7 @@ impl IndexMerger {
}
if u32_readers.is_empty() {
if u64_readers.is_empty() {
// we have actually zero documents.
min_val = 0;
max_val = 0;
@@ -159,11 +161,11 @@ impl IndexMerger {
assert!(min_val <= max_val);
try!(fast_field_serializer.new_u32_fast_field(field, min_val, max_val));
for (max_doc, u32_reader, delete_bitset) in u32_readers {
try!(fast_field_serializer.new_u64_fast_field(field, min_val, max_val));
for (max_doc, u64_reader, delete_bitset) in u64_readers {
for doc_id in 0..max_doc {
if !delete_bitset.is_deleted(doc_id) {
let val = u32_reader.get(doc_id);
let val = u64_reader.get(doc_id);
try!(fast_field_serializer.add_val(val));
}
}
@@ -199,6 +201,8 @@ impl IndexMerger {
}
merged_doc_id_map.push(segment_local_map);
}
let mut last_field: Option<Field> = None;
while merged_terms.advance() {
// Create the total list of doc ids
@@ -229,15 +233,20 @@ impl IndexMerger {
// We can now serialize this postings, by pushing each document to the
// postings serializer.
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
while segment_postings.advance() {
if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] {
if !term_written {
let current_field = term.field();
if last_field != Some(current_field) {
postings_serializer.new_field(current_field);
last_field = Some(current_field);
}
// we make sure to write the term only if
// there is at least one document.
postings_serializer.new_term(&term)?;
postings_serializer.new_term(term.as_slice())?;
term_written = true;
}
let delta_positions: &[u32] =
@@ -295,6 +304,7 @@ mod tests {
use query::TermQuery;
use schema::{Field, FieldValue};
use core::Index;
use fastfield::U64FastFieldReader;
use Searcher;
use DocAddress;
use collector::tests::FastFieldTestCollector;
@@ -311,8 +321,8 @@ mod tests {
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::U32Options::default().set_fast();
let score_field = schema_builder.add_u32_field("score", score_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let index = Index::create_in_ram(schema_builder.build());
{
@@ -322,19 +332,19 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_u32(score_field, 3);
doc.add_u64(score_field, 3);
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c");
doc.add_u32(score_field, 5);
doc.add_u64(score_field, 5);
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c d");
doc.add_u32(score_field, 7);
doc.add_u64(score_field, 7);
index_writer.add_document(doc);
}
index_writer.commit().expect("committed");
@@ -345,13 +355,13 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_u32(score_field, 11);
doc.add_u64(score_field, 11);
index_writer.add_document(doc);
}
{
let mut doc = Document::default();
doc.add_text(text_field, "a b c g");
doc.add_u32(score_field, 13);
doc.add_u64(score_field, 13);
index_writer.add_document(doc);
}
index_writer.commit().expect("Commit failed");
@@ -417,7 +427,7 @@ mod tests {
}
}
fn search_term(searcher: &Searcher, term: Term) -> Vec<u32> {
fn search_term(searcher: &Searcher, term: Term) -> Vec<u64> {
let mut collector = FastFieldTestCollector::for_field(Field(1));
let term_query = TermQuery::new(term, SegmentPostingsOption::NoFreq);
searcher.search(&term_query, &mut collector).unwrap();
@@ -432,27 +442,29 @@ mod tests {
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::U32Options::default().set_fast();
let score_field = schema_builder.add_u32_field("score", score_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let empty_vec = Vec::<u64>::new();
{ // a first commit
index_writer.add_document(
doc!(
text_field => "a b d",
score_field => 1
score_field => 1u64
));
index_writer.add_document(
doc!(
text_field => "b c",
score_field => 2
score_field => 2u64
));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(
doc!(
text_field => "c d",
score_field => 3
score_field => 3u64
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
@@ -469,24 +481,24 @@ mod tests {
index_writer.add_document(
doc!(
text_field => "a d e",
score_field => 4_000
score_field => 4_000u64
));
index_writer.add_document(
doc!(
text_field => "e f",
score_field => 5_000
score_field => 5_000u64
));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(
doc!(
text_field => "f g",
score_field => 6_000
score_field => 6_000u64
));
index_writer.add_document(
doc!(
text_field => "g h",
score_field => 7_000
score_field => 7_000u64
));
index_writer.commit().expect("committed");
index.load_searchers().unwrap();
@@ -498,21 +510,21 @@ mod tests {
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_val(), 1);
assert_eq!(score_field_reader.max_val(), 3);
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_value(), 1);
assert_eq!(score_field_reader.max_value(), 3);
let score_field_reader = searcher.segment_reader(1).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_val(), 4000);
assert_eq!(score_field_reader.max_val(), 7000);
let score_field_reader: U64FastFieldReader = searcher.segment_reader(1).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_value(), 4000);
assert_eq!(score_field_reader.max_value(), 7000);
}
{ // merging the segments
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
@@ -525,16 +537,16 @@ mod tests {
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_val(), 3);
assert_eq!(score_field_reader.max_val(), 7000);
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_value(), 3);
assert_eq!(score_field_reader.max_value(), 7000);
}
{
// test a commit with only deletes
@@ -547,16 +559,16 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_val(), 3);
assert_eq!(score_field_reader.max_val(), 7000);
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_value(), 3);
assert_eq!(score_field_reader.max_value(), 7000);
}
{ // Test merging a single segment in order to remove deletes.
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
@@ -570,16 +582,16 @@ mod tests {
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 2);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_val(), 6000);
assert_eq!(score_field_reader.max_val(), 7000);
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
assert_eq!(score_field_reader.min_value(), 6000);
assert_eq!(score_field_reader.max_value(), 7000);
}
{ // Test removing all docs

View File

@@ -172,7 +172,6 @@ impl SegmentManager {
// ... and we make sure the target segment entry
// can be garbage collected.
registers_lock.writing.remove(&after_merge_segment_id);
}

View File

@@ -23,7 +23,7 @@ use indexer::SegmentEntry;
use indexer::SegmentSerializer;
use Result;
use futures_cpupool::CpuFuture;
use rustc_serialize::json;
use serde_json;
use indexer::delete_queue::DeleteCursor;
use schema::Schema;
use std::borrow::BorrowMut;
@@ -77,10 +77,10 @@ pub fn save_metas(segment_metas: Vec<SegmentMeta>,
schema: schema,
opstamp: opstamp,
};
let mut w = vec!();
try!(write!(&mut w, "{}\n", json::as_pretty_json(&metas)));
let mut w = try!(serde_json::to_vec_pretty(&metas));
try!(write!(&mut w, "\n"));
let res = directory.atomic_write(&META_FILEPATH, &w[..])?;
debug!("Saved metas {}", json::as_pretty_json(&metas));
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
Ok(res)
}
@@ -309,8 +309,6 @@ impl SegmentUpdater {
let merging_join_handle = thread::spawn(move || {
// first we need to apply deletes to our segment.
info!("Start merge: {:?}", segment_ids_vec);
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge(&segment_ids_vec, &segment_updater_clone, merged_segment, target_opstamp);
@@ -374,12 +372,25 @@ impl SegmentUpdater {
self.run_async(move |segment_updater| {
debug!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
let mut _file_protection_opt = None;
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.0.index.opstamp();
if delete_operation.opstamp < committed_opstamp {
let segment = segment_updater.0.index.segment(after_merge_segment_entry.meta().clone());
// TODO check unwrap
advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp).unwrap();
match advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp) {
Ok(file_protection_opt_res) => {
_file_protection_opt = file_protection_opt_res;
}
Err(e) => {
error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}", before_merge_segment_ids, e);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
segment_updater.cancel_merge(&before_merge_segment_ids, after_merge_segment_entry.segment_id());
return;
}
}
}
}
segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, after_merge_segment_entry);

View File

@@ -5,19 +5,15 @@ use schema::Schema;
use schema::Term;
use core::Segment;
use core::SerializableSegment;
use postings::PostingsWriter;
use fastfield::U32FastFieldsWriter;
use fastfield::FastFieldsWriter;
use schema::Field;
use schema::FieldEntry;
use schema::FieldValue;
use schema::FieldType;
use schema::TextIndexingOptions;
use postings::SpecializedPostingsWriter;
use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
use indexer::segment_serializer::SegmentSerializer;
use datastruct::stacker::Heap;
use indexer::index_writer::MARGIN_IN_BYTES;
use super::operation::AddOperation;
use postings::MultiFieldPostingsWriter;
/// A `SegmentWriter` is in charge of creating segment index from a
@@ -28,46 +24,25 @@ use super::operation::AddOperation;
pub struct SegmentWriter<'a> {
heap: &'a Heap,
max_doc: DocId,
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
multifield_postings: MultiFieldPostingsWriter<'a>,
segment_serializer: SegmentSerializer,
fast_field_writers: U32FastFieldsWriter,
fieldnorms_writer: U32FastFieldsWriter,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FastFieldsWriter,
doc_opstamps: Vec<u64>,
}
fn create_fieldnorms_writer(schema: &Schema) -> U32FastFieldsWriter {
let u32_fields: Vec<Field> = schema.fields()
fn create_fieldnorms_writer(schema: &Schema) -> FastFieldsWriter {
let u64_fields: Vec<Field> = schema.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field_id, _)| Field(field_id as u8))
.map(|(field_id, _)| Field(field_id as u32))
.collect();
U32FastFieldsWriter::new(u32_fields)
FastFieldsWriter::new(u64_fields)
}
fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box<PostingsWriter + 'a> {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
match text_options.get_indexing_options() {
TextIndexingOptions::TokenizedWithFreq => {
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
}
TextIndexingOptions::TokenizedWithFreqAndPosition => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
}
_ => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
}
}
FieldType::U32(_) => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
}
}
impl<'a> SegmentWriter<'a> {
@@ -84,18 +59,14 @@ impl<'a> SegmentWriter<'a> {
mut segment: Segment,
schema: &Schema) -> Result<SegmentWriter<'a>> {
let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
let mut per_field_postings_writers: Vec<Box<PostingsWriter + 'a>> = Vec::new();
for field_entry in schema.fields() {
let postings_writer: Box<PostingsWriter + 'a> = posting_from_field_entry(field_entry, heap);
per_field_postings_writers.push(postings_writer);
}
let multifield_postings = MultiFieldPostingsWriter::new(schema, heap);
Ok(SegmentWriter {
heap: heap,
max_doc: 0,
per_field_postings_writers: per_field_postings_writers,
multifield_postings: multifield_postings,
fieldnorms_writer: create_fieldnorms_writer(schema),
segment_serializer: segment_serializer,
fast_field_writers: U32FastFieldsWriter::from_schema(schema),
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
})
}
@@ -104,15 +75,11 @@ impl<'a> SegmentWriter<'a> {
///
/// Finalize consumes the `SegmentWriter`, so that it cannot
/// be used afterwards.
pub fn finalize(mut self) -> Result<Vec<u64>> {
for per_field_postings_writer in &mut self.per_field_postings_writers {
per_field_postings_writer.close(self.heap);
}
write(&self.per_field_postings_writers,
pub fn finalize(self) -> Result<Vec<u64>> {
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
self.heap)?;
self.segment_serializer)?;
Ok(self.doc_opstamps)
}
@@ -127,6 +94,15 @@ impl<'a> SegmentWriter<'a> {
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
}
/// Returns true if the term dictionary hashmap is reaching capacity.
/// It is one of the conditions that trigger a `SegmentWriter` to
/// be finalized.
pub(crate) fn is_termdic_saturated(&self,) -> bool {
self.multifield_postings.is_termdic_saturated()
}
/// Indexes a new document
///
/// As a user, you should rather use `IndexWriter`'s add_document.
@@ -135,33 +111,40 @@ impl<'a> SegmentWriter<'a> {
let doc = &add_operation.document;
self.doc_opstamps.push(add_operation.opstamp);
for (field, field_values) in doc.get_sorted_field_values() {
let field_posting_writer: &mut Box<PostingsWriter> = &mut self.per_field_postings_writers[field.0 as usize];
let field_options = schema.get_field_entry(field);
match *field_options.field_type() {
FieldType::Str(ref text_options) => {
let num_tokens: u32 =
let num_tokens: u32 =
if text_options.get_indexing_options().is_tokenized() {
field_posting_writer.index_text(doc_id, field, &field_values, self.heap)
self.multifield_postings.index_text(doc_id, field, &field_values)
}
else {
let num_field_values = field_values.len() as u32;
for field_value in field_values {
let term = Term::from_field_text(field, field_value.value().text());
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
self.multifield_postings.suscribe(doc_id, &term);
}
num_field_values
};
self.fieldnorms_writer
.get_field_writer(field)
.map(|field_norms_writer| {
field_norms_writer.add_val(num_tokens as u32)
field_norms_writer.add_val(num_tokens as u64)
});
}
FieldType::U32(ref u32_options) => {
if u32_options.is_indexed() {
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_u32(field_value.field(), field_value.value().u32_value());
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
let term = Term::from_field_u64(field_value.field(), field_value.value().u64_value());
self.multifield_postings.suscribe(doc_id, &term);
}
}
}
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(field_value.field(), field_value.value().i64_value());
self.multifield_postings.suscribe(doc_id, &term);
}
}
}
@@ -204,28 +187,27 @@ impl<'a> SegmentWriter<'a> {
}
// This method is used as a trick to work around the borrow checker
fn write<'a>(per_field_postings_writers: &[Box<PostingsWriter + 'a>],
fast_field_writers: &U32FastFieldsWriter,
fieldnorms_writer: &U32FastFieldsWriter,
mut serializer: SegmentSerializer,
heap: &'a Heap,) -> Result<()> {
for per_field_postings_writer in per_field_postings_writers {
try!(per_field_postings_writer.serialize(serializer.get_postings_serializer(), heap));
}
fn write<'a>(
multifield_postings: &MultiFieldPostingsWriter,
fast_field_writers: &FastFieldsWriter,
fieldnorms_writer: &FastFieldsWriter,
mut serializer: SegmentSerializer) -> Result<()> {
try!(multifield_postings.serialize(serializer.get_postings_serializer()));
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
try!(serializer.close());
Ok(())
}
impl<'a> SerializableSegment for SegmentWriter<'a> {
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
let max_doc = self.max_doc;
write(&self.per_field_postings_writers,
write(&self.multifield_postings,
&self.fast_field_writers,
&self.fieldnorms_writer,
serializer,
self.heap)?;
serializer)?;
Ok(max_doc)
}
}

View File

@@ -9,6 +9,7 @@
#![cfg_attr(test, feature(test))]
#![cfg_attr(test, feature(step_by))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
@@ -24,6 +25,9 @@
#[macro_use]
extern crate lazy_static;
#[macro_use]
extern crate serde_derive;
#[macro_use]
extern crate log;
@@ -34,10 +38,11 @@ extern crate byteorder;
extern crate memmap;
extern crate regex;
extern crate tempfile;
extern crate rustc_serialize;
extern crate atomicwrites;
extern crate tempdir;
extern crate serde;
extern crate bincode;
extern crate serde_json;
extern crate time;
extern crate lz4;
extern crate uuid;
@@ -91,13 +96,12 @@ pub type Result<T> = std::result::Result<T, Error>;
mod core;
mod compression;
mod fastfield;
mod store;
pub mod store;
mod indexer;
mod common;
pub mod common;
mod error;
mod analyzer;
mod datastruct;
pub mod datastruct;
@@ -112,15 +116,16 @@ pub mod postings;
/// Schema
pub mod schema;
pub mod fastfield;
pub use directory::Directory;
pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher};
pub use core::{Index, Segment, SegmentComponent, SegmentId, SegmentMeta, Searcher};
pub use indexer::IndexWriter;
pub use schema::{Term, Document};
pub use core::SegmentReader;
pub use self::common::TimerTree;
pub use postings::DocSet;
pub use postings::Postings;
pub use postings::SegmentPostingsOption;
@@ -200,8 +205,10 @@ mod tests {
use schema::*;
use DocSet;
use IndexWriter;
use fastfield::{FastFieldReader, U64FastFieldReader, I64FastFieldReader};
use Postings;
#[test]
fn test_indexing() {
let mut schema_builder = SchemaBuilder::default();
@@ -437,26 +444,50 @@ mod tests {
#[test]
fn test_indexed_u32() {
fn test_indexed_u64() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_u32_field("text", U32_INDEXED);
let field = schema_builder.add_u64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(
doc!(field=>1)
doc!(field=>1u64)
);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_u32(field, 1u32);
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher.segment_reader(0).read_postings(&term, SegmentPostingsOption::NoFreq).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert!(!postings.advance());
}
#[test]
fn test_indexed_i64() {
let mut schema_builder = SchemaBuilder::default();
let value_field = schema_builder.add_i64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let negative_val = -1i64;
index_writer.add_document(
doc!(value_field => negative_val)
);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
.segment_reader(0)
.read_postings(&term, SegmentPostingsOption::NoFreq).unwrap();
assert!(postings.advance());
assert_eq!(postings.doc(), 0);
assert!(!postings.advance());
}
#[test]
fn test_delete_postings2() {
let mut schema_builder = SchemaBuilder::default();
@@ -630,4 +661,52 @@ mod tests {
assert_eq!(values.len(), 1);
assert_eq!(values[0].text(), "short");
}
#[test]
fn test_wrong_fast_field_type() {
let mut schema_builder = SchemaBuilder::default();
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
let text_field = schema_builder.add_text_field("text", TEXT);
let stored_int_field = schema_builder.add_u64_field("text", INT_STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
{
let document = doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64);
index_writer.add_document(document);
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
{
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(text_field);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(stored_int_field);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(fast_field_signed);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_res = segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)
}
{
let fast_field_reader_res = segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)
}
}
}

View File

@@ -17,21 +17,21 @@ mod docset;
mod segment_postings_option;
pub use self::docset::{SkipResult, DocSet};
pub use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
pub use self::serializer::PostingsSerializer;
pub use self::postings_writer::PostingsWriter;
pub use self::postings_writer::SpecializedPostingsWriter;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::term_info::TermInfo;
pub use self::postings::Postings;
#[cfg(test)]
pub use self::vec_postings::VecPostings;
pub use self::segment_postings::SegmentPostings;
pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings};
pub use self::intersection::IntersectionDocSet;
pub use self::freq_handler::FreqHandler;
pub use self::segment_postings_option::SegmentPostingsOption;
pub use common::HasLen;
#[cfg(test)]
mod tests {
@@ -43,6 +43,7 @@ mod tests {
use core::Index;
use std::iter;
use datastruct::stacker::Heap;
use fastfield::FastFieldReader;
use query::TermQuery;
use schema::Field;
use test::Bencher;
@@ -58,8 +59,8 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut segment = index.new_segment();
let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
let term = Term::from_field_text(text_field, "abc");
posting_serializer.new_term(&term).unwrap();
posting_serializer.new_field(text_field);
posting_serializer.new_term("abc".as_bytes()).unwrap();
for doc_id in 0u32..3u32 {
let positions = vec!(1,2,3,2);
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
@@ -67,7 +68,7 @@ mod tests {
posting_serializer.close_term().unwrap();
posting_serializer.close().unwrap();
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
assert_eq!(read.len(), 13);
assert!(read.len() <= 16);
}
#[test]
@@ -119,7 +120,7 @@ mod tests {
assert_eq!(fieldnorm_reader.get(0), 8 + 5);
assert_eq!(fieldnorm_reader.get(1), 2);
for i in 2 .. 1000 {
assert_eq!(fieldnorm_reader.get(i), i + 1);
assert_eq!(fieldnorm_reader.get(i), (i + 1) as u64);
}
}
{
@@ -266,16 +267,26 @@ mod tests {
};
}
#[bench]
fn bench_block_segment_postings(b: &mut Bencher) {
let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let mut block_segment_postings = segment_reader.read_block_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
while block_segment_postings.advance() {}
});
}
#[bench]
fn bench_segment_postings(b: &mut Bencher) {
let searcher = INDEX.searcher();
let segment_reader = searcher.segment_reader(0);
b.iter(|| {
let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
while segment_postings.advance() {}
let mut block_segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
while block_segment_postings.advance() {}
});
}
}
#[bench]
fn bench_segment_intersection(b: &mut Bencher) {

View File

@@ -5,9 +5,129 @@ use postings::PostingsSerializer;
use std::io;
use postings::Recorder;
use analyzer::SimpleTokenizer;
use schema::Field;
use Result;
use schema::{Schema, Field};
use analyzer::StreamingIterator;
use std::marker::PhantomData;
use schema::extract_field_from_term_bytes;
use std::ops::DerefMut;
use datastruct::stacker::{HashMap, Heap};
use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
use schema::FieldEntry;
use schema::FieldType;
use schema::TextIndexingOptions;
fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box<PostingsWriter + 'a> {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => {
match text_options.get_indexing_options() {
TextIndexingOptions::TokenizedWithFreq => {
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
}
TextIndexingOptions::TokenizedWithFreqAndPosition => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
}
_ => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
}
}
FieldType::U64(_) => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
FieldType::I64(_) => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
}
}
pub struct MultiFieldPostingsWriter<'a> {
heap: &'a Heap,
term_index: HashMap<'a>,
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
}
impl<'a> MultiFieldPostingsWriter<'a> {
/// Create a new `MultiFieldPostingsWriter` given
/// a schema and a heap.
pub fn new(schema: &Schema, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
let capacity = heap.capacity();
let hashmap_size = hashmap_size_in_bits(capacity);
let term_index = HashMap::new(hashmap_size, heap);
let mut per_field_postings_writers: Vec<_> = vec!();
for field_entry in schema.fields() {
let field_entry = posting_from_field_entry(&field_entry, heap);
per_field_postings_writers.push(field_entry);
}
MultiFieldPostingsWriter {
heap: heap,
term_index: term_index,
per_field_postings_writers: per_field_postings_writers
}
}
pub fn index_text(&mut self,
doc: DocId,
field: Field,
field_values: &[&FieldValue])
-> u32 {
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
postings_writer.index_text(&mut self.term_index, doc, field, field_values, self.heap)
}
pub fn suscribe(&mut self, doc: DocId, term: &Term) {
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
postings_writer.suscribe(&mut self.term_index, doc, 0u32, term, self.heap)
}
/// Serialize the inverted index.
/// It pushes all terms, one field at a time, to the
/// postings serializer.
pub fn serialize(&self, serializer: &mut PostingsSerializer) -> Result<()> {
let mut term_offsets: Vec<(&[u8], u32)> = self.term_index
.iter()
.collect();
term_offsets.sort_by_key(|&(k, _v)| k);
let mut offsets: Vec<(Field, usize)> = vec!();
let term_offsets_it = term_offsets
.iter()
.map(|&(ref key, _)| {
extract_field_from_term_bytes(&key)
})
.enumerate();
let mut prev_field = Field(u32::max_value());
for (offset, field) in term_offsets_it {
if field != prev_field {
offsets.push((field, offset));
prev_field = field;
}
}
offsets.push((Field(0), term_offsets.len()));
for i in 0..(offsets.len() - 1) {
let (field, start) = offsets[i];
let (_, stop) = offsets[i+1];
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
postings_writer.serialize(
field,
&term_offsets[start..stop],
serializer,
self.heap)?;
}
Ok(())
}
/// Return true iff the term dictionary is saturated.
pub fn is_termdic_saturated(&self) -> bool {
self.term_index.is_saturated()
}
}
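The `serialize` method above sorts the raw term bytes and then derives one `(field, offset)` boundary per field via `extract_field_from_term_bytes`. This works if the field id is encoded as a big-endian prefix of the term bytes, so that a plain lexicographic sort groups all terms of one field together; a hypothetical sketch of such a layout (the exact `Term` layout in this commit may differ):

    // Hypothetical sketch: a term whose bytes start with the big-endian
    // field id sorts first by field, then by text, which is what the
    // per-field offset computation above relies on.
    fn term_bytes_sketch(field_id: u32, text: &str) -> Vec<u8> {
        let mut bytes = Vec::with_capacity(4 + text.len());
        bytes.extend_from_slice(&field_id.to_be_bytes());
        bytes.extend_from_slice(text.as_bytes());
        bytes
    }

    // term_bytes_sketch(1, "zebra") < term_bytes_sketch(2, "apple")
    // even though "zebra" > "apple" as plain text.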
/// The `PostingsWriter` is in charge of receiving documents
/// and building a `Segment` in anonymous memory.
@@ -21,17 +141,15 @@ pub trait PostingsWriter {
/// * term - the term
/// * heap - heap used to store the postings information as well as the terms
/// in the hashmap.
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
fn suscribe(&mut self, term_index: &mut HashMap, doc: DocId, pos: u32, term: &Term, heap: &Heap);
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
/// Closes all of the currently open `Recorder`'s.
fn close(&mut self, heap: &Heap);
fn serialize(&self, field: Field, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
/// Tokenizes the text and calls `suscribe` on each of its tokens.
fn index_text<'a>(&mut self,
term_index: &mut HashMap,
doc_id: DocId,
field: Field,
field_values: &[&'a FieldValue],
@@ -39,14 +157,15 @@ pub trait PostingsWriter {
-> u32 {
let mut pos = 0u32;
let mut num_tokens: u32 = 0u32;
let mut term = Term::allocate(field, 100);
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
for field_value in field_values {
let mut tokens = SimpleTokenizer.tokenize(field_value.value().text());
// right now num_tokens and pos are redundant, but it should
// change when we get proper analyzers
while let Some(token) = tokens.next() {
term.set_text(token);
self.suscribe(doc_id, pos, &term, heap);
self.suscribe(term_index, doc_id, pos, &term, heap);
pos += 1u32;
num_tokens += 1u32;
}
@@ -61,7 +180,8 @@ pub trait PostingsWriter {
/// The `SpecializedPostingsWriter` is just here to avoid dynamic
/// dispatch on the recorder type.
pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
term_index: HashMap<'a, Rec>,
heap: &'a Heap,
_recorder_type: PhantomData<Rec>,
}
/// Given a `Heap` size, computes a relevant size for the `HashMap`.
@@ -81,9 +201,10 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
/// constructor
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
let capacity = heap.capacity();
let hashmap_size = hashmap_size_in_bits(capacity);
SpecializedPostingsWriter { term_index: HashMap::new(hashmap_size, heap) }
SpecializedPostingsWriter {
heap: heap,
_recorder_type: PhantomData,
}
}
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
@@ -93,16 +214,9 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
}
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
fn close(&mut self, heap: &Heap) {
for recorder in self.term_index.values_mut() {
recorder.close_doc(heap);
}
}
#[inline]
fn suscribe(&mut self, doc: DocId, position: u32, term: &Term, heap: &Heap) {
let mut recorder = self.term_index.get_or_create(term);
fn suscribe(&mut self, term_index: &mut HashMap, doc: DocId, position: u32, term: &Term, heap: &Heap) {
let recorder: &mut Rec = term_index.get_or_create(term);
let current_doc = recorder.current_doc();
if current_doc != doc {
if current_doc != u32::max_value() {
@@ -113,19 +227,18 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
recorder.record_position(position, heap);
}
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index
.iter()
.collect();
term_offsets.sort_by_key(|&(k, _v)| k);
let mut term = Term::allocate(Field(0), 100);
for (term_bytes, (addr, recorder)) in term_offsets {
// sadly we are required to copy the data
term.set_content(term_bytes);
try!(serializer.new_term(&term));
fn serialize(&self,
field: Field,
term_addrs: &[(&[u8], u32)],
serializer: &mut PostingsSerializer,
heap: &Heap) -> io::Result<()> {
serializer.new_field(field);
for &(term_bytes, addr) in term_addrs {
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
try!(serializer.new_term(&term_bytes));
try!(recorder.serialize(addr, serializer, heap));
try!(serializer.close_term());
}
}
Ok(())
}
}

View File

@@ -76,6 +76,7 @@ impl Recorder for NothingRecorder {
}
}
/// Recorder encoding document ids and term frequencies
#[repr(C, packed)]
pub struct TermFrequencyRecorder {
@@ -95,6 +96,7 @@ impl HeapAllocable for TermFrequencyRecorder {
}
impl Recorder for TermFrequencyRecorder {
fn current_doc(&self) -> DocId {
self.current_doc
}
@@ -114,22 +116,28 @@ impl Recorder for TermFrequencyRecorder {
self.current_tf = 0;
}
fn serialize(&self,
self_addr: u32,
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()> {
let mut doc_iter = self.stack.iter(self_addr, heap);
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self.stack
.iter(self_addr, heap)
.chain(Some(self.current_tf).into_iter());
loop {
if let Some(doc) = doc_iter.next() {
if let Some(term_freq) = doc_iter.next() {
try!(serializer.write_doc(doc, term_freq, &EMPTY_ARRAY));
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
continue;
}
}
break;
return Ok(());
}
Ok(())
}
}
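The chained iterator in `TermFrequencyRecorder::serialize` above interleaves document ids and term frequencies, appending `current_tf` for the still-open last document. A small hypothetical sketch of that pairing, stand-alone and with an in-memory slice instead of the heap-backed stack:

    // Hypothetical sketch of the (doc, term_freq) pairing: the stack stores
    // doc ids and frequencies interleaved; the last doc's frequency has not
    // been pushed yet and is taken from `current_tf`.
    fn pairs(stack: &[u32], current_tf: u32) -> Vec<(u32, u32)> {
        let mut it = stack.iter().cloned().chain(Some(current_tf));
        let mut out = Vec::new();
        while let (Some(doc), Some(tf)) = (it.next(), it.next()) {
            out.push((doc, tf));
        }
        out
    }

    // pairs(&[0, 2, 5], 1) == vec![(0, 2), (5, 1)]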
@@ -188,7 +196,8 @@ impl Recorder for TFAndPositionRecorder {
}
}
None => {
panic!("This should never happen. Pleasee report the bug.");
// the last document has not been closed...
break;
}
}
}

View File

@@ -2,11 +2,93 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder};
use DocId;
use postings::{Postings, FreqHandler, DocSet, HasLen};
use std::num::Wrapping;
use fastfield::delete::DeleteBitSet;
use fastfield::DeleteBitSet;
const EMPTY_DATA: [u8; 0] = [0u8; 0];
pub struct BlockSegmentPostings<'a> {
num_binpacked_blocks: usize,
num_vint_docs: usize,
block_decoder: BlockDecoder,
freq_handler: FreqHandler,
remaining_data: &'a [u8],
doc_offset: DocId,
len: usize,
}
impl<'a> BlockSegmentPostings<'a> {
pub fn from_data(len: usize, data: &'a [u8], freq_handler: FreqHandler) -> BlockSegmentPostings<'a> {
let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK;
let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
BlockSegmentPostings {
num_binpacked_blocks: num_binpacked_blocks,
num_vint_docs: num_vint_docs,
block_decoder: BlockDecoder::new(),
freq_handler: freq_handler,
remaining_data: data,
doc_offset: 0,
len: len,
}
}
pub fn reset(&mut self, len: usize, data: &'a [u8]) {
let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK;
let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
self.num_binpacked_blocks = num_binpacked_blocks;
self.num_vint_docs = num_vint_docs;
self.remaining_data = data;
self.doc_offset = 0;
self.len = len;
}
pub fn docs(&self) -> &[DocId] {
self.block_decoder.output_array()
}
pub fn freq_handler(&self) -> &FreqHandler {
&self.freq_handler
}
pub fn advance(&mut self) -> bool {
if self.num_binpacked_blocks > 0 {
self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
self.num_binpacked_blocks -= 1;
true
}
else {
if self.num_vint_docs > 0 {
self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, self.num_vint_docs);
self.freq_handler.read_freq_vint(self.remaining_data, self.num_vint_docs);
self.num_vint_docs = 0;
true
}
else {
false
}
}
}
/// Returns an empty segment postings object
pub fn empty() -> BlockSegmentPostings<'static> {
BlockSegmentPostings {
num_binpacked_blocks: 0,
num_vint_docs: 0,
block_decoder: BlockDecoder::new(),
freq_handler: FreqHandler::new_without_freq(),
remaining_data: &EMPTY_DATA,
doc_offset: 0,
len: 0,
}
}
}
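`BlockSegmentPostings` above splits a posting list of `len` documents into full bitpacked blocks plus a variable-int-encoded tail. A hypothetical sketch of that split, assuming `NUM_DOCS_PER_BLOCK` is 128 (the constant is imported from the `compression` module and its value is not shown in this diff):

    // Hypothetical sketch of the block/tail split used by BlockSegmentPostings.
    const NUM_DOCS_PER_BLOCK: usize = 128; // assumed value, not shown in the diff

    fn split_sketch(len: usize) -> (usize, usize) {
        let num_binpacked_blocks = len / NUM_DOCS_PER_BLOCK;
        let num_vint_docs = len - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
        (num_binpacked_blocks, num_vint_docs)
    }

    // split_sketch(300) == (2, 44): two full bitpacked blocks of 128 docs,
    // then a tail of 44 docs encoded with variable-length integers.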
/// `SegmentPostings` represents the inverted list or postings associated with
/// a term in a `Segment`.
///
@@ -14,28 +96,14 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0];
/// Positions on the other hand, are optionally entirely decoded upfront.
pub struct SegmentPostings<'a> {
len: usize,
doc_offset: u32,
block_decoder: BlockDecoder,
freq_handler: FreqHandler,
remaining_data: &'a [u8],
cur: Wrapping<usize>,
block_cursor: BlockSegmentPostings<'a>,
cur_block_len: usize,
delete_bitset: DeleteBitSet,
}
impl<'a> SegmentPostings<'a> {
fn load_next_block(&mut self) {
let num_remaining_docs = self.len - self.cur.0;
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
self.remaining_data = self.block_decoder
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
} else {
self.remaining_data = self.block_decoder
.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs);
}
}
/// Reads a Segment postings from an &[u8]
///
@@ -43,39 +111,29 @@ impl<'a> SegmentPostings<'a> {
/// * `data` - data array. The complete data is not necessarily used.
/// * `freq_handler` - the freq handler is in charge of decoding
/// frequencies and/or positions
pub fn from_data(len: u32,
data: &'a [u8],
delete_bitset: &'a DeleteBitSet,
freq_handler: FreqHandler) -> SegmentPostings<'a> {
pub fn from_block_postings(
segment_block_postings: BlockSegmentPostings<'a>,
delete_bitset: DeleteBitSet) -> SegmentPostings<'a> {
SegmentPostings {
len: len as usize,
doc_offset: 0,
block_decoder: BlockDecoder::new(),
freq_handler: freq_handler,
remaining_data: data,
len: segment_block_postings.len,
block_cursor: segment_block_postings,
cur: Wrapping(usize::max_value()),
delete_bitset: delete_bitset.clone(),
cur_block_len: 0,
delete_bitset: delete_bitset,
}
}
/// Returns an empty segment postings object
pub fn empty() -> SegmentPostings<'static> {
let empty_block_cursor = BlockSegmentPostings::empty();
SegmentPostings {
len: 0,
doc_offset: 0,
block_decoder: BlockDecoder::new(),
freq_handler: FreqHandler::new_without_freq(),
remaining_data: &EMPTY_DATA,
block_cursor: empty_block_cursor,
delete_bitset: DeleteBitSet::empty(),
cur: Wrapping(usize::max_value()),
cur_block_len: 0,
}
}
/// Index within a block is used as an address when
/// interacting with the `FreqHandler`
fn index_within_block(&self) -> usize {
self.cur.0 % NUM_DOCS_PER_BLOCK
}
}
@@ -84,13 +142,16 @@ impl<'a> DocSet for SegmentPostings<'a> {
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
loop {
loop {
self.cur += Wrapping(1);
if self.cur.0 >= self.len {
return false;
}
if self.index_within_block() == 0 {
self.load_next_block();
if self.cur.0 == self.cur_block_len {
self.cur = Wrapping(0);
if !self.block_cursor.advance() {
self.cur_block_len = 0;
self.cur = Wrapping(usize::max_value());
return false;
}
self.cur_block_len = self.block_cursor.docs().len();
}
if !self.delete_bitset.is_deleted(self.doc()) {
return true;
@@ -100,7 +161,7 @@ impl<'a> DocSet for SegmentPostings<'a> {
#[inline]
fn doc(&self) -> DocId {
self.block_decoder.output(self.index_within_block())
self.block_cursor.docs()[self.cur.0]
}
}
@@ -112,10 +173,10 @@ impl<'a> HasLen for SegmentPostings<'a> {
impl<'a> Postings for SegmentPostings<'a> {
fn term_freq(&self) -> u32 {
self.freq_handler.freq(self.index_within_block())
self.block_cursor.freq_handler().freq(self.cur.0)
}
fn positions(&self) -> &[u32] {
self.freq_handler.positions(self.index_within_block())
self.block_cursor.freq_handler().positions(self.cur.0)
}
}
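Outside this changeset, a minimal sketch of the `DocSet` cursor contract noted above (`advance()` must return `true` before `doc()` is read). The `tantivy::DocId` and `tantivy::postings::DocSet` paths are assumptions based on the `use` statements visible elsewhere in this diff.

```rust
// Minimal sketch, not part of this changeset.
// Assumed paths: `tantivy::DocId`, `tantivy::postings::DocSet`.
use tantivy::DocId;
use tantivy::postings::DocSet;

/// Drains a `DocSet` into a `Vec<DocId>`, calling `advance()` before
/// each `doc()` read, as the comment above `advance()` requires.
fn collect_doc_ids<D: DocSet>(mut docset: D) -> Vec<DocId> {
    let mut doc_ids = Vec::new();
    while docset.advance() {
        doc_ids.push(docset.doc());
    }
    doc_ids
}
```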

View File

@@ -1,7 +1,6 @@
use Result;
use datastruct::FstMapBuilder;
use datastruct::TermDictionaryBuilder;
use super::TermInfo;
use schema::Term;
use schema::Field;
use schema::FieldEntry;
use schema::FieldType;
@@ -30,7 +29,7 @@ use common::BinarySerializable;
///
/// The serializer expects to receive the following calls
/// in this order:
///
/// * `set_field(...)`
/// * `new_term(...)`
/// * `write_doc(...)`
/// * `write_doc(...)`
@@ -41,6 +40,8 @@ use common::BinarySerializable;
/// * `write_doc(...)`
/// * ...
/// * `close_term()`
/// * `set_field(...)`
/// * ...
/// * `close()`
///
/// Terms have to be pushed in a lexicographically-sorted order.
@@ -49,7 +50,7 @@ use common::BinarySerializable;
/// A description of the serialization format is
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
pub struct PostingsSerializer {
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>,
terms_fst_builder: TermDictionaryBuilder<WritePtr, TermInfo>,
postings_write: WritePtr,
positions_write: WritePtr,
written_bytes_postings: usize,
@@ -73,7 +74,7 @@ impl PostingsSerializer {
positions_write: WritePtr,
schema: Schema)
-> Result<PostingsSerializer> {
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
let terms_fst_builder = TermDictionaryBuilder::new(terms_write)?;
Ok(PostingsSerializer {
terms_fst_builder: terms_fst_builder,
postings_write: postings_write,
@@ -105,17 +106,28 @@ impl PostingsSerializer {
segment.schema())
}
fn load_indexing_options(&mut self, field: Field) {
/// Must be called before starting pushing terms of
/// a given field.
///
/// Loads the indexing options for the given field.
pub fn new_field(&mut self, field: Field) {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
self.text_indexing_options = match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
FieldType::U32(ref u32_options) => {
if u32_options.is_indexed() {
FieldType::U64(ref int_options) => {
if int_options.is_indexed() {
TextIndexingOptions::Unindexed
} else {
TextIndexingOptions::Untokenized
}
}
FieldType::I64(ref int_options) => {
if int_options.is_indexed() {
TextIndexingOptions::Unindexed
} else {
TextIndexingOptions::Untokenized
}
}
};
}
@@ -123,12 +135,11 @@ impl PostingsSerializer {
/// * term - the term. It needs to come after the previous term according
/// to the lexicographical order.
/// * doc_freq - the number of documents containing the term.
pub fn new_term(&mut self, term: &Term) -> io::Result<()> {
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
if self.term_open {
panic!("Called new_term, while the previous term was not closed.");
}
self.term_open = true;
self.load_indexing_options(term.field());
self.doc_ids.clear();
self.last_doc_id_encoded = 0;
self.term_freqs.clear();
@@ -138,7 +149,7 @@ impl PostingsSerializer {
postings_offset: self.written_bytes_postings as u32,
positions_offset: self.written_bytes_positions as u32,
};
self.terms_fst_builder.insert_key(term.as_slice())
self.terms_fst_builder.insert_key(term)
}
/// Finish the serialization for this term postings.
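Outside this changeset, a standalone illustration of the ordering requirement documented above: terms must reach `new_term(&[u8])` in lexicographic order of their byte representation, which is plain byte-wise slice comparison.

```rust
// Standalone sketch, not part of this changeset (std only).
// Sorting `Vec<Vec<u8>>` uses byte-wise lexicographic comparison, which is
// the order the term dictionary builder expects terms to arrive in.
fn sort_terms_for_serialization(mut terms: Vec<Vec<u8>>) -> Vec<Vec<u8>> {
    terms.sort();
    terms
}

fn main() {
    let terms = vec![b"banana".to_vec(), b"apple".to_vec(), b"apricot".to_vec()];
    let sorted = sort_terms_for_serialization(terms);
    assert_eq!(
        sorted,
        vec![b"apple".to_vec(), b"apricot".to_vec(), b"banana".to_vec()]
    );
}
```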

View File

@@ -23,7 +23,7 @@ mod tests {
use collector::tests::TestCollector;
use Index;
use schema::*;
use fastfield::{U32FastFieldReader};
use fastfield::{U64FastFieldReader};
use postings::SegmentPostingsOption;
fn abs_diff(left: f32, right: f32) -> f32 {
@@ -102,7 +102,7 @@ mod tests {
}
{
let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d")),]);
assert_eq!(matching_docs(&boolean_query), Vec::new());
assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new());
}
}
@@ -111,7 +111,7 @@ mod tests {
let occurs = vec!(Occur::Should, Occur::Should);
let occur_filter = OccurFilter::new(&occurs);
let left_fieldnorms = U32FastFieldReader::from(vec!(100,200,300));
let left_fieldnorms = U64FastFieldReader::from(vec!(100,200,300));
let left = VecPostings::from(vec!(1, 2, 3));
let left_scorer = TermScorer {
@@ -120,7 +120,7 @@ mod tests {
postings: left,
};
let right_fieldnorms = U32FastFieldReader::from(vec!(15,25,35));
let right_fieldnorms = U64FastFieldReader::from(vec!(15,25,35));
let right = VecPostings::from(vec!(1, 3, 8));
let right_scorer = TermScorer {

View File

@@ -60,11 +60,14 @@ mod tests {
searcher.search(&phrase_query, &mut test_collector).expect("search should succeed");
test_collector.docs()
};
let empty_vec = Vec::<u32>::new();
assert_eq!(test_query(vec!("a", "b", "c")), vec!(2, 4));
assert_eq!(test_query(vec!("a", "b")), vec!(1, 2, 3, 4));
assert_eq!(test_query(vec!("b", "b")), vec!(0, 1));
assert_eq!(test_query(vec!("g", "ewrwer")), vec!());
assert_eq!(test_query(vec!("g", "a")), vec!());
assert_eq!(test_query(vec!("g", "ewrwer")), empty_vec);
assert_eq!(test_query(vec!("g", "a")), empty_vec);
}
}

View File

@@ -10,8 +10,21 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
phrase.or(word)
};
let field = many1(letter());
let term_query = (field, char(':'), term_val()).map(|(field_name, _, phrase)| {
let negative_numbers =
(char('-'), many1(satisfy(|c: char| c.is_numeric())))
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let field =
(
letter(),
many(satisfy(|c: char| c.is_alphanumeric() || c == '_'))
)
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
let term_val_with_field = negative_numbers.or(term_val());
let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
UserInputLiteral {
field_name: Some(field_name),
phrase: phrase,

View File

@@ -10,7 +10,9 @@ use postings::SegmentPostingsOption;
use query::PhraseQuery;
use analyzer::SimpleTokenizer;
use analyzer::StreamingIterator;
use schema::Term;
use schema::{Term, FieldType};
use std::str::FromStr;
use std::num::ParseIntError;
@@ -22,18 +24,25 @@ pub enum QueryParserError {
/// `FieldDoesNotExist(field_name: String)`
/// The query references a field that is not in the schema
FieldDoesNotExist(String),
/// `ExpectedU32(field_name: String, field_value: String)`
/// The query contains a term for a `u32`-field, but the value
/// is not a u32.
ExpectedU32(String, String),
/// The query contains a term for a `u64`-field, but the value
/// is not a u64.
ExpectedInt(ParseIntError),
/// Queries that are only "excluding" are forbidden (e.g. -title:pop).
AllButQueryForbidden,
/// If no default field is declared, running a query without any
/// field specified is forbidden.
NoDefaultFieldDeclared,
/// The field searched for is not declared
/// as indexed in the schema.
FieldNotIndexed(String),
}
impl From<ParseIntError> for QueryParserError {
fn from(err: ParseIntError) -> QueryParserError {
QueryParserError::ExpectedInt(err)
}
}
/// Tantivy's Query parser
///
@@ -122,7 +131,7 @@ impl QueryParser {
fn compute_logical_ast(&self,
user_input_ast: UserInputAST)
-> Result<LogicalAST, QueryParserError> {
let (occur, ast) = try!(self.compute_logical_ast_with_occur(user_input_ast));
let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
if occur == Occur::MustNot {
return Err(QueryParserError::AllButQueryForbidden);
}
@@ -133,25 +142,51 @@ impl QueryParser {
field: Field,
phrase: &str)
-> Result<Option<LogicalLiteral>, QueryParserError> {
let mut token_iter = self.analyzer.tokenize(phrase);
let mut tokens: Vec<Term> = Vec::new();
loop {
if let Some(token) = token_iter.next() {
let text = token.to_string();
// TODO Handle u32
let term = Term::from_field_text(field, &text);
tokens.push(term);
} else {
break;
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
if !field_type.is_indexed() {
let field_name = field_entry.name().to_string();
return Err(QueryParserError::FieldNotIndexed(field_name));
}
match field_type {
&FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?;
let term = Term::from_field_i64(field, val);
return Ok(Some(LogicalLiteral::Term(term)));
}
&FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
let term = Term::from_field_u64(field, val);
return Ok(Some(LogicalLiteral::Term(term)));
}
&FieldType::Str(ref str_options) => {
let mut terms: Vec<Term> = Vec::new();
if str_options.get_indexing_options().is_tokenized() {
let mut token_iter = self.analyzer.tokenize(phrase);
loop {
if let Some(token) = token_iter.next() {
let term = Term::from_field_text(field, token);
terms.push(term);
} else {
break;
}
}
}
else {
terms.push(Term::from_field_text(field, phrase));
}
if terms.is_empty() {
return Ok(None);
}
else if terms.len() == 1 {
return Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
} else {
return Ok(Some(LogicalLiteral::Phrase(terms)))
}
}
}
if tokens.is_empty() {
Ok(None)
} else if tokens.len() == 1 {
Ok(Some(LogicalLiteral::Term(tokens.into_iter().next().unwrap())))
} else {
Ok(Some(LogicalLiteral::Phrase(tokens)))
}
}
fn default_occur(&self) -> Occur {
@@ -209,23 +244,23 @@ impl QueryParser {
asts.push(LogicalAST::Leaf(box ast));
}
}
let result_ast = if asts.len() == 0 {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
let result_ast =
if asts.len() == 0 {
// this should never happen
return Err(QueryParserError::SyntaxError);
} else if asts.len() == 1 {
asts[0].clone()
} else {
LogicalAST::Clause(asts.into_iter()
.map(|ast| (Occur::Should, ast))
.collect())
};
Ok((Occur::Should, result_ast))
}
}
}
}
/// Compose two occur values.
fn compose_occur(left: Occur, right: Occur) -> Occur {
match left {
@@ -270,16 +305,23 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
#[cfg(test)]
mod test {
use schema::{SchemaBuilder, TEXT};
use schema::{SchemaBuilder, Term, TEXT, STRING, STORED, INT_INDEXED};
use query::Query;
use schema::Field;
use super::QueryParser;
use super::QueryParserError;
use super::super::logical_ast::*;
fn make_query_parser() -> QueryParser {
let mut schema_builder = SchemaBuilder::default();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
schema_builder.add_i64_field("signed", INT_INDEXED);
schema_builder.add_u64_field("unsigned", INT_INDEXED);
schema_builder.add_text_field("notindexed_text", STORED);
schema_builder.add_text_field("notindexed_u64", STORED);
schema_builder.add_text_field("notindexed_i64", STORED);
schema_builder.add_text_field("nottokenized", STRING);
let schema = schema_builder.build();
let default_fields = vec![title, text];
QueryParser::new(schema, default_fields)
@@ -310,47 +352,105 @@ mod test {
let query_parser = make_query_parser();
assert!(query_parser.parse_query("toto").is_ok());
}
#[test]
pub fn test_parse_nonindexed_field_yields_error() {
let query_parser = make_query_parser();
let is_not_indexed_err = |query: &str| {
let result: Result<Box<Query>, QueryParserError> = query_parser.parse_query(query);
if let Err(QueryParserError::FieldNotIndexed(field_name)) = result {
Some(field_name.clone())
}
else {
None
}
};
assert_eq!(
is_not_indexed_err("notindexed_text:titi"),
Some(String::from("notindexed_text"))
);
assert_eq!(
is_not_indexed_err("notindexed_u64:23424"),
Some(String::from("notindexed_u64"))
);
assert_eq!(
is_not_indexed_err("notindexed_i64:-234324"),
Some(String::from("notindexed_i64"))
);
}
#[test]
pub fn test_parse_query_untokenized() {
test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"",
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, 101, 32, 119, 111, 114, 100, 116, 119, 111])",
false);
}
#[test]
pub fn test_parse_query_ints() {
let query_parser = make_query_parser();
assert!(query_parser.parse_query("signed:2324").is_ok());
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
assert!(query_parser.parse_query("signed:\"-9999999999999\"").is_ok());
assert!(query_parser.parse_query("signed:\"a\"").is_err());
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
assert!(query_parser.parse_query("signed:\"18446744073709551615\"").is_err());
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
assert!(query_parser.parse_query("unsigned:\"18446744073709551615\"").is_ok());
test_parse_query_to_logical_ast_helper("unsigned:2324",
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
false);
test_parse_query_to_logical_ast_helper("signed:-2324",
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
false);
}
#[test]
pub fn test_parse_query_to_ast_disjunction() {
test_parse_query_to_logical_ast_helper("title:toto",
"Term([0, 116, 111, 116, 111])",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 116, 111, 116, 111])",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
false);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, \
105, 116, 105]) Term([1, 116, 105, 116, 105])))",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) -(Term([0, 0, 0, 0, 116, \
105, 116, 105]) Term([0, 0, 0, 1, 116, 105, 116, 105])))",
false);
assert_eq!(parse_query_to_logical_ast("-title:toto", false).err().unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(Term([0, 97]) (Term([0, 98]) Term([1, 98])))",
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) Term([0, 0, 0, 1, 98])))",
false);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 97]), Term([0, 98])]\"",
"\"[Term([0, 0, 0, 0, 97]), Term([0, 0, 0, 0, 98])]\"",
false);
}
#[test]
pub fn test_parse_query_to_ast_conjunction() {
test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", true);
test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 0, 0, 0, 116, 111, 116, 111])", true);
test_parse_query_to_logical_ast_helper("+title:toto",
"Term([0, 116, 111, 116, 111])",
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
true);
test_parse_query_to_logical_ast_helper("+title:toto -titi",
"(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, \
105, 116, 105]) Term([1, 116, 105, 116, 105])))",
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) -(Term([0, 0, 0, 0, 116, \
105, 116, 105]) Term([0, 0, 0, 1, 116, 105, 116, 105])))",
true);
assert_eq!(parse_query_to_logical_ast("-title:toto", true).err().unwrap(),
QueryParserError::AllButQueryForbidden);
test_parse_query_to_logical_ast_helper("title:a b",
"(+Term([0, 97]) +(Term([0, 98]) Term([1, 98])))",
"(+Term([0, 0, 0, 0, 97]) +(Term([0, 0, 0, 0, 98]) Term([0, 0, 0, 1, 98])))",
true);
test_parse_query_to_logical_ast_helper("title:\"a b\"",
"\"[Term([0, 97]), Term([0, 98])]\"",
"\"[Term([0, 0, 0, 0, 97]), Term([0, 0, 0, 0, 98])]\"",
true);
}
}
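Outside this changeset, the `test_parse_query_ints` expectations above boil down to the behaviour of `u64::from_str` / `i64::from_str`, whose errors are surfaced through `QueryParserError::ExpectedInt(ParseIntError)`. A standalone check of the same boundaries:

```rust
// Standalone illustration of the integer-parsing boundaries exercised by
// the query parser tests above (std only, no tantivy dependency).
use std::str::FromStr;

fn main() {
    // Fits in u64 but not in i64 (i64::MAX is 9223372036854775807).
    assert!(u64::from_str("18446744073709551615").is_ok());
    assert!(i64::from_str("18446744073709551615").is_err());

    // Negative values parse as i64 but not as u64, hence
    // `unsigned:"-2"` is rejected while `signed:"-2234"` is accepted.
    assert!(i64::from_str("-2234").is_ok());
    assert!(u64::from_str("-2").is_err());

    // Non-numeric input fails for both and would map to ExpectedInt.
    assert!(i64::from_str("2a").is_err());
}
```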

View File

@@ -30,10 +30,8 @@ impl<'a> Scorer for Box<Scorer + 'a> {
}
fn collect(&mut self, collector: &mut Collector) {
let scorer = self.deref_mut();
while scorer.advance() {
collector.collect(scorer.doc(), scorer.score());
}
let scorer: &mut Scorer = self.deref_mut();
scorer.collect(collector);
}
}

View File

@@ -14,11 +14,12 @@ mod tests {
use query::Scorer;
use query::term_query::TermScorer;
use query::Query;
use fastfield::U32FastFieldReader;
use fastfield::U64FastFieldReader;
use query::TermQuery;
use Index;
use schema::*;
use postings::SegmentPostingsOption;
use fastfield::FastFieldReader;
fn abs_diff(left: f32, right: f32) -> f32 {
(right - left).abs()
@@ -55,7 +56,7 @@ mod tests {
#[test]
pub fn test_term_scorer() {
let left_fieldnorms = U32FastFieldReader::from(vec!(10, 4));
let left_fieldnorms = U64FastFieldReader::from(vec!(10, 4));
assert_eq!(left_fieldnorms.get(0), 10);
assert_eq!(left_fieldnorms.get(1), 4);
let left = VecPostings::from(vec!(1));

View File

@@ -1,13 +1,14 @@
use Score;
use DocId;
use fastfield::U32FastFieldReader;
use fastfield::U64FastFieldReader;
use postings::DocSet;
use query::Scorer;
use postings::Postings;
use fastfield::FastFieldReader;
pub struct TermScorer<TPostings> where TPostings: Postings {
pub idf: Score,
pub fieldnorm_reader_opt: Option<U32FastFieldReader>,
pub fieldnorm_reader_opt: Option<U64FastFieldReader>,
pub postings: TPostings,
}

View File

@@ -11,7 +11,7 @@ use itertools::Itertools;
/// Documents are really just a list of couple `(field, value)`.
/// In this list, one field may appear more than once.
#[derive(Debug, RustcEncodable, RustcDecodable, Default)]
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct Document {
field_values: Vec<FieldValue>,
}
@@ -52,9 +52,14 @@ impl Document {
self.add(FieldValue::new(field, value));
}
/// Add a u32 field
pub fn add_u32(&mut self, field: Field, value: u32) {
self.add(FieldValue::new(field, Value::U32(value)));
/// Add a u64 field
pub fn add_u64(&mut self, field: Field, value: u64) {
self.add(FieldValue::new(field, Value::U64(value)));
}
/// Add a u64 field
pub fn add_i64(&mut self, field: Field, value: i64) {
self.add(FieldValue::new(field, Value::I64(value)));
}
/// Add a field value
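Outside this changeset, a minimal usage sketch for the new `add_u64` / `add_i64` helpers above. The `tantivy::schema::{Document, Field}` paths are assumed, and the field ids are arbitrary (in real code they come from the `SchemaBuilder`).

```rust
// Minimal sketch, not part of this changeset.
// Field ids are arbitrary here; normally they are returned by SchemaBuilder.
use tantivy::schema::{Document, Field};

fn main() {
    let mut doc = Document::default();
    doc.add_u64(Field(0), 4u64);   // e.g. a "count" field
    doc.add_i64(Field(1), -17i64); // e.g. a "popularity" field
}
```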

View File

@@ -10,8 +10,8 @@ use common::BinarySerializable;
///
/// Because the field id is a `u8`, tantivy can only have at most `255` fields.
/// Value 255 is reserved.
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, RustcEncodable, RustcDecodable)]
pub struct Field(pub u8);
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)]
pub struct Field(pub u32);
impl BinarySerializable for Field {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
@@ -19,7 +19,7 @@ impl BinarySerializable for Field {
}
fn deserialize(reader: &mut Read) -> io::Result<Field> {
u8::deserialize(reader).map(Field)
u32::deserialize(reader).map(Field)
}
}
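Outside this changeset, a hedged round-trip sketch for the widened `Field(u32)`. The `tantivy::common::BinarySerializable` path and the 4-byte size are assumptions, inferred from the `u32::deserialize` call above and the module exposure suggested elsewhere in this diff.

```rust
// Hedged sketch, not part of this changeset.
// Assumes `common::BinarySerializable` is publicly reachable as hinted by this diff.
use std::io::Cursor;
use tantivy::common::BinarySerializable;
use tantivy::schema::Field;

fn main() {
    let field = Field(7u32);
    let mut buffer: Vec<u8> = Vec::new();
    field.serialize(&mut buffer).unwrap();
    // The field id now goes through the u32 impl, i.e. presumably 4 bytes on disk.
    assert_eq!(buffer.len(), 4);

    let mut cursor = Cursor::new(&buffer[..]);
    assert_eq!(Field::deserialize(&mut cursor).unwrap(), Field(7u32));
}
```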

View File

@@ -1,10 +1,10 @@
use schema::TextOptions;
use schema::U32Options;
use schema::IntOptions;
use rustc_serialize::Decodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encodable;
use rustc_serialize::Encoder;
use std::fmt;
use serde::{Serialize, Deserialize, Serializer, Deserializer};
use serde::ser::SerializeStruct;
use serde::de::{self, Visitor, MapAccess};
use schema::FieldType;
/// A `FieldEntry` represents a field and its configuration.
@@ -22,7 +22,7 @@ pub struct FieldEntry {
impl FieldEntry {
/// Creates a new u32 field entry in the schema, given
/// Creates a new text field entry in the schema, given
/// a name, and some options.
pub fn new_text(field_name: String, field_type: TextOptions) -> FieldEntry {
FieldEntry {
@@ -31,17 +31,27 @@ impl FieldEntry {
}
}
/// Creates a new u32 field entry in the schema, given
/// Creates a new u64 field entry in the schema, given
/// a name, and some options.
pub fn new_u32(field_name: String, field_type: U32Options) -> FieldEntry {
pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
FieldEntry {
name: field_name,
field_type: FieldType::U32(field_type),
field_type: FieldType::U64(field_type),
}
}
/// Creates a new i64 field entry in the schema, given
/// a name, and some options.
pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {
FieldEntry {
name: field_name,
field_type: FieldType::I64(field_type),
}
}
/// Returns the name of the field
pub fn name(&self,) -> &String {
pub fn name(&self,) -> &str {
&self.name
}
@@ -54,14 +64,16 @@ impl FieldEntry {
pub fn is_indexed(&self,) -> bool {
match self.field_type {
FieldType::Str(ref options) => options.get_indexing_options().is_indexed(),
FieldType::U32(ref options) => options.is_indexed(),
FieldType::U64(ref options) => options.is_indexed(),
FieldType::I64(ref options) => options.is_indexed(),
}
}
/// Returns true iff the field is a u32 fast field
pub fn is_u32_fast(&self,) -> bool {
/// Returns true iff the field is a int (signed or unsigned) fast field
pub fn is_int_fast(&self,) -> bool {
match self.field_type {
FieldType::U32(ref options) => options.is_fast(),
FieldType::U64(ref options) => options.is_fast(),
FieldType::I64(ref options) => options.is_fast(),
_ => false,
}
}
@@ -69,7 +81,10 @@ impl FieldEntry {
/// Returns true iff the field is stored
pub fn is_stored(&self,) -> bool {
match self.field_type {
FieldType::U32(ref options) => {
FieldType::U64(ref options) => {
options.is_stored()
}
FieldType::I64(ref options) => {
options.is_stored()
}
FieldType::Str(ref options) => {
@@ -79,63 +94,99 @@ impl FieldEntry {
}
}
impl Serialize for FieldEntry {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
{
let mut s = serializer.serialize_struct("field_entry", 3)?;
s.serialize_field("name", &self.name)?;
impl Encodable for FieldEntry {
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
s.emit_struct("field_entry", 3, |s| {
try!(s.emit_struct_field("name", 0, |s| {
self.name.encode(s)
}));
match self.field_type {
FieldType::Str(ref options) => {
try!(s.emit_struct_field("type", 1, |s| {
s.emit_str("text")
}));
try!(s.emit_struct_field("options", 2, |s| {
options.encode(s)
}));
}
FieldType::U32(ref options) => {
try!(s.emit_struct_field("type", 1, |s| {
s.emit_str("u32")
}));
try!(s.emit_struct_field("options", 2, |s| {
options.encode(s)
}));
}
match self.field_type {
FieldType::Str(ref options) => {
s.serialize_field("type", "text")?;
s.serialize_field("options", options)?;
},
FieldType::U64(ref options) => {
s.serialize_field("type", "u64")?;
s.serialize_field("options", options)?;
},
FieldType::I64(ref options) => {
s.serialize_field("type", "i64")?;
s.serialize_field("options", options)?;
}
Ok(())
})
}
s.end()
}
}
impl Decodable for FieldEntry {
fn decode<D: Decoder>(d: &mut D) -> Result<Self, D::Error> {
d.read_struct("field_entry", 3, |d| {
let name = try!(d.read_struct_field("name", 0, |d| {
d.read_str()
}));
let field_type: String = try!(d.read_struct_field("type", 1, |d| {
d.read_str()
}));
d.read_struct_field("options", 2, |d| {
match field_type.as_ref() {
"u32" => {
let u32_options = try!(U32Options::decode(d));
Ok(FieldEntry::new_u32(name, u32_options))
}
"text" => {
let text_options = try!(TextOptions::decode(d));
Ok(FieldEntry::new_text(name, text_options))
}
_ => {
Err(d.error(&format!("Field type {:?} unknown", field_type)))
impl<'de> Deserialize<'de> for FieldEntry {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
{
#[derive(Deserialize)]
#[serde(field_identifier, rename_all = "lowercase")]
enum Field { Name, Type, Options };
const FIELDS: &'static [&'static str] = &["name", "type", "options"];
struct FieldEntryVisitor;
impl<'de> Visitor<'de> for FieldEntryVisitor {
type Value = FieldEntry;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("struct FieldEntry")
}
fn visit_map<V>(self, mut map: V) -> Result<FieldEntry, V::Error>
where V: MapAccess<'de>
{
let mut name = None;
let mut ty = None;
let mut field_type = None;
while let Some(key) = map.next_key()? {
match key {
Field::Name => {
if name.is_some() {
return Err(de::Error::duplicate_field("name"));
}
name = Some(map.next_value()?);
}
Field::Type => {
if ty.is_some() {
return Err(de::Error::duplicate_field("type"));
}
ty = Some(map.next_value()?);
}
Field::Options => {
match ty {
None => return Err(de::Error::custom("The `type` field must be specified before `options`")),
Some(ty) => {
match ty {
"text" => field_type = Some(FieldType::Str(map.next_value()?)),
"u64" => field_type = Some(FieldType::U64(map.next_value()?)),
"i64" => field_type = Some(FieldType::I64(map.next_value()?)),
_ => return Err(de::Error::custom(format!("Unrecognised type {}", ty)))
}
}
}
}
}
}
})
})
let name = name.ok_or_else(|| de::Error::missing_field("name"))?;
ty.ok_or_else(|| de::Error::missing_field("type"))?;
let field_type = field_type.ok_or_else(|| de::Error::missing_field("options"))?;
Ok(FieldEntry {
name: name,
field_type: field_type,
})
}
}
deserializer.deserialize_struct("field_entry", FIELDS, FieldEntryVisitor)
}
}
@@ -145,18 +196,31 @@ mod tests {
use super::*;
use schema::TEXT;
use rustc_serialize::json;
use serde_json;
#[test]
fn test_json_serialization() {
let field_value = FieldEntry::new_text(String::from("title"), TEXT);
assert_eq!(format!("{}", json::as_pretty_json(&field_value)), r#"{
let expected = r#"{
"name": "title",
"type": "text",
"options": {
"indexing": "position",
"stored": false
}
}"#);
}"#;
let field_value_json = serde_json::to_string_pretty(&field_value).unwrap();
assert_eq!(expected, &field_value_json);
let field_value: FieldEntry = serde_json::from_str(expected).unwrap();
assert_eq!("title", field_value.name);
match field_value.field_type {
FieldType::Str(_) => assert!(true),
_ => panic!("expected FieldType::Str")
}
}
}
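Outside this changeset, a hedged sketch of what the hand-written `Deserialize` impl above implies for callers: the `type` key must appear before `options`, and the JSON shape follows the `u64` branch of the `Serialize` impl. `FieldEntry` is assumed to be re-exported under `tantivy::schema`.

```rust
// Hedged sketch, not part of this changeset.
// Assumes FieldEntry is reachable via tantivy::schema and serde_json is available.
use tantivy::schema::FieldEntry;

fn main() {
    // `type` precedes `options`: accepted by the visitor above.
    let ok_json = r#"{
        "name": "count",
        "type": "u64",
        "options": { "indexed": false, "fast": true, "stored": true }
    }"#;
    let entry: FieldEntry = serde_json::from_str(ok_json).unwrap();
    assert_eq!("count", entry.name());

    // `options` precedes `type`: rejected with
    // "The `type` field must be specified before `options`".
    let bad_json = r#"{
        "name": "count",
        "options": { "indexed": false, "fast": true, "stored": true },
        "type": "u64"
    }"#;
    assert!(serde_json::from_str::<FieldEntry>(bad_json).is_err());
}
```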

View File

@@ -1,7 +1,6 @@
use schema::TextOptions;
use schema::U32Options;
use schema::{TextOptions, IntOptions};
use rustc_serialize::json::Json;
use serde_json::Value as JsonValue;
use schema::Value;
@@ -11,58 +10,83 @@ use schema::Value;
pub enum ValueParsingError {
/// Encountered a numerical value that overflows or underflows its integer type.
OverflowError(String),
/// The json node is not of the correct type. (e.g. 3 for a `Str` type or `"abc"` for a u32 type)
/// The json node is not of the correct type. (e.g. 3 for a `Str` type or `"abc"` for a u64 type)
/// Tantivy will try to autocast values.
TypeError(String),
}
/// A `FieldType` describes the type (text, u32) of a field as well as
/// A `FieldType` describes the type (text, u64) of a field as well as
/// how it should be handled by tantivy.
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
#[derive(Clone, Debug)]
pub enum FieldType {
/// String field type configuration
Str(TextOptions),
/// U32 field type configuration
U32(U32Options),
/// Unsigned 64-bit integer field type configuration
U64(IntOptions),
/// Signed 64-bit integer field type configuration
I64(IntOptions),
}
impl FieldType {
/// returns true iff the field is indexed.
pub fn is_indexed(&self) -> bool {
match self {
&FieldType::Str(ref text_options) => {
text_options.get_indexing_options().is_indexed()
}
&FieldType::U64(ref int_options) => {
int_options.is_indexed()
}
&FieldType::I64(ref int_options) => {
int_options.is_indexed()
}
}
}
/// Parses a field value from json, given the target FieldType.
///
/// Tantivy will not try to cast values.
/// For instance, if the json value is the integer `3` and the
/// target field is a `Str`, this method will return an error.
pub fn value_from_json(&self, json: &Json) -> Result<Value, ValueParsingError> {
pub fn value_from_json(&self, json: &JsonValue) -> Result<Value, ValueParsingError> {
match *json {
Json::String(ref field_text) => {
JsonValue::String(ref field_text) => {
match *self {
FieldType::Str(_) => {
Ok(Value::Str(field_text.clone()))
}
FieldType::U32(_) => {
Err(ValueParsingError::TypeError(format!("Expected a u32 int, got {:?}", json)))
FieldType::U64(_) | FieldType::I64(_) => {
Err(ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)))
}
}
}
Json::U64(ref field_val_u64) => {
JsonValue::Number(ref field_val_num) => {
match *self {
FieldType::U32(_) => {
if *field_val_u64 > (u32::max_value() as u64) {
Err(ValueParsingError::OverflowError(format!("Expected u32, but value {:?} overflows.", field_val_u64)))
FieldType::I64(_) => {
if let Some(field_val_i64) = field_val_num.as_i64() {
Ok(Value::I64(field_val_i64))
}
else {
Ok(Value::U32(*field_val_u64 as u32))
Err(ValueParsingError::OverflowError(format!("Expected an i64 int, got {:?}", json)))
}
}
_ => {
FieldType::U64(_) => {
if let Some(field_val_u64) = field_val_num.as_u64() {
Ok(Value::U64(field_val_u64))
}
else {
Err(ValueParsingError::OverflowError(format!("Expected a u64 int, got {:?}", json)))
}
}
FieldType::Str(_) => {
Err(ValueParsingError::TypeError(format!("Expected a string, got {:?}", json)))
}
}
},
}
_ => {
Err(ValueParsingError::TypeError(format!("Expected a string or a u32, got {:?}", json)))
Err(ValueParsingError::TypeError(format!("Json value {:?} is not supported. Expected a value of type {:?}", json, self)))
}
}
}
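Outside this changeset, the `JsonValue::Number` branch above relies on serde_json's `as_i64` / `as_u64` checks to pick `Value::I64`, `Value::U64`, or an `OverflowError`. A standalone look at those primitives:

```rust
// Standalone illustration of the serde_json checks used above (no tantivy dependency).
use serde_json::Value as JsonValue;

fn main() {
    // 9223372036854775808 == i64::MAX + 1: representable as u64, not as i64,
    // which is why the schema tests accept it for a u64 field and reject it
    // for an i64 field.
    let big: JsonValue = serde_json::from_str("9223372036854775808").unwrap();
    assert_eq!(big.as_u64(), Some(9223372036854775808));
    assert_eq!(big.as_i64(), None); // would become OverflowError for an i64 field

    let negative: JsonValue = serde_json::from_str("-5").unwrap();
    assert_eq!(negative.as_i64(), Some(-5));
    assert_eq!(negative.as_u64(), None); // would become OverflowError for a u64 field
}
```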

View File

@@ -7,7 +7,7 @@ use schema::Value;
/// `FieldValue` holds together a `Field` and its `Value`.
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd, RustcEncodable, RustcDecodable)]
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd, Serialize, Deserialize)]
pub struct FieldValue {
field: Field,
value: Value,
@@ -36,15 +36,13 @@ impl FieldValue {
impl BinarySerializable for FieldValue {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let mut written_size = 0;
written_size += try!(self.field.serialize(writer));
written_size += try!(self.value.serialize(writer));
Ok(written_size)
Ok(self.field.serialize(writer)? +
self.value.serialize(writer)?)
}
fn deserialize(reader: &mut Read) -> io::Result<Self> {
let field = try!(Field::deserialize(reader));
let value = try!(Value::deserialize(reader));
let field = Field::deserialize(reader)?;
let value = Value::deserialize(reader)?;
Ok(FieldValue::new(field, value))
}
}

View File

@@ -1,14 +1,14 @@
use std::ops::BitOr;
/// Define how a U32 field should be handled by tantivy.
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
pub struct U32Options {
/// Define how an int field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IntOptions {
indexed: bool,
fast: bool,
stored: bool,
}
impl U32Options {
impl IntOptions {
/// Returns true iff the value is stored.
pub fn is_stored(&self,) -> bool {
@@ -26,39 +26,39 @@ impl U32Options {
self.fast
}
/// Set the u32 options as stored.
/// Set the u64 options as stored.
///
/// Only the fields that are set as *stored* are
/// persisted into the Tantivy's store.
pub fn set_stored(mut self,) -> U32Options {
pub fn set_stored(mut self,) -> IntOptions {
self.stored = true;
self
}
/// Set the u32 options as indexed.
/// Set the u64 options as indexed.
///
/// Setting an integer as indexed will generate
/// a posting list for each value taken by the integer.
pub fn set_indexed(mut self,) -> U32Options {
pub fn set_indexed(mut self,) -> IntOptions {
self.indexed = true;
self
}
/// Set the u32 options as a fast field.
/// Set the u64 options as a fast field.
///
/// Fast fields are designed for random access.
/// Access time are similar to a random lookup in an array.
/// If more than one value is associated to a fast field, only the last one is
/// kept.
pub fn set_fast(mut self,) -> U32Options {
pub fn set_fast(mut self,) -> IntOptions {
self.fast = true;
self
}
}
impl Default for U32Options {
fn default() -> U32Options {
U32Options {
impl Default for IntOptions {
fn default() -> IntOptions {
IntOptions {
fast: false,
indexed: false,
stored: false,
@@ -67,40 +67,40 @@ impl Default for U32Options {
}
/// Shortcut for a u32 fast field.
/// Shortcut for a u64 fast field.
///
/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED`
pub const FAST: U32Options = U32Options {
/// Such a shortcut can be composed as follows `STORED | FAST | INT_INDEXED`
pub const FAST: IntOptions = IntOptions {
indexed: false,
stored: false,
fast: true,
};
/// Shortcut for a u32 indexed field.
/// Shortcut for a u64 indexed field.
///
/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED`
pub const U32_INDEXED: U32Options = U32Options {
/// Such a shortcut can be composed as follows `STORED | FAST | INT_INDEXED`
pub const INT_INDEXED: IntOptions = IntOptions {
indexed: true,
stored: false,
fast: false,
};
/// Shortcut for a u32 stored field.
/// Shortcut for a u64 stored field.
///
/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED`
pub const U32_STORED: U32Options = U32Options {
/// Such a shortcut can be composed as follows `STORED | FAST | INT_INDEXED`
pub const INT_STORED: IntOptions = IntOptions {
indexed: false,
stored: true,
fast: false,
};
impl BitOr for U32Options {
impl BitOr for IntOptions {
type Output = U32Options;
type Output = IntOptions;
fn bitor(self, other: U32Options) -> U32Options {
let mut res = U32Options::default();
fn bitor(self, other: IntOptions) -> IntOptions {
let mut res = IntOptions::default();
res.indexed = self.indexed | other.indexed;
res.stored = self.stored | other.stored;
res.fast = self.fast | other.fast;

View File

@@ -7,7 +7,7 @@ Tantivy has a very strict schema.
The schema defines information about the fields your index contains, that is, for each field :
* the field name (may only contain letters `[a-zA-Z]`, numbers `[0-9]`, and `_`)
* the type of the field (currently only `text` and `u32` are supported)
* the type of the field (currently only `text`, `u64` and `i64` are supported)
* how the field should be indexed / stored.
This very last point is critical as it will enable / disable some of the functionality
@@ -64,17 +64,17 @@ let schema = schema_builder.build();
## Setting a u32 field
## Setting a u64 field
### Example
```
use tantivy::schema::*;
let mut schema_builder = SchemaBuilder::default();
let num_stars_options = U32Options::default()
let num_stars_options = IntOptions::default()
.set_stored()
.set_indexed();
schema_builder.add_u32_field("num_stars", num_stars_options);
schema_builder.add_u64_field("num_stars", num_stars_options);
let schema = schema_builder.build();
```
@@ -82,15 +82,15 @@ Just like for Text fields (see above),
setting the field as stored defines whether the field will be
returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called,
and setting the field as indexed means that we will be able to perform queries such as `num_stars:10`.
Note that unlike text fields, u32 can only be indexed in one way for the moment.
Note that unlike text fields, u64 fields can only be indexed in one way for the moment.
This may change when we start supporting range queries.
The `fast` option, on the other hand, is specific to u64 fields, and is only relevant
The `fast` option on the other hand is specific to u64 fields, and is only relevant
if you are implementing your own queries. This functionality is somewhat similar to Lucene's
`DocValues`.
u32 that are indexed as fast will be stored in a special data structure that will
make it possible to access the u32 value given the doc id rapidly. This is useful if the value of
u64 fields that are marked as fast will be stored in a special data structure that
makes it possible to rapidly access the u64 value given the doc id. This is useful if the value of
the field is required during scoring or collection for instance.
*/
@@ -104,12 +104,12 @@ mod field_entry;
mod field_value;
mod text_options;
mod u32_options;
mod int_options;
mod field;
mod value;
mod named_field_document;
pub(crate) use self::term::extract_field_from_term_bytes;
pub use self::named_field_document::NamedFieldDocument;
pub use self::schema::{Schema, SchemaBuilder};
pub use self::value::Value;
@@ -129,10 +129,10 @@ pub use self::text_options::TEXT;
pub use self::text_options::STRING;
pub use self::text_options::STORED;
pub use self::u32_options::U32Options;
pub use self::u32_options::FAST;
pub use self::u32_options::U32_INDEXED;
pub use self::u32_options::U32_STORED;
pub use self::int_options::IntOptions;
pub use self::int_options::FAST;
pub use self::int_options::INT_INDEXED;
pub use self::int_options::INT_STORED;
use regex::Regex;

View File

@@ -1,7 +1,5 @@
use std::collections::BTreeMap;
use schema::Value;
use rustc_serialize::Encodable;
use rustc_serialize::Encoder;
@@ -11,33 +9,5 @@ use rustc_serialize::Encoder;
/// A `NamedFieldDocument` is a simple representation of a document
/// as a `BTreeMap<String, Vec<Value>>`.
///
#[derive(Serialize)]
pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value>>);
impl Encodable for NamedFieldDocument {
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
s.emit_struct("named_field_document", self.0.len(), |s| {
for (i, (name, vals)) in self.0.iter().enumerate() {
try!(s.emit_struct_field(name, i, |s| {
for (j, val) in vals.iter().enumerate() {
try!(s.emit_seq(vals.len(), |s| {
s.emit_seq_elt(j, |s| {
match *val {
Value::Str(ref text) => {
s.emit_str(text)
},
Value::U32(ref val) => {
s.emit_u32(*val)
}
}
})
}));
}
Ok(())
}));
}
Ok(())
})
}
}

View File

@@ -1,19 +1,15 @@
use std::collections::HashMap;
use rustc_serialize::Decodable;
use rustc_serialize::Encodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encoder;
use rustc_serialize::json;
use rustc_serialize::json::Json;
use std::collections::BTreeMap;
use schema::field_type::ValueParsingError;
use std::sync::Arc;
use serde_json::{self, Value as JsonValue, Map as JsonObject};
use serde::{Serialize, Serializer, Deserialize, Deserializer};
use serde::ser::SerializeSeq;
use serde::de::{Visitor, SeqAccess};
use super::*;
use std::fmt;
const MAX_NUM_FIELDS: usize = 255;
/// Tantivy has a very strict schema.
/// You need to specify in advance whether a field is indexed or not,
/// stored or not, and RAM-based or not.
@@ -48,7 +44,7 @@ impl SchemaBuilder {
SchemaBuilder::default()
}
/// Adds a new u32 field.
/// Adds a new u64 field.
/// Returns the associated field handle
///
/// # Caution
@@ -58,12 +54,31 @@ impl SchemaBuilder {
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_u32_field(
pub fn add_u64_field(
&mut self,
field_name_str: &str,
field_options: U32Options) -> Field {
field_options: IntOptions) -> Field {
let field_name = String::from(field_name_str);
let field_entry = FieldEntry::new_u32(field_name, field_options);
let field_entry = FieldEntry::new_u64(field_name, field_options);
self.add_field(field_entry)
}
/// Adds a new i64 field.
/// Returns the associated field handle
///
/// # Caution
///
/// Appending two fields with the same name
/// will result in the shadowing of the first
/// by the second one.
/// The first field will get a field id
/// but only the second one will be indexed
pub fn add_i64_field(
&mut self,
field_name_str: &str,
field_options: IntOptions) -> Field {
let field_name = String::from(field_name_str);
let field_entry = FieldEntry::new_i64(field_name, field_options);
self.add_field(field_entry)
}
@@ -89,8 +104,8 @@ impl SchemaBuilder {
/// Adds a field entry to the schema in build.
fn add_field(&mut self, field_entry: FieldEntry) -> Field {
let field = Field(self.fields.len() as u8);
let field_name = field_entry.name().clone();
let field = Field(self.fields.len() as u32);
let field_name = field_entry.name().to_string();
self.fields.push(field_entry);
self.fields_map.insert(field_name, field);
field
@@ -100,9 +115,6 @@ impl SchemaBuilder {
/// Finalize the creation of a `Schema`
/// This will consume your `SchemaBuilder`
pub fn build(self,) -> Schema {
if self.fields.len() > MAX_NUM_FIELDS {
panic!("There may be at most 255 fields.");
}
Schema(Arc::new(InnerSchema {
fields: self.fields,
fields_map: self.fields_map,
@@ -159,7 +171,7 @@ impl Schema {
}
/// Return the field name for a given `Field`.
pub fn get_field_name(&self, field: Field) -> &String {
pub fn get_field_name(&self, field: Field) -> &str {
self.get_field_entry(field).name()
}
@@ -191,7 +203,7 @@ impl Schema {
.map(|field_val| field_val.value() )
.cloned()
.collect();
field_map.insert(field_name.clone(), values);
field_map.insert(field_name.to_string(), values);
}
NamedFieldDocument(field_map)
}
@@ -201,14 +213,12 @@ impl Schema {
///
/// Encoding a document cannot fail.
pub fn to_json(&self, doc: &Document) -> String {
json::encode(&self.to_named_doc(doc)).unwrap()
serde_json::to_string(&self.to_named_doc(doc)).expect("doc encoding failed. This is a bug")
}
/// Build a document object from a json-object.
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
let json_node = try!(Json::from_str(doc_json));
let some_json_obj = json_node.as_object();
if !some_json_obj.is_some() {
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json).map_err(|_| {
let doc_json_sample: String =
if doc_json.len() < 20 {
String::from(doc_json)
@@ -216,9 +226,9 @@ impl Schema {
else {
format!("{:?}...", &doc_json[0..20])
};
return Err(DocParsingError::NotJSONObject(doc_json_sample))
}
let json_obj = some_json_obj.unwrap();
DocParsingError::NotJSON(doc_json_sample)
})?;
let mut doc = Document::default();
for (field_name, json_value) in json_obj.iter() {
match self.get_field(field_name) {
@@ -226,7 +236,7 @@ impl Schema {
let field_entry = self.get_field_entry(field);
let field_type = field_entry.field_type();
match *json_value {
Json::Array(ref json_items) => {
JsonValue::Array(ref json_items) => {
for json_item in json_items {
let value = try!(
field_type
@@ -262,30 +272,50 @@ impl fmt::Debug for Schema {
}
}
impl Decodable for Schema {
fn decode<D: Decoder>(d: &mut D) -> Result <Self, D::Error> {
let mut schema_builder = SchemaBuilder::default();
try!(d.read_seq(|d, num_fields| {
for _ in 0..num_fields {
let field_entry = try!(FieldEntry::decode(d));
schema_builder.add_field(field_entry);
}
Ok(())
}));
Ok(schema_builder.build())
impl Serialize for Schema {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
{
let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?;
for e in &self.0.fields {
seq.serialize_element(e)?;
}
seq.end()
}
}
impl Encodable for Schema {
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
try!(s.emit_seq(self.0.fields.len(),
|mut e| {
for (ord, field) in self.0.fields.iter().enumerate() {
try!(e.emit_seq_elt(ord, |e| field.encode(e)));
impl<'de> Deserialize<'de> for Schema
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
{
struct SchemaVisitor;
impl<'de> Visitor<'de> for SchemaVisitor
{
type Value = Schema;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("struct Schema")
}
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
where A: SeqAccess<'de>
{
let mut schema = SchemaBuilder {
fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)),
fields_map: HashMap::with_capacity(seq.size_hint().unwrap_or(0)),
};
while let Some(value) = seq.next_element()? {
schema.add_field(value);
}
Ok(())
}));
Ok(())
Ok(schema.build())
}
}
deserializer.deserialize_map(SchemaVisitor)
}
}
@@ -305,39 +335,33 @@ impl From<SchemaBuilder> for Schema {
#[derive(Debug)]
pub enum DocParsingError {
/// The payload given is not valid JSON.
NotJSON(json::ParserError),
/// The payload given is not a JSON Object (`{...}`).
NotJSONObject(String),
NotJSON(String),
/// One of the value nodes could not be parsed.
ValueError(String, ValueParsingError),
/// The json-document contains a field that is not declared in the schema.
NoSuchFieldInSchema(String),
}
impl From<json::ParserError> for DocParsingError {
fn from(err: json::ParserError) -> DocParsingError {
DocParsingError::NotJSON(err)
}
}
#[cfg(test)]
mod tests {
use schema::*;
use rustc_serialize::json;
use serde_json;
use schema::field_type::ValueParsingError;
use schema::schema::DocParsingError::NotJSON;
#[test]
pub fn test_schema_serialization() {
let mut schema_builder = SchemaBuilder::default();
let count_options = U32Options::default().set_stored().set_fast();
let count_options = IntOptions::default().set_stored().set_fast();
let popularity_options = IntOptions::default().set_stored().set_fast();
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("author", STRING);
schema_builder.add_u32_field("count", count_options);
schema_builder.add_u64_field("count", count_options);
schema_builder.add_i64_field("popularity", popularity_options);
let schema = schema_builder.build();
let schema_json: String = format!("{}", json::as_pretty_json(&schema));
let schema_json = serde_json::to_string_pretty(&schema).unwrap();
let expected = r#"[
{
"name": "title",
@@ -357,7 +381,16 @@ mod tests {
},
{
"name": "count",
"type": "u32",
"type": "u64",
"options": {
"indexed": false,
"fast": true,
"stored": true
}
},
{
"name": "popularity",
"type": "i64",
"options": {
"indexed": false,
"fast": true,
@@ -365,8 +398,18 @@ mod tests {
}
}
]"#;
println!("{}", schema_json);
println!("{}", expected);
assert_eq!(schema_json, expected);
let schema: Schema = serde_json::from_str(expected).unwrap();
let mut fields = schema.fields().iter();
assert_eq!("title", fields.next().unwrap().name());
assert_eq!("author", fields.next().unwrap().name());
assert_eq!("count", fields.next().unwrap().name());
assert_eq!("popularity", fields.next().unwrap().name());
}
@@ -374,10 +417,10 @@ mod tests {
#[test]
pub fn test_document_to_json() {
let mut schema_builder = SchemaBuilder::default();
let count_options = U32Options::default().set_stored().set_fast();
let count_options = IntOptions::default().set_stored().set_fast();
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("author", STRING);
schema_builder.add_u32_field("count", count_options);
schema_builder.add_u64_field("count", count_options);
let schema = schema_builder.build();
let doc_json = r#"{
"title": "my title",
@@ -385,6 +428,7 @@ mod tests {
"count": 4
}"#;
let doc = schema.parse_document(doc_json).unwrap();
let doc_serdeser = schema.parse_document(&schema.to_json(&doc)).unwrap();
assert_eq!(doc, doc_serdeser);
}
@@ -392,10 +436,12 @@ mod tests {
#[test]
pub fn test_parse_document() {
let mut schema_builder = SchemaBuilder::default();
let count_options = U32Options::default().set_stored().set_fast();
let count_options = IntOptions::default().set_stored().set_fast();
let popularity_options = IntOptions::default().set_stored().set_fast();
let title_field = schema_builder.add_text_field("title", TEXT);
let author_field = schema_builder.add_text_field("author", STRING);
let count_field = schema_builder.add_u32_field("count", count_options);
let count_field = schema_builder.add_u64_field("count", count_options);
let popularity_field = schema_builder.add_i64_field("popularity", popularity_options);
let schema = schema_builder.build();
{
let doc = schema.parse_document("{}").unwrap();
@@ -405,32 +451,20 @@ mod tests {
let doc = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4
"count": 4,
"popularity": 10
}"#).unwrap();
assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
assert_eq!(doc.get_first(count_field).unwrap().u32_value(), 4);
}
{
let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton"
"count": 4
}"#);
match json_err {
Err(DocParsingError::NotJSON(__)) => {
assert!(true);
}
_ => {
assert!(false);
}
}
assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
}
{
let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 4,
"popularity": 10,
"jambon": "bayonne"
}"#);
match json_err {
@@ -438,7 +472,7 @@ mod tests {
assert_eq!(field_name, "jambon");
}
_ => {
assert!(false);
panic!("expected additional field 'jambon' to fail but didn't");
}
}
}
@@ -447,6 +481,7 @@ mod tests {
"title": "my title",
"author": "fulmicoton",
"count": "5",
"popularity": "10",
"jambon": "bayonne"
}"#);
match json_err {
@@ -454,7 +489,7 @@ mod tests {
assert!(true);
}
_ => {
assert!(false);
panic!("expected string of 5 to fail but didn't");
}
}
}
@@ -462,29 +497,62 @@ mod tests {
let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": -5
}"#);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {
assert!(true);
}
_ => {
assert!(false);
}
}
}
{
let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 5000000000
"count": -5,
"popularity": 10
}"#);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
assert!(true);
}
_ => {
assert!(false);
panic!("expected -5 to fail but didn't");
}
}
}
{
let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 9223372036854775808,
"popularity": 10
}"#);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
panic!("expected 9223372036854775808 to fit into u64, but it didn't");
}
_ => {
assert!(true);
}
}
}
{
let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 50,
"popularity": 9223372036854775808
}"#);
match json_err {
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
assert!(true);
},
_ => {
panic!("expected 9223372036854775808 to overflow i64, but it didn't");
}
}
}
{
let json_err = schema.parse_document(r#"{
"title": "my title",
"author": "fulmicoton",
"count": 50,
}"#);
match json_err {
Err(NotJSON(_)) => {
assert!(true);
},
_ => {
panic!("expected invalid JSON to fail parsing, but it didn't");
}
}
}
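Outside this changeset, a hedged end-to-end sketch that strings together the pieces exercised by the tests above (u64/i64 schema fields, `IntOptions`, and `parse_document`). Paths and re-exports are assumed to match `use tantivy::schema::*` as in the test module.

```rust
// Hedged sketch, not part of this changeset; mirrors the API shown in the tests above.
use tantivy::schema::*;

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", TEXT);
    let count = schema_builder.add_u64_field("count", IntOptions::default().set_stored().set_fast());
    let popularity = schema_builder.add_i64_field("popularity", INT_STORED);
    let schema = schema_builder.build();

    let doc = schema
        .parse_document(r#"{ "title": "my title", "count": 4, "popularity": -10 }"#)
        .unwrap();

    assert_eq!(doc.get_first(title).unwrap().text(), "my title");
    assert_eq!(doc.get_first(count).unwrap().u64_value(), 4);
    assert_eq!(doc.get_first(popularity).unwrap().i64_value(), -10);
}
```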

View File

@@ -1,56 +1,82 @@
use std::fmt;
use common::BinarySerializable;
use common::allocate_vec;
use common;
use byteorder::{BigEndian, ByteOrder};
use super::Field;
use std::str;
/// Size (in bytes) of the buffer of an int field.
const INT_TERM_LEN: usize = 4 + 8;
/// Term represents the value that the token can take.
///
/// It actually wraps a `Vec<u8>`.
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub struct Term(Vec<u8>);
/// Extract `field` from Term.
pub(crate) fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
Field(BigEndian::read_u32(&term_bytes[..4]))
}
impl Term {
/// Pre-allocate a term buffer.
pub fn allocate(field: Field, num_bytes: usize) -> Term {
let mut term = Term(Vec::with_capacity(num_bytes));
field.serialize(&mut term.0).expect("Serializing term in a Vec should never fail");
term
}
/// Set the content of the term.
pub fn set_content(&mut self, content: &[u8]) {
assert!(content.len() >= 4);
self.0.resize(content.len(), 0u8);
(&mut self.0[..]).clone_from_slice(content);
}
/// Returns the field id.
fn field_id(&self,) -> u8 {
self.0[0]
}
/// Returns the field.
pub fn field(&self,) -> Field {
Field(self.field_id())
extract_field_from_term_bytes(&self.0)
}
/// Builds a term given a field, and a u32-value
/// Sets the field.
pub fn set_field(&mut self, field: Field) {
if self.0.len() < 4 {
self.0.resize(4, 0u8);
}
BigEndian::write_u32(&mut self.0[0..4], field.0);
}
/// Builds a term given a field, and a u64-value
///
/// Assuming the term has a field id of 1, and a u32 value of 3234,
/// the Term will have 5 bytes.
/// The first byte is `1`, and the 4 following bytes are that of the u32.
pub fn from_field_u32(field: Field, val: u32) -> Term {
const U32_TERM_LEN: usize = 1 + 4;
let mut buffer = allocate_vec(U32_TERM_LEN);
buffer[0] = field.0;
// we want BigEndian here to have lexicographic order
// match the natural order of vals.
BigEndian::write_u32(&mut buffer[1..5], val);
Term(buffer)
/// Assuming the term has a field id of 1, and a u64 value of 3234,
/// the Term will have 12 bytes.
///
/// The first four bytes store the field id as a u32.
/// The 8 following bytes encode the u64 value.
pub fn from_field_u64(field: Field, val: u64) -> Term {
let mut term = Term(vec![0u8; INT_TERM_LEN]);
term.set_field(field);
term.set_u64(val);
term
}
/// Sets a u64 value in the term.
///
/// U64 are serialized using (8-byte) BigEndian
/// representation.
/// The use of BigEndian has the benefit of preserving
/// the natural order of the values.
pub fn set_u64(&mut self, val: u64) {
self.0.resize(INT_TERM_LEN, 0u8);
BigEndian::write_u64(&mut self.0[4..], val);
}
/// Builds a term given a field, and an i64-value
///
/// Assuming the term has a field id of 1, and an i64 value of 3234,
/// the Term will have 12 bytes.
///
/// The first four bytes store the field id as a u32.
/// The 8 following bytes encode the i64 value, mapped to a u64 via `common::i64_to_u64`.
pub fn from_field_i64(field: Field, val: i64) -> Term {
let val_u64: u64 = common::i64_to_u64(val);
Term::from_field_u64(field, val_u64)
}
/// Builds a term given a field, and a string value
@@ -60,18 +86,28 @@ impl Term {
/// The first four bytes encode the field id `2`, and the three following bytes are the utf-8
/// representation of "abc".
pub fn from_field_text(field: Field, text: &str) -> Term {
let mut buffer = Vec::with_capacity(1 + text.len());
buffer.clear();
field.serialize(&mut buffer).unwrap();
buffer.extend(text.as_bytes());
Term(buffer)
let buffer = Vec::with_capacity(4 + text.len());
let mut term = Term(buffer);
term.set_field(field);
term.set_text(text);
term
}
/// Assume the term is a u32 field.
/// Creates a new Term with an empty buffer,
/// but with a given capacity.
///
/// Panics if the term is not a u32 field.
pub fn get_u32(&self) -> u32 {
BigEndian::read_u32(&self.0[1..])
/// It is declared unsafe, as the term content
/// is not initialized, and a call to `.field()`
/// would panic.
pub(crate) unsafe fn with_capacity(num_bytes: usize) -> Term {
Term(Vec::with_capacity(num_bytes))
}
/// Assume the term is a u64 field.
///
/// Panics if the term is not a u64 field.
pub fn get_u64(&self) -> u64 {
BigEndian::read_u64(&self.0[4..])
}
/// Builds a term from its byte representation.
@@ -86,10 +122,10 @@ impl Term {
/// (this does not include the field.)
///
/// If the term is a string, its value is utf-8 encoded.
/// If the term is a u32, its value is encoded according
/// If the term is a u64, its value is encoded according
/// to `byteorder::LittleEndian`.
pub fn value(&self) -> &[u8] {
&self.0[1..]
&self.0[4..]
}
/// Returns the text associated with the term.
@@ -98,13 +134,13 @@ impl Term {
/// If the value is not valid utf-8. This may happen
/// if the index is corrupted or if you try to
/// call this method on a non-string type.
pub unsafe fn text(&self) -> &str {
str::from_utf8_unchecked(self.value())
pub fn text(&self) -> &str {
str::from_utf8(self.value()).expect("Term does not contain valid utf-8.")
}
/// Set the texts only, keeping the field untouched.
pub fn set_text(&mut self, text: &str) {
self.0.resize(1, 0u8);
self.0.resize(4, 0u8);
self.0.extend(text.as_bytes());
}
@@ -141,18 +177,22 @@ mod tests {
{
let term = Term::from_field_text(title_field, "test");
assert_eq!(term.field(), title_field);
assert_eq!(term.as_slice()[0], 1u8);
assert_eq!(&term.as_slice()[1..], "test".as_bytes());
assert_eq!(&term.as_slice()[0..4], &[0u8,0u8,0u8,1u8]);
assert_eq!(&term.as_slice()[4..], "test".as_bytes());
}
{
let term = Term::from_field_u32(count_field, 983u32);
let term = Term::from_field_u64(count_field, 983u64);
assert_eq!(term.field(), count_field);
assert_eq!(term.as_slice()[0], 2u8);
assert_eq!(term.as_slice().len(), 5);
assert_eq!(term.as_slice()[1], 0u8);
assert_eq!(term.as_slice()[2], 0u8);
assert_eq!(term.as_slice()[3], (933u32 / 256u32) as u8);
assert_eq!(term.as_slice()[4], (983u32 % 256u32) as u8);
assert_eq!(&term.as_slice()[0..4], &[0u8, 0u8, 0u8, 2u8]);
assert_eq!(term.as_slice().len(), 4 + 8);
assert_eq!(term.as_slice()[4], 0u8);
assert_eq!(term.as_slice()[5], 0u8);
assert_eq!(term.as_slice()[6], 0u8);
assert_eq!(term.as_slice()[7], 0u8);
assert_eq!(term.as_slice()[8], 0u8);
assert_eq!(term.as_slice()[9], 0u8);
assert_eq!(term.as_slice()[10], (983u64 / 256u64) as u8);
assert_eq!(term.as_slice()[11], (983u64 % 256u64) as u8);
}
}
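Outside this changeset, a standalone sketch of the 12-byte layout described above (`INT_TERM_LEN = 4 + 8`, a big-endian field id followed by a big-endian value) and of why big-endian keeps numeric order aligned with byte order. The sign-bit flip shown for i64 is a common order-preserving mapping and is only an assumption about what `common::i64_to_u64` actually does.

```rust
// Standalone sketch, not part of this changeset (std only).

/// Builds the 12-byte buffer: 4 big-endian bytes of field id, 8 of value.
fn encode_u64_term(field_id: u32, val: u64) -> [u8; 12] {
    let mut buf = [0u8; 12];
    buf[0..4].copy_from_slice(&field_id.to_be_bytes());
    buf[4..12].copy_from_slice(&val.to_be_bytes());
    buf
}

/// Assumption: a common order-preserving i64 -> u64 mapping (flip the sign bit);
/// the real `common::i64_to_u64` may be implemented differently.
fn i64_to_u64_assumed(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

fn main() {
    // Big-endian keeps numeric order consistent with lexicographic byte order,
    // which is what the term dictionary relies on.
    assert!(encode_u64_term(2, 983) < encode_u64_term(2, 984));
    assert!(encode_u64_term(1, u64::MAX) < encode_u64_term(2, 0));

    // The assumed mapping preserves order across the whole i64 range.
    assert!(i64_to_u64_assumed(-1) < i64_to_u64_assumed(0));
    assert!(i64_to_u64_assumed(i64::MIN) < i64_to_u64_assumed(i64::MAX));
}
```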

View File

@@ -1,12 +1,8 @@
use std::ops::BitOr;
use rustc_serialize::Decodable;
use rustc_serialize::Decoder;
use rustc_serialize::Encodable;
use rustc_serialize::Encoder;
/// Define how a text field should be handled by tantivy.
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
#[derive(Clone,Debug,PartialEq,Eq, Serialize, Deserialize)]
pub struct TextOptions {
indexing: TextIndexingOptions,
stored: bool,
@@ -51,9 +47,10 @@ impl Default for TextOptions {
/// Describe how a field should be indexed
#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash)]
#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash, Serialize, Deserialize)]
pub enum TextIndexingOptions {
/// Unindexed fields will not generate any postings. They will not be searchable either.
#[serde(rename="unindexed")]
Unindexed,
/// Untokenized means that the field text will not be split into tokens before being indexed.
/// A field with the value "Hello world" will have the document subscribe to one single
@@ -61,62 +58,26 @@ pub enum TextIndexingOptions {
///
/// It will **not** be searchable if the user enter "hello" for instance.
/// This can be useful for tags, or ids for instance.
#[serde(rename="untokenized")]
Untokenized,
/// TokenizedNoFreq will tokenize the field value, and append the document doc id
/// to the posting lists associated to all of the tokens.
/// The frequency of appearance of the term in the document, however, will be lost.
/// The term frequency used in the TfIdf formula will always be 1.
#[serde(rename="tokenize")]
TokenizedNoFreq,
/// TokenizedWithFreq will tokenize the field value, and encode
/// both the docid and the term frequency in the posting lists associated to all
#[serde(rename="freq")]
// of the tokens.
TokenizedWithFreq,
/// Like TokenizedWithFreq, but also encodes the positions of the
/// terms in a separate file. This option is required for phrase queries.
/// Don't use this if you are certain you won't need it; the term positions file can be very big.
#[serde(rename="position")]
TokenizedWithFreqAndPosition,
}
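// A quick check of the #[serde(rename = "...")] attributes above, assuming
// serde_json (used for meta.json) and the derived impls; the
// `roundtrip_indexing_options` helper is hypothetical.
fn roundtrip_indexing_options() -> serde_json::Result<()> {
    let opt = TextIndexingOptions::TokenizedWithFreqAndPosition;
    assert_eq!(serde_json::to_string(&opt)?, "\"position\"");
    let back: TextIndexingOptions = serde_json::from_str("\"freq\"")?;
    assert_eq!(back, TextIndexingOptions::TokenizedWithFreq);
    Ok(())
}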
impl Encodable for TextIndexingOptions {
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
let name = match *self {
TextIndexingOptions::Unindexed => {
"unindexed"
}
TextIndexingOptions::Untokenized => {
"untokenized"
}
TextIndexingOptions::TokenizedNoFreq => {
"tokenize"
}
TextIndexingOptions::TokenizedWithFreq => {
"freq"
}
TextIndexingOptions::TokenizedWithFreqAndPosition => {
"position"
}
};
s.emit_str(name)
}
}
impl Decodable for TextIndexingOptions {
fn decode<D: Decoder>(d: &mut D) -> Result<Self, D::Error> {
use self::TextIndexingOptions::*;
let option_name: String = try!(d.read_str());
Ok(match option_name.as_ref() {
"unindexed" => Unindexed,
"untokenized" => Untokenized,
"tokenize" => TokenizedNoFreq,
"freq" => TokenizedWithFreq,
"position" => TokenizedWithFreqAndPosition,
_ => {
return Err(d.error(&format!("Encoding option {:?} unknown", option_name)));
}
})
}
}
impl TextIndexingOptions {
/// Returns true iff the term frequency will be encoded.

View File

@@ -1,17 +1,65 @@
use common::BinarySerializable;
use std::io;
use std::io::Write;
use std::io::Read;
use std::fmt;
use serde::{Serialize, Serializer, Deserialize, Deserializer};
use serde::de::Visitor;
/// Value represents the value of any field.
/// It is an enum over all of the possible field types.
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, RustcEncodable, RustcDecodable)]
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub enum Value {
/// The str type is used for any text information.
Str(String),
/// Unsigned 32-bits Integer `u32`
U32(u32),
/// Unsigned 64-bit Integer `u64`
U64(u64),
/// Signed 64-bit Integer `i64`
I64(i64)
}
impl Serialize for Value {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer
{
match *self {
Value::Str(ref v) => serializer.serialize_str(v),
Value::U64(u) => serializer.serialize_u64(u),
Value::I64(u) => serializer.serialize_i64(u),
}
}
}
impl<'de> Deserialize<'de> for Value
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where D: Deserializer<'de>
{
struct ValueVisitor;
impl<'de> Visitor<'de> for ValueVisitor
{
type Value = Value;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a string, a u64, or an i64")
}
fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E> {
Ok(Value::U64(v))
}
fn visit_i64<E>(self, v: i64) -> Result<Self::Value, E> {
Ok(Value::I64(v))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
Ok(Value::Str(v.to_owned()))
}
fn visit_string<E>(self, v: String) -> Result<Self::Value, E> {
Ok(Value::Str(v))
}
}
deserializer.deserialize_any(ValueVisitor)
}
}
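// A round-trip sketch for the hand-written impls above, assuming serde_json;
// `value_json_roundtrip` is hypothetical. Note that a non-negative I64 comes
// back as U64, since serde_json routes positive integers to visit_u64.
fn value_json_roundtrip() {
    let v = Value::Str("hello".to_owned());
    let json = serde_json::to_string(&v).unwrap();  // "\"hello\""
    assert_eq!(serde_json::from_str::<Value>(&json).unwrap(), v);

    let n = Value::I64(-7);
    let json = serde_json::to_string(&n).unwrap();  // "-7"
    assert_eq!(serde_json::from_str::<Value>(&json).unwrap(), n);
}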
impl Value {
@@ -30,13 +78,28 @@ impl Value {
}
}
/// Returns the u32-value, provided the value is of the `U32` type.
/// Returns the u64-value, provided the value is of the `U64` type.
///
/// # Panics
/// If the value is not of type `U32`
pub fn u32_value(&self) -> u32 {
/// If the value is not of type `U64`
pub fn u64_value(&self) -> u64 {
match *self {
Value::U32(ref value) => {
Value::U64(ref value) => {
*value
}
_ => {
panic!("This is not a text field.")
}
}
}
/// Returns the i64-value, provided the value is of the `I64` type.
///
/// # Panics
/// If the value is not of type `I64`
pub fn i64_value(&self) -> i64 {
match *self {
Value::I64(ref value) => {
*value
}
_ => {
@@ -53,9 +116,15 @@ impl From<String> for Value {
}
impl From<u32> for Value {
fn from(v: u32) -> Value {
Value::U32(v)
impl From<u64> for Value {
fn from(v: u64) -> Value {
Value::U64(v)
}
}
impl From<i64> for Value {
fn from(v: i64) -> Value {
Value::I64(v)
}
}
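// Hypothetical usage of the From conversions and the typed accessors above.
fn demo_value_conversions() {
    let count: Value = 983u64.into();   // via From<u64>
    assert_eq!(count.u64_value(), 983u64);

    let delta = Value::from(-5i64);     // via From<i64>
    assert_eq!(delta.i64_value(), -5i64);
    // delta.u64_value() would panic: the accessors check the variant at runtime.
}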
@@ -65,39 +134,53 @@ impl<'a> From<&'a str> for Value {
}
}
const TEXT_CODE: u8 = 0;
const U32_CODE: u8 = 1;
mod binary_serialize {
use common::BinarySerializable;
use std::io::{self, Read, Write};
use super::Value;
const TEXT_CODE: u8 = 0;
const U64_CODE: u8 = 1;
const I64_CODE: u8 = 2;
impl BinarySerializable for Value {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let mut written_size = 0;
match *self {
Value::Str(ref text) => {
written_size += try!(TEXT_CODE.serialize(writer));
written_size += try!(text.serialize(writer));
},
Value::U32(ref val) => {
written_size += try!(U32_CODE.serialize(writer));
written_size += try!(val.serialize(writer));
},
impl BinarySerializable for Value {
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
let mut written_size = 0;
match *self {
Value::Str(ref text) => {
written_size += try!(TEXT_CODE.serialize(writer));
written_size += try!(text.serialize(writer));
},
Value::U64(ref val) => {
written_size += try!(U64_CODE.serialize(writer));
written_size += try!(val.serialize(writer));
},
Value::I64(ref val) => {
written_size += try!(I64_CODE.serialize(writer));
written_size += try!(val.serialize(writer));
},
}
Ok(written_size)
}
fn deserialize(reader: &mut Read) -> io::Result<Self> {
let type_code = try!(u8::deserialize(reader));
match type_code {
TEXT_CODE => {
let text = try!(String::deserialize(reader));
Ok(Value::Str(text))
}
U64_CODE => {
let value = try!(u64::deserialize(reader));
Ok(Value::U64(value))
}
I64_CODE => {
let value = try!(i64::deserialize(reader));
Ok(Value::I64(value))
}
_ => {
Err(io::Error::new(io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code)))
}
}
}
Ok(written_size)
}
fn deserialize(reader: &mut Read) -> io::Result<Self> {
let type_code = try!(u8::deserialize(reader));
match type_code {
TEXT_CODE => {
let text = try!(String::deserialize(reader));
Ok(Value::Str(text))
}
U32_CODE => {
let value = try!(u32::deserialize(reader));
Ok(Value::U32(value))
}
_ => {
Err(io::Error::new(io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code)))
}
}
}
}
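// A round-trip sketch for the type-code encoding above; the `binary_roundtrip`
// helper is hypothetical and uses fully qualified calls to avoid ambiguity with
// serde's Serialize::serialize, which is also implemented for Value.
use common::BinarySerializable;
use std::io::{self, Cursor};

fn binary_roundtrip(value: &Value) -> io::Result<Value> {
    let mut buf: Vec<u8> = Vec::new();
    BinarySerializable::serialize(value, &mut buf)?;        // 1-byte type code, then payload
    let mut reader = Cursor::new(buf);
    <Value as BinarySerializable>::deserialize(&mut reader) // dispatches on the type code
}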