Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-04 16:22:55 +00:00)

Compare commits: 0.3.1...tantivy-im (81 commits)
Commits (SHA1):
d07e896a2f, 574feb8026, ecbdd70c37, fb1b2be782, 0bc047f8e1, 2c2aa3d66c, 4c4c28e2c4, 9f9e588905,
6fd17e0ead, 65dc5b0d83, 15d15c01f8, 106832a66a, 477b9136b9, 7852d097b8, da99bbcb9d, 0bd56241bb,
54ab897755, 1369d2d144, d3f829dc8a, e82ccf9627, d3d29f7f54, 3566717979, 90bc3e3773, ffb62b6835,
4f9ce91d6a, 3c3a2fbfe8, 0508571d1a, 7b733dd34f, 2c798e3147, 33f9426dd6, 2c13f210bc, 647c97fa3d,
8029aea548, 3b33484cf8, 2a909ddcc7, 0dad02791c, 2947364ae1, 05111599b3, 83263eabbb, 5cb5c9a8f2,
9ab92b7739, 962bddfbbf, 26cfe2909f, afdfb1a69b, b26ad1d57a, 1dbd54edbb, deb04eb090, bed34bf502,
925b9063a7, 5e1ce381fe, 67381e448f, 19d535c28e, 95bfb71901, e00d6538fa, 8d7445f08a, 74e10843a7,
1b922e6d23, a7c6c31538, 9d071c8d46, 202e69b7e0, a650969509, 04074f7bcb, 8a28d1643d, c8d06d63b9,
7eec9f038d, 57870fdcef, c0f2055e32, 9a8f06a523, bb57bee099, bc2a1f00e6, 391f258ff3, 673712a339,
29ad1d84e5, 62d9236200, f5f8e130b0, d5d9218093, a44f34c49d, d8ea083177, d32dff1da9, f9ca0b16f1,
a39fe90930
@@ -28,6 +28,7 @@ script:
  travis-cargo test &&
  travis-cargo bench &&
  travis-cargo doc
- cargo run --example simple_search
after_success:
- bash ./script/build-doc.sh
- travis-cargo doc-upload
CHANGELOG.md (17 lines changed)

@@ -1,3 +1,20 @@
Tantivy 0.4.0
==========================
- Raised the limit on the number of fields (previously 256 fields)
- Removed u32 fields. They are replaced by u64 and i64 fields (#65)
- Replaced rustc_serialize with serde. Kudos to @KodrAus and @lnicola
- QueryParser:
  - An explicit error is returned when searching for a term that is not indexed
  - Searching for an int term via the query parser was broken `(age:1)`
  - Searching for a non-indexed field returns an explicit Error
  - Phrase queries on non-tokenized fields are not tokenized by the query parser.

Tantivy 0.3.1
==========================

- Expose a method to trigger files garbage collection


Tantivy 0.3
==========================
Cargo.toml (10 lines changed)

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.3.1"
version = "0.4.0-alpha"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
license = "MIT"
@@ -20,18 +20,20 @@ regex = "0.2"
fst = "0.1.37"
atomicwrites = "0.1.3"
tempfile = "2.1"
rustc-serialize = "0.3"
log = "0.3.6"
combine = "2.2"
tempdir = "0.3"
bincode = "0.5"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
bincode = "0.7.0-alpha7"
libc = {version = "0.2.20", optional=true}
num_cpus = "1.2"
itertools = "0.5.9"
lz4 = "1.20"
bit-set = "0.4.0"
time = "0.1"
uuid = { version = "0.4", features = ["v4", "rustc-serialize"] }
uuid = { version = "0.5", features = ["v4", "serde"] }
chan = "0.1"
version = "2"
crossbeam = "0.2"
@@ -21,4 +21,5 @@ install:
build: false

test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
- REM SET RUST_LOG=tantivy,test & cargo run --example simple_search
build.rs (19 lines changed)

@@ -4,7 +4,8 @@ mod build {

    pub fn build() {
        let mut config = gcc::Config::new();
        config.include("./cpp/simdcomp/include")
        config
            .include("./cpp/simdcomp/include")
            .file("cpp/simdcomp/src/avxbitpacking.c")
            .file("cpp/simdcomp/src/simdintegratedbitpacking.c")
            .file("cpp/simdcomp/src/simdbitpacking.c")
@@ -18,18 +19,26 @@ mod build {
        config.opt_level(3);

        if cfg!(target_env = "msvc") {
            config.define("NDEBUG", None)
            config
                .define("NDEBUG", None)
                .flag("/Gm-")
                .flag("/GS-")
                .flag("/Gy")
                .flag("/Oi")
                .flag("/GL");
        } else {
            config.flag("-msse4.1")
                .flag("-march=native");
        }
        }

        if !cfg!(target_env = "msvc") {
            config
                .include("./cpp/streamvbyte/include")
                .file("cpp/streamvbyte/src/streamvbyte.c")
                .file("cpp/streamvbyte/src/streamvbytedelta.c")
                .flag("-msse4.1")
                .flag("-march=native")
                .flag("-std=c99");
        }

        config.compile("libsimdcomp.a");

        // Workaround for linking static libraries built with /GL
cpp/streamvbyte/.gitignore (vendored, new file, 32 lines)

@@ -0,0 +1,32 @@
# Object files
*.o
*.ko
*.obj
*.elf

# Precompiled Headers
*.gch
*.pch

# Libraries
*.lib
*.a
*.la
*.lo

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib

# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex

# Debug files
*.dSYM/
cpp/streamvbyte/.travis.yml (new file, 7 lines)

@@ -0,0 +1,7 @@
language: c
sudo: false
compiler:
  - gcc
  - clang

script: make && ./unit
cpp/streamvbyte/LICENSE (new file, 202 lines)

@@ -0,0 +1,202 @@
Apache License, Version 2.0, January 2004, http://www.apache.org/licenses/ (the standard, unmodified Apache-2.0 license text).
cpp/streamvbyte/README.md (new file, 60 lines)

@@ -0,0 +1,60 @@
streamvbyte
===========
[](https://travis-ci.org/lemire/streamvbyte)

StreamVByte is a new integer compression technique that applies SIMD instructions (vectorization) to
Google's Group Varint approach. The net result is faster than other byte-oriented compression
techniques.

The approach is patent-free, and the code is available under the Apache License.

It includes fast differential coding.

It assumes a recent Intel processor (e.g., Haswell or better).

The code should build using most standard-compliant C99 compilers. The provided makefile
expects a Linux-like system.

Usage:

    make
    ./unit

See example.c for an example.

Short code sample:
```C
// suppose that datain is an array of uint32_t integers
size_t compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding
// here the result is stored in compressedbuffer using compsize bytes
streamvbyte_decode(compressedbuffer, recovdata, N); // decoding (fast)
```

If the values are sorted, then it might be preferable to use differential coding:
```C
// suppose that datain is an array of uint32_t integers
size_t compsize = streamvbyte_delta_encode(datain, N, compressedbuffer, 0); // encoding
// here the result is stored in compressedbuffer using compsize bytes
streamvbyte_delta_decode(compressedbuffer, recovdata, N, 0); // decoding (fast)
```

You have to know how many integers were coded when you decompress. You can store this
information along with the compressed stream.
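One way to do that is to prefix the compressed block with a 4-byte count. A minimal sketch, assuming only the `streamvbyte_encode`/`streamvbyte_decode` declarations from `include/streamvbyte.h`; the `framed_encode`/`framed_decode` helpers are hypothetical, not part of the library:

```C
#include <stdint.h>
#include <string.h>
#include "streamvbyte.h"

// Hypothetical framing helpers: store the element count in front of the
// compressed bytes so the decoder knows how many integers to expect.
static size_t framed_encode(const uint32_t *in, uint32_t count, uint8_t *out) {
    memcpy(out, &count, sizeof(uint32_t));               // 4-byte count header
    return sizeof(uint32_t) + streamvbyte_encode(in, count, out + sizeof(uint32_t));
}

static size_t framed_decode(const uint8_t *in, uint32_t *out, uint32_t *count) {
    memcpy(count, in, sizeof(uint32_t));                 // read the count back
    return sizeof(uint32_t) + streamvbyte_decode(in + sizeof(uint32_t), out, *count);
}
```

The header costs four bytes per block and keeps the stream self-describing.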

See also
--------
* SIMDCompressionAndIntersection: A C++ library to compress and intersect sorted lists of integers using SIMD instructions https://github.com/lemire/SIMDCompressionAndIntersect
* The FastPFOR C++ library: Fast integer compression https://github.com/lemire/FastPFor
* High-performance dictionary coding https://github.com/lemire/dictionary
* LittleIntPacker: C library to pack and unpack short arrays of integers as fast as possible https://github.com/lemire/LittleIntPacker
* The SIMDComp library: A simple C library for compressing lists of integers using binary packing https://github.com/lemire/simdcomp
* MaskedVByte: Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
* CSharpFastPFOR: A C# integer compression library https://github.com/Genbox/CSharpFastPFOR
* JavaFastPFOR: A Java integer compression library https://github.com/lemire/JavaFastPFOR
* Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding
* FrameOfReference is a C++ library dedicated to frame-of-reference (FOR) compression: https://github.com/lemire/FrameOfReference
* libvbyte: A fast implementation for varbyte 32bit/64bit integer compression https://github.com/cruppstahl/libvbyte
* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
cpp/streamvbyte/example.c (new file, 24 lines)

@@ -0,0 +1,24 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#include "streamvbyte.h"

int main() {
    int N = 5000;
    uint32_t * datain = malloc(N * sizeof(uint32_t));
    uint8_t * compressedbuffer = malloc(N * sizeof(uint32_t));
    uint32_t * recovdata = malloc(N * sizeof(uint32_t));
    for (int k = 0; k < N; ++k)
        datain[k] = 120;
    size_t compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding
    // here the result is stored in compressedbuffer using compsize bytes
    size_t compsize2 = streamvbyte_decode(compressedbuffer, recovdata,
                                          N); // decoding (fast)
    assert(compsize == compsize2);
    free(datain);
    free(compressedbuffer);
    free(recovdata);
    printf("Compressed %d integers down to %d bytes.\n", N, (int) compsize);
    return 0;
}
cpp/streamvbyte/include/streamvbyte.h (new file, 19 lines)

@@ -0,0 +1,19 @@
#ifndef VARINTDECODE_H_
#define VARINTDECODE_H_
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <stdint.h> // please use a C99-compatible compiler
#include <stddef.h>

// Encode an array of a given length read from in to bout in varint format.
// Returns the number of bytes written.
size_t streamvbyte_encode(const uint32_t *in, uint32_t length, uint8_t *out);

// Read "length" 32-bit integers in varint format from in, storing the result in out.
// Returns the number of bytes read.
size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t length);

#endif /* VARINTDECODE_H_ */
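To make the control-byte layout concrete, here is a small self-check. The byte-length buckets (1, 2, 3 or 4 data bytes) and the LSB-first 2-bit packing are the ones implemented in `src/streamvbyte.c` further down, so the expected values below are derived from that code rather than from this header:

```C
#include <assert.h>
#include <stdint.h>
#include "streamvbyte.h"

int main() {
    // One quad of values needing 1, 2, 3 and 4 data bytes respectively.
    uint32_t quad[4] = { 3, 300, 70000, 16777216 };
    uint8_t buf[1 + 16];   // one control/key byte plus at most 4 bytes per value
    uint32_t back[4];
    size_t used = streamvbyte_encode(quad, 4, buf);
    assert(used == 1 + (1 + 2 + 3 + 4)); // one key byte, then 10 data bytes
    assert(buf[0] == 0xE4);              // 2-bit codes 0,1,2,3 packed LSB-first
    assert(streamvbyte_decode(buf, back, 4) == used);
    return 0;
}
```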
cpp/streamvbyte/include/streamvbytedelta.h (new file, 24 lines)

@@ -0,0 +1,24 @@
/*
 * streamvbytedelta.h
 *
 *  Created on: Apr 14, 2016
 *      Author: lemire
 */

#ifndef INCLUDE_STREAMVBYTEDELTA_H_
#define INCLUDE_STREAMVBYTEDELTA_H_

// Encode an array of a given length read from in to bout in StreamVByte format.
// Returns the number of bytes written.
// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t length, uint8_t *out, uint32_t prev);

// Read "length" 32-bit integers in StreamVByte format from in, storing the result in out.
// Returns the number of bytes read.
// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out, uint32_t length, uint32_t prev);

#endif /* INCLUDE_STREAMVBYTEDELTA_H_ */
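The delta variants are easiest to see end to end on a sorted array. A minimal sketch, compiled with `-Iinclude` as the makefile below does; the `N * sizeof(uint32_t)`-byte output buffer mirrors example.c and is an assumption rather than a documented bound:

```C
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include "streamvbytedelta.h"

int main() {
    uint32_t N = 1000;
    uint32_t *sorted = malloc(N * sizeof(uint32_t));
    uint8_t *buf = malloc(N * sizeof(uint32_t)); // sizing mirrors example.c
    uint32_t *back = malloc(N * sizeof(uint32_t));
    for (uint32_t k = 0; k < N; ++k)
        sorted[k] = 3 * k;                       // increasing input, small deltas
    // Differences are taken starting from prev = 0, so pass the same prev
    // to the decoder to recover the original values.
    size_t used = streamvbyte_delta_encode(sorted, N, buf, 0);
    size_t read = streamvbyte_delta_decode(buf, back, N, 0);
    assert(used == read);
    for (uint32_t k = 0; k < N; ++k)
        assert(back[k] == sorted[k]);
    free(sorted); free(buf); free(back);
    return 0;
}
```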
cpp/streamvbyte/makefile (new file, 58 lines)

@@ -0,0 +1,58 @@
# minimalist makefile
.SUFFIXES:
#
.SUFFIXES: .cpp .o .c .h

CFLAGS = -fPIC -march=native -std=c99 -O3 -Wall -Wextra -pedantic -Wshadow
LDFLAGS = -shared
LIBNAME=libstreamvbyte.so.0.0.1
all: unit $(LIBNAME)
test:
	./unit
install: $(OBJECTS)
	cp $(LIBNAME) /usr/local/lib
	ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libstreamvbyte.so
	ldconfig
	cp $(HEADERS) /usr/local/include

HEADERS=./include/streamvbyte.h ./include/streamvbytedelta.h

uninstall:
	for h in $(HEADERS) ; do rm /usr/local/$$h; done
	rm /usr/local/lib/$(LIBNAME)
	rm /usr/local/lib/libstreamvbyte.so
	ldconfig

OBJECTS= streamvbyte.o streamvbytedelta.o

streamvbytedelta.o: ./src/streamvbytedelta.c $(HEADERS)
	$(CC) $(CFLAGS) -c ./src/streamvbytedelta.c -Iinclude

streamvbyte.o: ./src/streamvbyte.c $(HEADERS)
	$(CC) $(CFLAGS) -c ./src/streamvbyte.c -Iinclude

$(LIBNAME): $(OBJECTS)
	$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS)

example: ./example.c $(HEADERS) $(OBJECTS)
	$(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS)

unit: ./tests/unit.c $(HEADERS) $(OBJECTS)
	$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude $(OBJECTS)

dynunit: ./tests/unit.c $(HEADERS) $(LIBNAME)
	$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude -lstreamvbyte

clean:
	rm -f unit *.o $(LIBNAME) example
cpp/streamvbyte/src/streamvbyte.c (new file, 495 lines)

@@ -0,0 +1,495 @@
#include "streamvbyte.h"
#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__GNUC__) && defined(__ARM_NEON__)
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif

static uint8_t lengthTable[256] = { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,
    10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
    /* ... 256 total data-byte lengths, one per key byte ... */
    13, 14, 15, 16 };

static uint8_t shuffleTable[256][16] = { { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1,
    -1, -1, 3, -1, -1, -1 }, // 1111
    { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 2111
    { 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 3111
    /* ... 256 total 16-byte pshufb masks, one per key byte ... */
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } // 4444
};

static uint8_t _encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
    uint8_t *dataPtr = *dataPtrPtr;
    uint8_t code;

    if (val < (1 << 8)) { // 1 byte
        *dataPtr = (uint8_t)(val);
        *dataPtrPtr += 1;
        code = 0;
    } else if (val < (1 << 16)) { // 2 bytes
        *(uint16_t *) dataPtr = (uint16_t)(val);
        *dataPtrPtr += 2;
        code = 1;
    } else if (val < (1 << 24)) { // 3 bytes
        *(uint16_t *) dataPtr = (uint16_t)(val);
        *(dataPtr + 2) = (uint8_t)(val >> 16);
        *dataPtrPtr += 3;
        code = 2;
    } else { // 4 bytes
        *(uint32_t *) dataPtr = val;
        *dataPtrPtr += 4;
        code = 3;
    }

    return code;
}

static uint8_t *svb_encode_scalar(const uint32_t *in,
        uint8_t *__restrict__ keyPtr, uint8_t *__restrict__ dataPtr,
        uint32_t count) {
    if (count == 0)
        return dataPtr; // exit immediately if no data

    uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ...
    uint8_t key = 0;
    for (uint32_t c = 0; c < count; c++) {
        if (shift == 8) {
            shift = 0;
            *keyPtr++ = key;
            key = 0;
        }
        uint32_t val = in[c];
        uint8_t code = _encode_data(val, &dataPtr);
        key |= code << shift;
        shift += 2;
    }

    *keyPtr = key; // write last key (no increment needed)
    return dataPtr; // pointer to first unused data byte
}

// Encode an array of a given length read from in to bout in streamvbyte format.
// Returns the number of bytes written.
size_t streamvbyte_encode(const uint32_t *in, uint32_t count, uint8_t *out) {
    uint8_t *keyPtr = out;
    uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte
    uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
    return svb_encode_scalar(in, keyPtr, dataPtr, count) - out;
}

static inline __m128i _decode_avx(uint32_t key,
        const uint8_t *__restrict__ *dataPtrPtr) {
    uint8_t len = lengthTable[key];
    __m128i Data = _mm_loadu_si128((__m128i *) *dataPtrPtr);
    __m128i Shuf = *(__m128i *) &shuffleTable[key];

    Data = _mm_shuffle_epi8(Data, Shuf);
    *dataPtrPtr += len;
    return Data;
}

static inline void _write_avx(uint32_t *out, __m128i Vec) {
    _mm_storeu_si128((__m128i *) out, Vec);
}

static inline uint32_t _decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
    const uint8_t *dataPtr = *dataPtrPtr;
    uint32_t val;

    if (code == 0) { // 1 byte
        val = (uint32_t) * dataPtr;
        dataPtr += 1;
    } else if (code == 1) { // 2 bytes
        val = (uint32_t) * (uint16_t *) dataPtr;
        dataPtr += 2;
    } else if (code == 2) { // 3 bytes
        val = (uint32_t) * (uint16_t *) dataPtr;
        val |= *(dataPtr + 2) << 16;
        dataPtr += 3;
    } else { // code == 3
        val = *(uint32_t *) dataPtr; // 4 bytes
        dataPtr += 4;
    }

    *dataPtrPtr = dataPtr;
    return val;
}
static const uint8_t *svb_decode_scalar(uint32_t *outPtr, const uint8_t *keyPtr,
        const uint8_t *dataPtr, uint32_t count) {
    if (count == 0)
        return dataPtr; // no reads or writes if no data

    uint8_t shift = 0;
    uint32_t key = *keyPtr++;
    for (uint32_t c = 0; c < count; c++) {
        if (shift == 8) {
            shift = 0;
            key = *keyPtr++;
        }
        uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3);
        *outPtr++ = val;
        shift += 2;
    }

    return dataPtr; // pointer to first unused byte after end
}

const uint8_t *svb_decode_avx_simple(uint32_t *out,
        const uint8_t *__restrict__ keyPtr, const uint8_t *__restrict__ dataPtr,
        uint64_t count) {

    uint64_t keybytes = count / 4; // number of key bytes
    __m128i Data;
    if (keybytes >= 8) {

        int64_t Offset = -(int64_t) keybytes / 8 + 1;

        const uint64_t *keyPtr64 = (const uint64_t *) keyPtr - Offset;
        uint64_t nextkeys = keyPtr64[Offset];
        for (; Offset != 0; ++Offset) {
            uint64_t keys = nextkeys;
            nextkeys = keyPtr64[Offset + 1];

            Data = _decode_avx((keys & 0xFF), &dataPtr);
            _write_avx(out, Data);
            Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
            _write_avx(out + 4, Data);

            keys >>= 16;
            Data = _decode_avx((keys & 0xFF), &dataPtr);
            _write_avx(out + 8, Data);
            Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
            _write_avx(out + 12, Data);

            keys >>= 16;
            Data = _decode_avx((keys & 0xFF), &dataPtr);
            _write_avx(out + 16, Data);
            Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
            _write_avx(out + 20, Data);

            keys >>= 16;
            Data = _decode_avx((keys & 0xFF), &dataPtr);
            _write_avx(out + 24, Data);
            Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
            _write_avx(out + 28, Data);

            out += 32;
        }
        {
            uint64_t keys = nextkeys;

            Data = _decode_avx((keys & 0xFF), &dataPtr);
            _write_avx(out, Data);
            Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
            _write_avx(out + 4, Data);

            keys >>= 16;
            Data = _decode_avx((keys & 0xFF), &dataPtr);
            _write_avx(out + 8, Data);
            Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
            _write_avx(out + 12, Data);

            keys >>= 16;
            Data = _decode_avx((keys & 0xFF), &dataPtr);
            _write_avx(out + 16, Data);
            Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
            _write_avx(out + 20, Data);

            keys >>= 16;
            Data = _decode_avx((keys & 0xFF), &dataPtr);
            _write_avx(out + 24, Data);
            Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
            _write_avx(out + 28, Data);

            out += 32;
        }
    }
    uint64_t consumedkeys = keybytes - (keybytes & 7);
    return svb_decode_scalar(out, keyPtr + consumedkeys, dataPtr, count & 31);
}

// Read count 32-bit integers in maskedvbyte format from in, storing the result in out. Returns the number of bytes read.
size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t count) {
    if (count == 0)
        return 0;
    const uint8_t *keyPtr = in; // full list of keys is next
    uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up)
    const uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys
    return svb_decode_avx_simple(out, keyPtr, dataPtr, count) - in;

}
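As the encoder above lays out `(count + 3) / 4` key bytes followed by at most four data bytes per integer, a safe output-buffer size can be computed up front. A small sketch; the helper name is hypothetical and not part of the library:

```C
#include <stddef.h>
#include <stdint.h>

// Hypothetical helper: worst-case output size of streamvbyte_encode for
// `count` integers: (count + 3) / 4 control bytes plus up to 4 bytes each.
static size_t streamvbyte_max_compressed_bytes(uint32_t count) {
    return ((size_t)count + 3) / 4 + (size_t)count * sizeof(uint32_t);
}
```

Note that example.c simply allocates `N * sizeof(uint32_t)` bytes, which is only safe when the data is known to compress.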
cpp/streamvbyte/src/streamvbytedelta.c (new file, 575 lines)

@@ -0,0 +1,575 @@
#include "streamvbyte.h"
#if defined(_MSC_VER)
/* Microsoft C/C++-compatible compiler */
#include <intrin.h>
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
/* GCC-compatible compiler, targeting x86/x86-64 */
#include <x86intrin.h>
#elif defined(__GNUC__) && defined(__ARM_NEON__)
/* GCC-compatible compiler, targeting ARM with NEON */
#include <arm_neon.h>
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
#elif defined(__GNUC__) && defined(__SPE__)
/* GCC-compatible compiler, targeting PowerPC with SPE */
#include <spe.h>
#endif

static uint8_t lengthTable[256] = { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,
    10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
    /* ... same 256-entry length table as in streamvbyte.c ... */
    13, 14, 15, 16 };

static uint8_t shuffleTable[256][16] = { { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1,
    -1, -1, 3, -1, -1, -1 }, // 1111
    { 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 2111
    /* ... same 256-entry shuffle table as in streamvbyte.c ... */
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 1341
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 2341
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 3341
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 }, // 4341
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 }, // 1441
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 }, // 2441
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 }, // 3441
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 }, // 4441
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 }, // 1112
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 2112
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 3112
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 4112
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 1212
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 2212
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 3212
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 4212
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 1312
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 2312
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 3312
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 }, // 4312
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 }, // 1412
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 }, // 2412
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 }, // 3412
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 }, // 4412
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 }, // 1122
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 2122
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 3122
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 4122
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 1222
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 2222
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 3222
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 4222
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 1322
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 2322
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 3322
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 }, // 4322
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 }, // 1422
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 }, // 2422
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 }, // 3422
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 }, // 4422
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1 }, // 1132
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 2132
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 3132
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 4132
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 1232
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 2232
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 3232
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 4232
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 1332
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 2332
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 3332
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 }, // 4332
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 }, // 1432
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 }, // 2432
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 }, // 3432
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 }, // 4432
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 }, // 1142
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 2142
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 3142
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 4142
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 1242
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 2242
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 3242
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 4242
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 1342
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 2342
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 3342
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 }, // 4342
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 }, // 1442
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 }, // 2442
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 }, // 3442
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 }, // 4442
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 }, // 1113
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 2113
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 3113
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 4113
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 1213
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 2213
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 3213
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 4213
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 1313
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 2313
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 3313
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 }, // 4313
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 }, // 1413
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 }, // 2413
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 }, // 3413
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 }, // 4413
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 }, // 1123
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 2123
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 3123
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 4123
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 1223
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 2223
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 3223
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 4223
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 1323
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 2323
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 3323
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 }, // 4323
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 }, // 1423
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 }, // 2423
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 }, // 3423
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 }, // 4423
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 }, // 1133
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 2133
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 3133
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 4133
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 1233
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 2233
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 3233
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 4233
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 1333
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 2333
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 3333
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 }, // 4333
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 }, // 1433
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 }, // 2433
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 }, // 3433
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 }, // 4433
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 }, // 1143
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 2143
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 3143
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 4143
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 1243
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 2243
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 3243
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 4243
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 1343
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 2343
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 3343
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 }, // 4343
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 }, // 1443
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 }, // 2443
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 }, // 3443
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 }, // 4443
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 }, // 1114
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 2114
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 3114
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 4114
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 1214
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 2214
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 3214
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 4214
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 1314
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 2314
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 3314
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 }, // 4314
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 }, // 1414
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 }, // 2414
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 }, // 3414
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 }, // 4414
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 }, // 1124
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 2124
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 3124
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 4124
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 1224
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 2224
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 3224
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 4224
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 1324
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 2324
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 3324
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 }, // 4324
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 }, // 1424
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 }, // 2424
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 }, // 3424
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 }, // 4424
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8 }, // 1134
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 2134
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 3134
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 4134
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 1234
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 2234
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 3234
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 4234
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 1334
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 2334
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 3334
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 }, // 4334
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 }, // 1434
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 }, // 2434
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 }, // 3434
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 }, // 4434
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 }, // 1144
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 2144
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 3144
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 4144
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 1244
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 2244
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 3244
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 4244
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 1344
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 2344
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 3344
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 }, // 4344
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, // 1444
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, // 2444
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, // 3444
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } // 4444
|
||||
};
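The two 256-entry tables above are mechanical: each key byte packs four 2-bit length codes (code 0-3 meaning a 1-4 byte payload), lengthTable gives the total payload length for one key byte, and shuffleTable is the pshufb mask that expands the packed bytes into four little-endian 32-bit lanes. As an illustrative sketch (Rust, not part of the patch, names are made up), they can be generated rather than written out by hand:

fn build_tables() -> ([u8; 256], [[i8; 16]; 256]) {
    let mut length_table = [0u8; 256];
    let mut shuffle_table = [[-1i8; 16]; 256]; // -1 lanes are zeroed by pshufb
    for key in 0..256usize {
        let mut src: i8 = 0; // index into the packed payload bytes
        for lane in 0..4 {
            let len = ((key >> (2 * lane)) & 0b11) + 1; // 2-bit code -> 1..=4 bytes
            for byte in 0..len {
                shuffle_table[key][4 * lane + byte] = src; // low bytes of the 32-bit lane
                src += 1;
            }
            length_table[key] += len as u8;
        }
    }
    (length_table, shuffle_table)
}

fn main() {
    let (length_table, shuffle_table) = build_tables();
    assert_eq!(length_table[0], 4);    // key 0: four 1-byte values ("1111")
    assert_eq!(length_table[255], 16); // key 255: four 4-byte values ("4444")
    let expected_1111: [i8; 16] = [0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1];
    assert_eq!(shuffle_table[0], expected_1111); // matches the first row of shuffleTable
}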
|
||||
|
||||
static uint8_t _encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
|
||||
uint8_t *dataPtr = *dataPtrPtr;
|
||||
uint8_t code;
|
||||
|
||||
if (val < (1 << 8)) { // 1 byte
|
||||
*dataPtr = (uint8_t)(val);
|
||||
*dataPtrPtr += 1;
|
||||
code = 0;
|
||||
} else if (val < (1 << 16)) { // 2 bytes
|
||||
*(uint16_t *) dataPtr = (uint16_t)(val);
|
||||
*dataPtrPtr += 2;
|
||||
code = 1;
|
||||
} else if (val < (1 << 24)) { // 3 bytes
|
||||
*(uint16_t *) dataPtr = (uint16_t)(val);
|
||||
*(dataPtr + 2) = (uint8_t)(val >> 16);
|
||||
*dataPtrPtr += 3;
|
||||
code = 2;
|
||||
} else { // 4 bytes
|
||||
*(uint32_t *) dataPtr = val;
|
||||
*dataPtrPtr += 4;
|
||||
code = 3;
|
||||
}
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
static uint8_t *svb_encode_scalar_d1_init(const uint32_t *in,
|
||||
uint8_t *__restrict__ keyPtr, uint8_t *__restrict__ dataPtr,
|
||||
uint32_t count, uint32_t prev) {
|
||||
if (count == 0)
|
||||
return dataPtr; // exit immediately if no data
|
||||
|
||||
uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ...
|
||||
uint8_t key = 0;
|
||||
for (uint32_t c = 0; c < count; c++) {
|
||||
if (shift == 8) {
|
||||
shift = 0;
|
||||
*keyPtr++ = key;
|
||||
key = 0;
|
||||
}
|
||||
uint32_t val = in[c] - prev;
|
||||
prev = in[c];
|
||||
uint8_t code = _encode_data(val, &dataPtr);
|
||||
key |= code << shift;
|
||||
shift += 2;
|
||||
}
|
||||
|
||||
*keyPtr = key; // write last key (no increment needed)
|
||||
return dataPtr; // pointer to first unused data byte
|
||||
}
|
||||
|
||||
size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t count, uint8_t *out,
|
||||
uint32_t prev) {
|
||||
uint8_t *keyPtr = out; // keys come immediately after 32-bit count
|
||||
uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte
|
||||
uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
|
||||
|
||||
return svb_encode_scalar_d1_init(in, keyPtr, dataPtr, count, prev) - out;
|
||||
|
||||
}
|
||||
|
||||
static inline __m128i _decode_avx(uint32_t key, const uint8_t *__restrict__ *dataPtrPtr) {
|
||||
uint8_t len = lengthTable[key];
|
||||
__m128i Data = _mm_loadu_si128((__m128i *) *dataPtrPtr);
|
||||
__m128i Shuf = *(__m128i *) &shuffleTable[key];
|
||||
|
||||
Data = _mm_shuffle_epi8(Data, Shuf);
|
||||
*dataPtrPtr += len;
|
||||
|
||||
return Data;
|
||||
}
|
||||
#define BroadcastLastXMM 0xFF // bits 0-7 all set to choose highest element
|
||||
|
||||
|
||||
|
||||
static inline void _write_avx(uint32_t *out, __m128i Vec) {
|
||||
_mm_storeu_si128((__m128i *) out, Vec);
|
||||
}
|
||||
|
||||
static __m128i _write_avx_d1(uint32_t *out, __m128i Vec, __m128i Prev) {
|
||||
__m128i Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done)
|
||||
Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P]
|
||||
Vec = _mm_add_epi32(Vec, Add); // Cycle 2: [A AB BC CD]
|
||||
Add = _mm_slli_si128(Vec, 8); // Cycle 3: [- - A AB]
|
||||
Vec = _mm_add_epi32(Vec, Prev); // Cycle 3: [PA PAB PBC PCD]
|
||||
Vec = _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD]
|
||||
|
||||
_write_avx(out, Vec);
|
||||
return Vec;
|
||||
}
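For reference, an illustrative scalar equivalent (Rust, not part of the patch, the function name is made up) of what _write_avx_d1 computes: a running prefix sum of four decoded deltas, seeded with the last value written so far, whose final value seeds the next group:

fn write_d1_scalar(out: &mut [u32; 4], deltas: [u32; 4], prev: u32) -> u32 {
    // out[i] = prev + deltas[0] + ... + deltas[i]
    let mut acc = prev;
    for (slot, delta) in out.iter_mut().zip(deltas.iter()) {
        acc = acc.wrapping_add(*delta);
        *slot = acc;
    }
    acc
}

fn main() {
    let mut out = [0u32; 4];
    let last = write_d1_scalar(&mut out, [3, 14, 170, 1], 0);
    assert_eq!(out, [3, 17, 187, 188]);
    assert_eq!(last, 188);
}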
|
||||
|
||||
#ifndef _MSC_VER
|
||||
static __m128i High16To32 = {0xFFFF0B0AFFFF0908, 0xFFFF0F0EFFFF0D0C};
|
||||
#else
|
||||
static __m128i High16To32 = {8, 9, -1, -1, 10, 11, -1, -1,
|
||||
12, 13, -1, -1, 14, 15, -1, -1};
|
||||
#endif
|
||||
|
||||
static inline __m128i _write_16bit_avx_d1(uint32_t *out, __m128i Vec, __m128i Prev) {
|
||||
// vec == [A B C D E F G H] (16 bit values)
|
||||
__m128i Add = _mm_slli_si128(Vec, 2); // [- A B C D E F G]
|
||||
Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit)
|
||||
Vec = _mm_add_epi32(Vec, Add); // [A AB BC CD DE FG GH]
|
||||
Add = _mm_slli_si128(Vec, 4); // [- - A AB BC CD DE EF]
|
||||
Vec = _mm_add_epi32(Vec, Add); // [A AB ABC ABCD BCDE CDEF DEFG EFGH]
|
||||
__m128i V1 = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit)
|
||||
V1 = _mm_add_epi32(V1, Prev); // [PA PAB PABC PABCD] (32-bit)
|
||||
__m128i V2 =
|
||||
_mm_shuffle_epi8(Vec, High16To32); // [BCDE CDEF DEFG EFGH] (32-bit)
|
||||
V2 = _mm_add_epi32(V1, V2); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit)
|
||||
_write_avx(out, V1);
|
||||
_write_avx(out + 4, V2);
|
||||
return V2;
|
||||
}
|
||||
|
||||
static inline uint32_t _decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
|
||||
const uint8_t *dataPtr = *dataPtrPtr;
|
||||
uint32_t val;
|
||||
|
||||
if (code == 0) { // 1 byte
|
||||
val = (uint32_t) * dataPtr;
|
||||
dataPtr += 1;
|
||||
} else if (code == 1) { // 2 bytes
|
||||
val = (uint32_t) * (uint16_t *) dataPtr;
|
||||
dataPtr += 2;
|
||||
} else if (code == 2) { // 3 bytes
|
||||
val = (uint32_t) * (uint16_t *) dataPtr;
|
||||
val |= *(dataPtr + 2) << 16;
|
||||
dataPtr += 3;
|
||||
} else { // code == 3
|
||||
val = *(uint32_t *) dataPtr; // 4 bytes
|
||||
dataPtr += 4;
|
||||
}
|
||||
|
||||
*dataPtrPtr = dataPtr;
|
||||
return val;
|
||||
}
|
||||
|
||||
const uint8_t *svb_decode_scalar_d1_init(uint32_t *outPtr, const uint8_t *keyPtr,
|
||||
const uint8_t *dataPtr, uint32_t count,
|
||||
uint32_t prev) {
|
||||
if (count == 0)
|
||||
return dataPtr; // no reads or writes if no data
|
||||
|
||||
uint8_t shift = 0;
|
||||
uint32_t key = *keyPtr++;
|
||||
|
||||
for (uint32_t c = 0; c < count; c++) {
|
||||
if (shift == 8) {
|
||||
shift = 0;
|
||||
key = *keyPtr++;
|
||||
}
|
||||
uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3);
|
||||
val += prev;
|
||||
*outPtr++ = val;
|
||||
prev = val;
|
||||
shift += 2;
|
||||
}
|
||||
|
||||
return dataPtr; // pointer to first unused byte after end
|
||||
}
|
||||
|
||||
const uint8_t *svb_decode_avx_d1_init(uint32_t *out, const uint8_t *__restrict__ keyPtr,
|
||||
const uint8_t *__restrict__ dataPtr, uint64_t count, uint32_t prev) {
|
||||
uint64_t keybytes = count / 4; // number of key bytes
|
||||
if (keybytes >= 8) {
|
||||
__m128i Prev = _mm_set1_epi32(prev);
|
||||
__m128i Data;
|
||||
|
||||
int64_t Offset = -(int64_t) keybytes / 8 + 1;
|
||||
|
||||
const uint64_t *keyPtr64 = (const uint64_t *) keyPtr - Offset;
|
||||
uint64_t nextkeys = keyPtr64[Offset];
|
||||
for (; Offset != 0; ++Offset) {
|
||||
uint64_t keys = nextkeys;
|
||||
nextkeys = keyPtr64[Offset + 1];
|
||||
// faster 16-bit delta since we only have 8-bit values
|
||||
if (!keys) { // 32 1-byte ints in a row
|
||||
|
||||
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((__m128i *) (dataPtr)));
|
||||
Prev = _write_16bit_avx_d1(out, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_lddqu_si128((__m128i *) (dataPtr + 8)));
|
||||
Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_lddqu_si128((__m128i *) (dataPtr + 16)));
|
||||
Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_lddqu_si128((__m128i *) (dataPtr + 24)));
|
||||
Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
|
||||
out += 32;
|
||||
dataPtr += 32;
|
||||
continue;
|
||||
}
|
||||
|
||||
Data = _decode_avx(keys & 0x00FF, &dataPtr);
|
||||
Prev = _write_avx_d1(out, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 4, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 8, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 12, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 16, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 20, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 24, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 28, Data, Prev);
|
||||
|
||||
out += 32;
|
||||
}
|
||||
{
|
||||
uint64_t keys = nextkeys;
|
||||
// faster 16-bit delta since we only have 8-bit values
|
||||
if (!keys) { // 32 1-byte ints in a row
|
||||
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((__m128i *) (dataPtr)));
|
||||
Prev = _write_16bit_avx_d1(out, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_lddqu_si128((__m128i *) (dataPtr + 8)));
|
||||
Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_lddqu_si128((__m128i *) (dataPtr + 16)));
|
||||
Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_loadl_epi64((__m128i *) (dataPtr + 24)));
|
||||
Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
|
||||
out += 32;
|
||||
dataPtr += 32;
|
||||
|
||||
} else {
|
||||
|
||||
Data = _decode_avx(keys & 0x00FF, &dataPtr);
|
||||
Prev = _write_avx_d1(out, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 4, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 8, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 12, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 16, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 20, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 24, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 28, Data, Prev);
|
||||
|
||||
out += 32;
|
||||
}
|
||||
}
|
||||
prev = out[-1];
|
||||
}
|
||||
uint64_t consumedkeys = keybytes - (keybytes & 7);
|
||||
return svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr,
|
||||
count & 31, prev);
|
||||
}
|
||||
|
||||
size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out,
|
||||
uint32_t count, uint32_t prev) {
|
||||
uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up)
|
||||
const uint8_t *keyPtr = in;
|
||||
const uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys
|
||||
return svb_decode_avx_d1_init(out, keyPtr, dataPtr, count, prev) - in;
|
||||
}
|
||||
73
cpp/streamvbyte/tests/unit.c
Normal file
@@ -0,0 +1,73 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "streamvbyte.h"
|
||||
#include "streamvbytedelta.h"
|
||||
|
||||
int main() {
|
||||
int N = 4096;
|
||||
uint32_t * datain = malloc(N * sizeof(uint32_t));
|
||||
uint8_t * compressedbuffer = malloc(2 * N * sizeof(uint32_t));
|
||||
uint32_t * recovdata = malloc(N * sizeof(uint32_t));
|
||||
|
||||
for (int length = 0; length <= N;) {
|
||||
printf("length = %d \n", length);
|
||||
for (uint32_t gap = 1; gap <= 387420489; gap *= 3) {
|
||||
for (int k = 0; k < length; ++k)
|
||||
datain[k] = gap;
|
||||
size_t compsize = streamvbyte_encode(datain, length,
|
||||
compressedbuffer);
|
||||
size_t usedbytes = streamvbyte_decode(compressedbuffer, recovdata,
|
||||
length);
|
||||
if (compsize != usedbytes) {
|
||||
printf(
|
||||
"[streamvbyte_decode] code is buggy gap = %d, size mismatch %d %d \n",
|
||||
(int) gap, (int) compsize, (int) usedbytes);
|
||||
return -1;
|
||||
}
|
||||
for (int k = 0; k < length; ++k) {
|
||||
if (recovdata[k] != datain[k]) {
|
||||
printf("[streamvbyte_decode] code is buggy gap = %d\n",
|
||||
(int) gap);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("Delta \n");
|
||||
for (size_t gap = 1; gap <= 531441; gap *= 3) {
|
||||
for (int k = 0; k < length; ++k)
|
||||
datain[k] = gap * k;
|
||||
size_t compsize = streamvbyte_delta_encode(datain, length,
|
||||
compressedbuffer, 0);
|
||||
size_t usedbytes = streamvbyte_delta_decode(compressedbuffer,
|
||||
recovdata, length, 0);
|
||||
if (compsize != usedbytes) {
|
||||
printf(
|
||||
"[streamvbyte_delta_decode] code is buggy gap = %d, size mismatch %d %d \n",
|
||||
(int) gap, (int) compsize, (int) usedbytes);
|
||||
return -1;
|
||||
}
|
||||
for (int k = 0; k < length; ++k) {
|
||||
if (recovdata[k] != datain[k]) {
|
||||
printf(
|
||||
"[streamvbyte_delta_decode] code is buggy gap = %d\n",
|
||||
(int) gap);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (length < 128)
|
||||
++length;
|
||||
else {
|
||||
length *= 2;
|
||||
}
|
||||
}
|
||||
free(datain);
|
||||
free(compressedbuffer);
|
||||
free(recovdata);
|
||||
printf("Code looks good.\n");
|
||||
return 0;
|
||||
}
|
||||
@@ -1,4 +1,3 @@
|
||||
extern crate rustc_serialize;
|
||||
extern crate tantivy;
|
||||
extern crate tempdir;
|
||||
|
||||
|
||||
@@ -74,7 +74,8 @@ pub mod tests {
|
||||
use Score;
|
||||
use core::SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use fastfield::U32FastFieldReader;
|
||||
use fastfield::U64FastFieldReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
@@ -125,9 +126,9 @@ pub mod tests {
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct FastFieldTestCollector {
|
||||
vals: Vec<u32>,
|
||||
vals: Vec<u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<U32FastFieldReader>,
|
||||
ff_reader: Option<U64FastFieldReader>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
@@ -139,14 +140,14 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vals(self,) -> Vec<u32> {
|
||||
pub fn vals(self,) -> Vec<u64> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = reader.get_fast_field_reader(self.field);
|
||||
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -4,8 +4,38 @@ use common::serialize::BinarySerializable;
use std::mem;


pub fn compute_num_bits(amplitude: u32) -> u8 {
    (32u32 - amplitude.leading_zeros()) as u8
}

/// Computes the number of bits that will be used for bitpacking.
///
/// In general the target is the minimum number of bits
/// required to express the amplitude given in argument.
///
/// e.g. if the amplitude is 10, we can store all ints on 4 bits.
///
/// The logic is slightly more convoluted here because, for optimization
/// reasons, we want to ensure that a value spans over at most 8
/// aligned bytes.
///
/// Spanning over 9 bytes is possible, for instance, if we do
/// bitpacking with an amplitude of 63 bits.
/// In this case, the second int will start on bit
/// 63 (which belongs to byte 7) and end at byte 15;
/// hence 9 bytes (from byte 7 to byte 15 included).
///
/// To avoid this, we force the number of bits to 64
/// when the result is greater than `64 - 8 = 56` bits.
///
/// Note that this only affects rare use cases spanning over
/// a very large range of values. Even in this case, it results
/// in an extra cost of at most 12% compared to the optimal
/// number of bits.
pub fn compute_num_bits(amplitude: u64) -> u8 {
    let amplitude = (64u32 - amplitude.leading_zeros()) as u8;
    if amplitude <= 64 - 8 {
        amplitude
    }
    else {
        64
    }
}
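A small illustrative check (Rust, restated as a standalone function, not part of the patch) of the behaviour described above: widths up to 56 bits stay minimal, anything larger is rounded up to 64 bits:

fn num_bits(amplitude: u64) -> u8 {
    // same logic as the new compute_num_bits above, restated for illustration
    let bits = (64u32 - amplitude.leading_zeros()) as u8;
    if bits <= 64 - 8 { bits } else { 64 }
}

fn main() {
    assert_eq!(num_bits(0), 0);
    assert_eq!(num_bits(10), 4);                 // 10 fits on 4 bits
    assert_eq!(num_bits((1u64 << 56) - 1), 56);  // still within the 56-bit budget
    assert_eq!(num_bits(1u64 << 56), 64);        // 57..=63 bits are rounded up to 64
}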
|
||||
|
||||
pub struct BitPacker {
|
||||
@@ -15,7 +45,7 @@ pub struct BitPacker {
|
||||
written_size: usize,
|
||||
}
|
||||
|
||||
impl BitPacker {
|
||||
impl BitPacker {
|
||||
|
||||
pub fn new(num_bits: usize) -> BitPacker {
|
||||
BitPacker {
|
||||
@@ -26,7 +56,7 @@ impl BitPacker {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write<TWrite: Write>(&mut self, val: u32, output: &mut TWrite) -> io::Result<()> {
|
||||
pub fn write<TWrite: Write>(&mut self, val: u64, output: &mut TWrite) -> io::Result<()> {
|
||||
let val_u64 = val as u64;
|
||||
if self.mini_buffer_written + self.num_bits > 64 {
|
||||
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
|
||||
@@ -67,22 +97,29 @@ impl BitPacker {
|
||||
|
||||
pub struct BitUnpacker {
|
||||
num_bits: usize,
|
||||
mask: u32,
|
||||
mask: u64,
|
||||
data_ptr: *const u8,
|
||||
data_len: usize,
|
||||
}
|
||||
|
||||
impl BitUnpacker {
|
||||
pub fn new(data: &[u8], num_bits: usize) -> BitUnpacker {
|
||||
let mask: u64 =
|
||||
if num_bits == 64 {
|
||||
!0u64
|
||||
}
|
||||
else {
|
||||
(1u64 << num_bits) - 1u64
|
||||
};
|
||||
BitUnpacker {
|
||||
num_bits: num_bits,
|
||||
mask: (1u32 << num_bits) - 1u32,
|
||||
mask: mask,
|
||||
data_ptr: data.as_ptr(),
|
||||
data_len: data.len()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, idx: usize) -> u32 {
|
||||
pub fn get(&self, idx: usize) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0;
|
||||
}
|
||||
@@ -101,7 +138,7 @@ impl BitUnpacker {
|
||||
}
|
||||
val_unshifted_unmasked = unsafe { mem::transmute::<[u8; 8], u64>(arr) };
|
||||
}
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32;
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
(val_shifted & self.mask)
|
||||
}
|
||||
|
||||
@@ -123,13 +160,14 @@ mod test {
|
||||
assert_eq!(compute_num_bits(4), 3u8);
|
||||
assert_eq!(compute_num_bits(255), 8u8);
|
||||
assert_eq!(compute_num_bits(256), 9u8);
|
||||
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
|
||||
}
|
||||
|
||||
fn test_bitpacker_util(len: usize, num_bits: usize) {
|
||||
let mut data = Vec::new();
|
||||
let mut bitpacker = BitPacker::new(num_bits);
|
||||
let max_val: u32 = (1 << num_bits) - 1;
|
||||
let vals: Vec<u32> = (0u32..len as u32).map(|i| {
|
||||
let max_val: u64 = (1 << num_bits) - 1;
|
||||
let vals: Vec<u64> = (0u64..len as u64).map(|i| {
|
||||
if max_val == 0 {
|
||||
0
|
||||
}
|
||||
|
||||
58
src/common/counting_writer.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
use std::io::Write;
|
||||
use std::io;
|
||||
|
||||
|
||||
pub struct CountingWriter<W: Write> {
|
||||
underlying: W,
|
||||
written_bytes: usize,
|
||||
}
|
||||
|
||||
impl<W: Write> CountingWriter<W> {
|
||||
pub fn wrap(underlying: W) -> CountingWriter<W> {
|
||||
CountingWriter {
|
||||
underlying: underlying,
|
||||
written_bytes: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn written_bytes(&self,) -> usize {
|
||||
self.written_bytes
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> io::Result<(W, usize)> {
|
||||
self.flush()?;
|
||||
Ok((self.underlying, self.written_bytes))
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write> Write for CountingWriter<W> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
let written_size = self.underlying.write(buf)?;
|
||||
self.written_bytes += written_size;
|
||||
Ok(written_size)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.underlying.flush()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::CountingWriter;
|
||||
use std::io::Write;
|
||||
|
||||
#[test]
|
||||
fn test_counting_writer() {
|
||||
let buffer: Vec<u8> = vec!();
|
||||
let mut counting_writer = CountingWriter::wrap(buffer);
|
||||
let bytes = (0u8..10u8).collect::<Vec<u8>>();
|
||||
counting_writer.write_all(&bytes).unwrap();
|
||||
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
|
||||
assert_eq!(len, 10);
|
||||
assert_eq!(w.len(), 10);
|
||||
}
|
||||
}
|
||||
@@ -2,13 +2,15 @@ mod serialize;
|
||||
mod timer;
|
||||
mod vint;
|
||||
pub mod bitpacker;
|
||||
mod counting_writer;
|
||||
|
||||
|
||||
pub use self::serialize::BinarySerializable;
|
||||
pub use self::timer::Timing;
|
||||
pub use self::timer::TimerTree;
|
||||
pub use self::timer::OpenTimer;
|
||||
pub use self::vint::VInt;
|
||||
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
use std::io;
|
||||
|
||||
/// Create a default io error given a string.
|
||||
@@ -28,15 +30,53 @@ pub trait HasLen {
|
||||
}
|
||||
}
|
||||
|
||||
const HIGHEST_BIT: u64 = 1 << 63;

/// Creates an uninitialized Vec of a given usize.
///
/// `allocate_vec` does an unsafe call to `set_len`
/// as other solutions are extremely slow in debug mode.
pub fn allocate_vec<T>(capacity: usize) -> Vec<T> {
    let mut v = Vec::with_capacity(capacity);
    unsafe {
        v.set_len(capacity);
    }
    v
}

/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
#[inline(always)]
pub fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ HIGHEST_BIT
}

/// Reverse the mapping given by
/// `i64_to_u64`.
#[inline(always)]
pub fn u64_to_i64(val: u64) -> i64 {
    (val ^ HIGHEST_BIT) as i64
}


#[cfg(test)]
mod test {

    use super::{i64_to_u64, u64_to_i64};

    fn test_i64_converter_helper(val: i64) {
        assert_eq!(u64_to_i64(i64_to_u64(val)), val);
    }

    #[test]
    fn test_i64_converter() {
        assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
        assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
        test_i64_converter_helper(0i64);
        test_i64_converter_helper(i64::min_value());
        test_i64_converter_helper(i64::max_value());
        for i in -1000i64..1000i64 {
            test_i64_converter_helper(i);
        }
    }
}
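An illustrative check (Rust, not part of the patch) of why the XOR-with-the-sign-bit mapping is preferred over a plain cast: it keeps the natural i64 order under u64 comparison, so a narrow signed range stays a narrow unsigned range after mapping:

fn main() {
    const HIGHEST_BIT: u64 = 1 << 63;
    let to_u64 = |val: i64| (val as u64) ^ HIGHEST_BIT;

    let vals: [i64; 5] = [i64::min_value(), -10, 0, 10, i64::max_value()];
    let mapped: Vec<u64> = vals.iter().map(|&v| to_u64(v)).collect();

    // order is preserved ...
    assert!(mapped.windows(2).all(|w| w[0] < w[1]));
    // ... and the amplitude of -10..=10 stays 20, so few bits are needed after bitpacking,
    // whereas a plain `as u64` cast would spread these values across the whole u64 range.
    assert_eq!(to_u64(10) - to_u64(-10), 20);
}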
|
||||
|
||||
@@ -71,6 +71,16 @@ impl BinarySerializable for u64 {
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for i64 {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
writer.write_i64::<Endianness>(*self)
|
||||
.map(|_| 8)
|
||||
}
|
||||
fn deserialize(reader: &mut Read) -> io::Result<i64> {
|
||||
reader.read_i64::<Endianness>()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl BinarySerializable for u8 {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
|
||||
@@ -33,7 +33,7 @@ impl<'a> Drop for OpenTimer<'a> {
|
||||
}
|
||||
|
||||
/// Timing recording
|
||||
#[derive(Debug, RustcEncodable)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Timing {
|
||||
name: &'static str,
|
||||
duration: i64,
|
||||
@@ -41,7 +41,7 @@ pub struct Timing {
|
||||
}
|
||||
|
||||
/// Timer tree
|
||||
#[derive(Debug, RustcEncodable)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct TimerTree {
|
||||
timings: Vec<Timing>,
|
||||
}
|
||||
|
||||
@@ -110,7 +110,7 @@ pub mod tests {
|
||||
let data = generate_array(10_000, 0.1);
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let compressed = encoder.compress_unsorted(&data);
|
||||
assert_eq!(compressed.len(), 19_790);
|
||||
assert!(compressed.len() <= 19_794);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
let result = decoder.uncompress_unsorted(&compressed, data.len());
|
||||
for i in 0..data.len() {
|
||||
@@ -123,7 +123,7 @@ pub mod tests {
|
||||
let data = generate_array(10_000, 0.1);
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let compressed = encoder.compress_sorted(&data);
|
||||
assert_eq!(compressed.len(), 7_822);
|
||||
assert!(compressed.len() <= 7_826);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
let result = decoder.uncompress_sorted(&compressed, data.len());
|
||||
for i in 0..data.len() {
|
||||
|
||||
@@ -4,16 +4,32 @@
|
||||
mod composite;
|
||||
pub use self::composite::{CompositeEncoder, CompositeDecoder};
|
||||
|
||||
#[cfg(feature="simdcompression")]
|
||||
mod compression_simd;
|
||||
#[cfg(feature="simdcompression")]
|
||||
pub use self::compression_simd::{BlockEncoder, BlockDecoder};
|
||||
|
||||
|
||||
#[cfg(not(feature="simdcompression"))]
|
||||
mod compression_nosimd;
|
||||
#[cfg(not(feature="simdcompression"))]
|
||||
pub use self::compression_nosimd::{BlockEncoder, BlockDecoder};
|
||||
mod pack {
|
||||
mod compression_pack_nosimd;
|
||||
pub use self::compression_pack_nosimd::*;
|
||||
}
|
||||
|
||||
#[cfg(feature="simdcompression")]
|
||||
mod pack {
|
||||
mod compression_pack_simd;
|
||||
pub use self::compression_pack_simd::*;
|
||||
}
|
||||
|
||||
pub use self::pack::{BlockEncoder, BlockDecoder};
|
||||
|
||||
#[cfg( any(not(feature="simdcompression"), target_env="msvc") )]
|
||||
mod vint {
|
||||
mod compression_vint_nosimd;
|
||||
pub use self::compression_vint_nosimd::*;
|
||||
}
|
||||
|
||||
#[cfg( all(feature="simdcompression", not(target_env="msvc")) )]
|
||||
mod vint {
|
||||
mod compression_vint_simd;
|
||||
pub use self::compression_vint_simd::*;
|
||||
}
|
||||
|
||||
|
||||
pub trait VIntEncoder {
|
||||
@@ -26,51 +42,16 @@ pub trait VIntDecoder {
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> &'a [u8];
|
||||
}
|
||||
|
||||
impl VIntEncoder for BlockEncoder{
|
||||
impl VIntEncoder for BlockEncoder {
|
||||
|
||||
fn compress_vint_sorted(&mut self, input: &[u32], mut offset: u32) -> &[u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v - offset;
|
||||
offset = v;
|
||||
loop {
|
||||
let next_byte: u8 = (to_encode % 128u32) as u8;
|
||||
to_encode /= 128u32;
|
||||
if to_encode == 0u32 {
|
||||
self.output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
self.output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
&self.output[..byte_written]
|
||||
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8] {
|
||||
vint::compress_sorted(input, &mut self.output, offset)
|
||||
}
|
||||
|
||||
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v;
|
||||
loop {
|
||||
let next_byte: u8 = (to_encode % 128u32) as u8;
|
||||
to_encode /= 128u32;
|
||||
if to_encode == 0u32 {
|
||||
self.output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
self.output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
&self.output[..byte_written]
|
||||
vint::compress_unsorted(input, &mut self.output)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VIntDecoder for BlockDecoder {
|
||||
|
||||
@@ -79,52 +60,19 @@ impl VIntDecoder for BlockDecoder {
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize) -> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
for i in 0..num_els {
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
result += ((cur_byte % 128u8) as u32) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
self.output[i] = result;
|
||||
}
|
||||
self.output_len = num_els;
|
||||
&compressed_data[read_byte..]
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
num_els: usize) -> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
for i in 0..num_els {
|
||||
let mut result = 0u32;
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
result += ((cur_byte % 128u8) as u32) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
self.output[i] = result;
|
||||
}
|
||||
self.output_len = num_els;
|
||||
&compressed_data[read_byte..]
|
||||
}
|
||||
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize.
|
||||
|
||||
@@ -224,7 +172,7 @@ pub mod tests {
|
||||
#[test]
|
||||
fn test_encode_vint() {
|
||||
{
|
||||
let expected_length = 123;
|
||||
let expected_length = 154;
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let input: Vec<u32> = (0u32..123u32)
|
||||
.map(|i| 4 + i * 7 / 2)
|
||||
@@ -232,23 +180,13 @@ pub mod tests {
|
||||
.collect();
|
||||
for offset in &[0u32, 1u32, 2u32] {
|
||||
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
assert!(encoded_data.len() <= expected_length);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let remaining_data = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
|
||||
assert_eq!(0, remaining_data.len());
|
||||
assert_eq!(input, decoder.output_array());
|
||||
}
|
||||
}
|
||||
{
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let input = vec!(3u32, 17u32, 187u32);
|
||||
let encoded_data = encoder.compress_vint_sorted(&input, 0);
|
||||
assert_eq!(encoded_data.len(), 4);
|
||||
assert_eq!(encoded_data[0], 3u8 + 128u8);
|
||||
assert_eq!(encoded_data[1], (17u8 - 3u8) + 128u8);
|
||||
assert_eq!(encoded_data[2], (187u8 - 17u8 - 128u8));
|
||||
assert_eq!(encoded_data[3], (1u8 + 128u8));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -272,4 +210,27 @@ pub mod tests {
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
const NUM_INTS_BENCH_VINT: usize = 10;
|
||||
|
||||
#[bench]
|
||||
fn bench_compress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
b.iter(|| {
|
||||
encoder.compress_vint_sorted(&data, 0u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_uncompress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let compressed = encoder.compress_vint_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use common::bitpacker::compute_num_bits;
|
||||
use common::bitpacker::{BitPacker, BitUnpacker};
|
||||
use std::cmp;
|
||||
use std::io::Write;
|
||||
use super::NUM_DOCS_PER_BLOCK;
|
||||
use super::super::NUM_DOCS_PER_BLOCK;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::NUM_DOCS_PER_BLOCK;
|
||||
use super::super::NUM_DOCS_PER_BLOCK;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
|
||||
92
src/compression/vint/compression_vint_nosimd.rs
Normal file
@@ -0,0 +1,92 @@
|
||||
|
||||
#[inline(always)]
|
||||
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v - offset;
|
||||
offset = v;
|
||||
loop {
|
||||
let next_byte: u8 = (to_encode % 128u32) as u8;
|
||||
to_encode /= 128u32;
|
||||
if to_encode == 0u32 {
|
||||
output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
&output[..byte_written]
|
||||
}
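An illustrative restatement (Rust, not part of the patch, the function name is made up) of the byte layout produced by compress_sorted above: each delta is written 7 bits at a time, least-significant group first, and the high bit is set on the last byte of a value, which is the opposite of the usual LEB128 continuation convention:

fn vint_bytes(mut val: u32) -> Vec<u8> {
    let mut bytes = Vec::new();
    loop {
        let b = (val % 128) as u8;
        val /= 128;
        if val == 0 {
            bytes.push(b | 128); // high bit marks the *last* byte of the value
            return bytes;
        }
        bytes.push(b);
    }
}

fn main() {
    assert_eq!(vint_bytes(3), vec![3u8 | 128]);
    assert_eq!(vint_bytes(170), vec![42u8, 129u8]); // 42 + (1 << 7) = 170
}

The second assertion mirrors the delta 187 - 17 = 170 checked in the test_encode_vint test further down.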
|
||||
|
||||
#[inline(always)]
|
||||
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v;
|
||||
loop {
|
||||
let next_byte: u8 = (to_encode % 128u32) as u8;
|
||||
to_encode /= 128u32;
|
||||
if to_encode == 0u32 {
|
||||
output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
&output[..byte_written]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32) -> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
let num_els = output.len();
|
||||
for i in 0..num_els {
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
result += ((cur_byte % 128u8) as u32) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
output[i] = result;
|
||||
}
|
||||
&compressed_data[read_byte..]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_unsorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32]) -> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
let num_els = output.len();
|
||||
for i in 0..num_els {
|
||||
let mut result = 0u32;
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
result += ((cur_byte % 128u8) as u32) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
output[i] = result;
|
||||
}
|
||||
&compressed_data[read_byte..]
|
||||
}
|
||||
82
src/compression/vint/compression_vint_simd.rs
Normal file
@@ -0,0 +1,82 @@
|
||||
|
||||
mod streamvbyte {
|
||||
|
||||
use libc::size_t;
|
||||
|
||||
extern {
|
||||
pub fn streamvbyte_delta_encode(
|
||||
data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8,
|
||||
offset: u32) -> size_t;
|
||||
|
||||
pub fn streamvbyte_delta_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: u32,
|
||||
offset: u32) -> size_t;
|
||||
|
||||
pub fn streamvbyte_encode(
|
||||
data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8) -> size_t;
|
||||
|
||||
pub fn streamvbyte_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: usize) -> size_t;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[inline(always)]
|
||||
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_delta_encode(
|
||||
input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr(),
|
||||
offset)
|
||||
};
|
||||
&output[..compress_length]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_encode(
|
||||
input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr())
|
||||
};
|
||||
&output[..compress_length]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32) -> &'a [u8] {
|
||||
let consumed_bytes = unsafe {
|
||||
streamvbyte::streamvbyte_delta_decode(
|
||||
compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len() as u32,
|
||||
offset)
|
||||
};
|
||||
&compressed_data[consumed_bytes..]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_unsorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32]) -> &'a [u8] {
|
||||
let consumed_bytes = unsafe {
|
||||
streamvbyte::streamvbyte_decode(
|
||||
compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len())
|
||||
};
|
||||
&compressed_data[consumed_bytes..]
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use Result;
|
||||
use Error;
|
||||
use serde_json;
|
||||
use schema::Schema;
|
||||
use std::sync::Arc;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
use rustc_serialize::json;
|
||||
use core::SegmentId;
|
||||
use directory::{Directory, MmapDirectory, RAMDirectory};
|
||||
use indexer::index_writer::open_index_writer;
|
||||
@@ -29,7 +29,7 @@ const NUM_SEARCHERS: usize = 12;
|
||||
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
|
||||
let meta_data = directory.atomic_read(&META_FILEPATH)?;
|
||||
let meta_string = String::from_utf8_lossy(&meta_data);
|
||||
json::decode(&meta_string)
|
||||
serde_json::from_str(&meta_string)
|
||||
.map_err(|e| Error::CorruptedFile(META_FILEPATH.clone(), Box::new(e)))
|
||||
}
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ use core::SegmentMeta;
|
||||
/// * the index docstamp
|
||||
/// * the schema
|
||||
///
|
||||
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
|
||||
#[derive(Clone,Debug,Serialize, Deserialize)]
|
||||
pub struct IndexMeta {
|
||||
pub segments: Vec<SegmentMeta>,
|
||||
pub schema: Schema,
|
||||
|
||||
@@ -68,8 +68,8 @@ impl Searcher {
|
||||
}
|
||||
|
||||
/// Returns the segment_reader associated with the given segment_ordinal
|
||||
pub fn segment_reader(&self, segment_ord: usize) -> &SegmentReader {
|
||||
&self.segment_readers[segment_ord]
|
||||
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
|
||||
&self.segment_readers[segment_ord as usize]
|
||||
}
|
||||
|
||||
/// Runs a query on the segment readers wrapped by the searcher
|
||||
@@ -78,6 +78,7 @@ impl Searcher {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl From<Vec<SegmentReader>> for Searcher {
|
||||
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher {
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use uuid::Uuid;
|
||||
use std::fmt;
|
||||
use rustc_serialize::{Encoder, Decoder, Encodable, Decodable};
|
||||
use std::cmp::{Ordering, Ord};
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -14,7 +13,7 @@ use std::sync::atomic;
|
||||
///
|
||||
/// In unit tests, for reproducibility, SegmentIds are
|
||||
/// simply generated in an autoincrement fashion.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct SegmentId(Uuid);
|
||||
|
||||
|
||||
@@ -65,18 +64,6 @@ impl SegmentId {
|
||||
}
|
||||
}
|
||||
|
||||
impl Encodable for SegmentId {
|
||||
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
|
||||
self.0.encode(s)
|
||||
}
|
||||
}
|
||||
|
||||
impl Decodable for SegmentId {
|
||||
fn decode<D: Decoder>(d: &mut D) -> Result<Self, D::Error> {
|
||||
Uuid::decode(d).map(SegmentId)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "Seg({:?})", self.short_uuid_string())
|
||||
|
||||
@@ -3,7 +3,7 @@ use super::SegmentComponent;
use std::path::PathBuf;
use std::collections::HashSet;

#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
#[derive(Clone, Debug, Serialize, Deserialize)]
struct DeleteMeta {
    num_deleted_docs: u32,
    opstamp: u64,
@@ -13,7 +13,7 @@ struct DeleteMeta {
///
/// For instance, the number of docs it contains,
/// how many are deleted, etc.
#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SegmentMeta {
    segment_id: SegmentId,
    max_doc: u32,

@@ -5,25 +5,29 @@ use core::SegmentComponent;
|
||||
use schema::Term;
|
||||
use common::HasLen;
|
||||
use core::SegmentMeta;
|
||||
use fastfield::delete::DeleteBitSet;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::DeleteBitSet;
|
||||
use postings::BlockSegmentPostings;
|
||||
use store::StoreReader;
|
||||
use schema::Document;
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use std::str;
|
||||
use postings::TermInfo;
|
||||
use datastruct::FstMap;
|
||||
use datastruct::TermDictionary;
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
use schema::Field;
|
||||
use postings::SegmentPostingsOption;
|
||||
use postings::SegmentPostings;
|
||||
use fastfield::{U32FastFieldsReader, U32FastFieldReader};
|
||||
use fastfield::{FastFieldsReader, FastFieldReader, U64FastFieldReader};
|
||||
use schema::Schema;
|
||||
use schema::FieldType;
|
||||
use postings::FreqHandler;
|
||||
use schema::TextIndexingOptions;
|
||||
|
||||
|
||||
|
||||
/// Entry point to access all of the datastructures of the `Segment`
|
||||
///
|
||||
/// - term dictionary
|
||||
@@ -39,11 +43,11 @@ use schema::TextIndexingOptions;
|
||||
pub struct SegmentReader {
|
||||
segment_id: SegmentId,
|
||||
segment_meta: SegmentMeta,
|
||||
term_infos: Arc<FstMap<TermInfo>>,
|
||||
term_infos: Arc<TermDictionary<TermInfo>>,
|
||||
postings_data: ReadOnlySource,
|
||||
store_reader: StoreReader,
|
||||
fast_fields_reader: Arc<U32FastFieldsReader>,
|
||||
fieldnorms_reader: Arc<U32FastFieldsReader>,
|
||||
fast_fields_reader: Arc<FastFieldsReader>,
|
||||
fieldnorms_reader: Arc<FastFieldsReader>,
|
||||
delete_bitset: DeleteBitSet,
|
||||
positions_data: ReadOnlySource,
|
||||
schema: Schema,
|
||||
@@ -58,6 +62,11 @@ impl SegmentReader {
|
||||
self.segment_meta.max_doc()
|
||||
}
|
||||
|
||||
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
/// Returns the number of documents.
|
||||
/// Deleted documents are not counted.
|
||||
///
|
||||
@@ -73,31 +82,32 @@ impl SegmentReader {
|
||||
self.delete_bitset.len() as DocId
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn fast_fields_reader(&self) -> &FastFieldsReader {
|
||||
&*self.fast_fields_reader
|
||||
}
|
||||
|
||||
/// Accessor to a segment's fast field reader given a field.
|
||||
pub fn get_fast_field_reader(&self, field: Field) -> Option<U32FastFieldReader> {
|
||||
/// Returns the u32 fast value reader if the field
|
||||
/// is a u32 field indexed as "fast".
|
||||
///
|
||||
/// Return None if the field is not a u32 field
|
||||
/// indexed with the fast option.
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
///
|
||||
/// Returns the u64 fast value reader if the field
|
||||
/// is a u64 field indexed as "fast".
|
||||
///
|
||||
/// Return a FastFieldNotAvailableError if the field is not
|
||||
/// declared as a fast field in the schema.
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn get_fast_field_reader<TFastFieldReader: FastFieldReader>(&self, field: Field) -> fastfield::Result<TFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match field_entry.field_type() {
|
||||
&FieldType::Str(_) => {
|
||||
warn!("Field <{}> is not a fast field. It is a text field, and fast text fields are not supported yet.", field_entry.name());
|
||||
None
|
||||
},
|
||||
&FieldType::U32(ref u32_options) => {
|
||||
if u32_options.is_fast() {
|
||||
self.fast_fields_reader.get_field(field)
|
||||
}
|
||||
else {
|
||||
warn!("Field <{}> is not defined as a fast field.", field_entry.name());
|
||||
None
|
||||
}
|
||||
},
|
||||
if !TFastFieldReader::is_enabled(field_entry.field_type()) {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
else {
|
||||
Ok(
|
||||
self.fast_fields_reader
|
||||
.open_reader(field)
|
||||
.expect("Fast field file corrupted.")
|
||||
)
|
||||
}
|
||||
}
|
||||
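The accessor is now generic over the reader type and returns a `fastfield::Result` instead of an `Option`. A hypothetical call site (the `segment_reader`, `field` and `doc_id` variables are assumptions):

    let ff_reader: U64FastFieldReader = segment_reader
        .get_fast_field_reader(field)
        .expect("field must be declared as FAST in the schema");
    let value: u64 = ff_reader.get(doc_id);
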
|
||||
@@ -108,8 +118,8 @@ impl SegmentReader {
|
||||
///
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U32FastFieldReader> {
|
||||
self.fieldnorms_reader.get_field(field)
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U64FastFieldReader> {
|
||||
self.fieldnorms_reader.open_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
@@ -129,15 +139,16 @@ impl SegmentReader {
|
||||
pub fn open(segment: Segment) -> Result<SegmentReader> {
|
||||
|
||||
let source = try!(segment.open_read(SegmentComponent::TERMS));
|
||||
let term_infos = try!(FstMap::from_source(source));
|
||||
let term_infos = try!(TermDictionary::from_source(source));
|
||||
let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE)));
|
||||
let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS));
|
||||
|
||||
let fast_field_data = try!(segment.open_read(SegmentComponent::FASTFIELDS));
|
||||
let fast_fields_reader = try!(U32FastFieldsReader::open(fast_field_data));
|
||||
|
||||
let fast_fields_reader = try!(FastFieldsReader::open(fast_field_data));
|
||||
|
||||
let fieldnorms_data = try!(segment.open_read(SegmentComponent::FIELDNORMS));
|
||||
let fieldnorms_reader = try!(U32FastFieldsReader::open(fieldnorms_data));
|
||||
let fieldnorms_reader = try!(FastFieldsReader::open(fieldnorms_data));
|
||||
|
||||
let positions_data = segment
|
||||
.open_read(SegmentComponent::POSITIONS)
|
||||
@@ -168,7 +179,7 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
/// Return the term dictionary datastructure.
|
||||
pub fn term_infos(&self) -> &FstMap<TermInfo> {
|
||||
pub fn term_infos(&self) -> &TermDictionary<TermInfo> {
|
||||
&self.term_infos
|
||||
}
|
||||
|
||||
@@ -181,16 +192,29 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
///
|
||||
/// If the field was not indexed with the indexing options that cover
|
||||
/// the requested options, the method does not fail
|
||||
/// and returns a `SegmentPostings` with as much information as possible.
|
||||
///
|
||||
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
|
||||
/// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
|
||||
pub fn postings_data(&self, offset: usize) -> &[u8] {
|
||||
&self.postings_data[offset..]
|
||||
}
|
||||
|
||||
pub fn get_block_postings(&self) -> BlockSegmentPostings {
|
||||
BlockSegmentPostings::from_data(0, &self.postings_data[..], FreqHandler::new_without_freq())
|
||||
}
|
||||
|
||||
pub fn read_block_postings_from_terminfo(&self, term_info: &TermInfo, field_type: &FieldType) -> Option<BlockSegmentPostings> {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = &self.postings_data[offset..];
|
||||
let freq_handler = match *field_type {
|
||||
FieldType::Str(_) => {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
_ => {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
};
|
||||
Some(BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler))
|
||||
}
|
||||
|
||||
pub fn read_block_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<BlockSegmentPostings> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let term_info = get!(self.get_term_info(&term));
|
||||
@@ -230,7 +254,23 @@ impl SegmentReader {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
};
|
||||
Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, &self.delete_bitset, freq_handler))
|
||||
Some(BlockSegmentPostings::from_data(term_info.doc_freq as usize, postings_data, freq_handler))
|
||||
}
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
///
|
||||
/// If the field was not indexed with the indexing options that cover
|
||||
/// the requested options, the method does not fail
|
||||
/// and returns a `SegmentPostings` with as much information as possible.
|
||||
///
|
||||
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
|
||||
/// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
|
||||
self.read_block_postings(term, option)
|
||||
.map(|block_postings| {
|
||||
SegmentPostings::from_block_postings(block_postings, self.delete_bitset.clone())
|
||||
})
|
||||
}
|
||||
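`read_postings` is now a thin wrapper that attaches a clone of the delete bitset to the block postings. A hypothetical call site (the `term` and `segment_reader` variables are assumptions):

    if let Some(postings) = segment_reader.read_postings(&term, SegmentPostingsOption::FreqAndPositions) {
        // iterate over the matching DocIds; deleted documents are skipped
    }
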
|
||||
|
||||
@@ -249,7 +289,7 @@ impl SegmentReader {
|
||||
_ => SegmentPostingsOption::NoFreq,
|
||||
}
|
||||
}
|
||||
FieldType::U32(_) => SegmentPostingsOption::NoFreq
|
||||
FieldType::U64(_) | FieldType::I64(_) => SegmentPostingsOption::NoFreq
|
||||
};
|
||||
self.read_postings(term, segment_posting_option)
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use fst::Streamer;
|
||||
use std::mem;
|
||||
use std::collections::BinaryHeap;
|
||||
use fst::map::Keys;
|
||||
use postings::TermInfo;
|
||||
use datastruct::TermDictionaryStreamer;
|
||||
use schema::Field;
|
||||
use schema::Term;
|
||||
use core::SegmentReader;
|
||||
@@ -34,7 +35,7 @@ impl Ord for HeapItem {
|
||||
/// - a slice with the ordinal of the segments containing
|
||||
/// the terms.
|
||||
pub struct TermIterator<'a> {
|
||||
key_streams: Vec<Keys<'a>>,
|
||||
key_streams: Vec<TermDictionaryStreamer<'a, TermInfo>>,
|
||||
heap: BinaryHeap<HeapItem>,
|
||||
// Buffer hosting the list of segment ordinals containing
|
||||
// the current term.
|
||||
@@ -43,7 +44,7 @@ pub struct TermIterator<'a> {
|
||||
}
|
||||
|
||||
impl<'a> TermIterator<'a> {
|
||||
fn new(key_streams: Vec<Keys<'a>>) -> TermIterator<'a> {
|
||||
fn new(key_streams: Vec<TermDictionaryStreamer<'a, TermInfo>>) -> TermIterator<'a> {
|
||||
let key_streams_len = key_streams.len();
|
||||
TermIterator {
|
||||
key_streams: key_streams,
|
||||
@@ -98,7 +99,7 @@ impl<'a> TermIterator<'a> {
|
||||
|
||||
fn advance_segments(&mut self) {
|
||||
for segment_ord in self.current_segment_ords.drain(..) {
|
||||
if let Some(term) = self.key_streams[segment_ord].next() {
|
||||
if let Some((term, _val)) = self.key_streams[segment_ord].next() {
|
||||
self.heap.push(HeapItem {
|
||||
term: Term::from_bytes(term),
|
||||
segment_ord: segment_ord,
|
||||
@@ -126,7 +127,7 @@ impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> {
|
||||
TermIterator::new(
|
||||
segment_readers
|
||||
.iter()
|
||||
.map(|reader| reader.term_infos().keys())
|
||||
.map(|reader| reader.term_infos().stream())
|
||||
.collect()
|
||||
)
|
||||
}
|
||||
@@ -175,9 +176,7 @@ mod tests {
|
||||
let mut term_it = searcher.terms();
|
||||
let mut terms = String::new();
|
||||
while let Some(term) = term_it.next() {
|
||||
unsafe {
|
||||
terms.push_str(term.text());
|
||||
}
|
||||
terms.push_str(term.text());
|
||||
}
|
||||
assert_eq!(terms, "abcdef");
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
|
||||
fn convert_fst_error(e: fst::Error) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
}
|
||||
@@ -92,6 +93,10 @@ impl<V: BinarySerializable> FstMap<V> {
|
||||
self.fst_index.keys()
|
||||
}
|
||||
|
||||
pub fn fst_index(&self) -> &fst::Map {
|
||||
&self.fst_index
|
||||
}
|
||||
|
||||
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
|
||||
let total_len = source.len();
|
||||
let length_offset = total_len - 4;
|
||||
@@ -107,8 +112,8 @@ impl<V: BinarySerializable> FstMap<V> {
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
fn read_value(&self, offset: u64) -> V {
|
||||
|
||||
pub fn read_value(&self, offset: u64) -> V {
|
||||
let buffer = self.values_mmap.as_slice();
|
||||
let mut cursor = &buffer[(offset as usize)..];
|
||||
V::deserialize(&mut cursor).expect("Data in FST is corrupted")
|
||||
|
||||
@@ -1,7 +1,15 @@
mod fstmap;
mod skip;
pub mod stacker;
mod stream_dictionary;

//pub use self::fstmap::FstMapBuilder as TermDictionaryBuilder;
//pub use self::fstmap::FstMap as TermDictionary;

pub use self::stream_dictionary::StreamDictionaryBuilder as TermDictionaryBuilder;
pub use self::stream_dictionary::StreamDictionary as TermDictionary;
pub use self::stream_dictionary::StreamDictionaryStreamer as TermDictionaryStreamer;

pub use self::fstmap::FstMapBuilder;
pub use self::fstmap::FstMap;
pub use self::skip::{SkipListBuilder, SkipList};

@@ -1,10 +1,6 @@
|
||||
use std::iter;
|
||||
use std::marker::PhantomData;
|
||||
use super::heap::{Heap, HeapAllocable, BytesRef};
|
||||
|
||||
|
||||
|
||||
|
||||
/// djb2 hash function
|
||||
fn djb2(key: &[u8]) -> u64 {
|
||||
let mut state: u64 = 5381;
|
||||
@@ -57,17 +53,40 @@ pub enum Entry {
|
||||
/// the computation of the hash of the key twice,
|
||||
/// or copying the key as long as there is no insert.
|
||||
///
|
||||
pub struct HashMap<'a, V> where V: HeapAllocable {
|
||||
pub struct HashMap<'a> {
|
||||
table: Box<[KeyValue]>,
|
||||
heap: &'a Heap,
|
||||
_phantom: PhantomData<V>,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
}
|
||||
|
||||
impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
struct QuadraticProbing {
|
||||
hash: usize,
|
||||
i: usize,
|
||||
mask: usize,
|
||||
}
|
||||
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> {
|
||||
impl QuadraticProbing {
|
||||
fn compute(key: &[u8], mask: usize) -> QuadraticProbing {
|
||||
let hash = djb2(key) as usize;
|
||||
QuadraticProbing {
|
||||
hash: hash,
|
||||
i: 0,
|
||||
mask: mask,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> usize {
|
||||
self.i += 1;
|
||||
(self.hash + self.i * self.i) & self.mask
|
||||
}
|
||||
}
|
||||
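The map thereby moves from linear probing (the `bucket = (bucket + 1) & mask` line removed further down) to the quadratic sequence above. A tiny standalone sketch of the buckets `QuadraticProbing::next()` visits, for illustration only:

    fn probe_sequence(hash: usize, mask: usize, steps: usize) -> Vec<usize> {
        // bucket_i = (hash + i * i) & mask, starting at i = 1 as in next()
        (1..steps + 1).map(|i| (hash + i * i) & mask).collect()
    }
    // probe_sequence(5, 7, 3) == vec![6, 1, 6]
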
|
||||
|
||||
impl<'a> HashMap<'a> {
|
||||
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a> {
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
|
||||
.take(table_size)
|
||||
@@ -75,16 +94,17 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
HashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
heap: heap,
|
||||
_phantom: PhantomData,
|
||||
mask: table_size - 1,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn bucket(&self, key: &[u8]) -> usize {
|
||||
let hash: u64 = djb2(key);
|
||||
(hash as usize) & self.mask
|
||||
fn probe(&self, key: &[u8]) -> QuadraticProbing {
|
||||
QuadraticProbing::compute(key, self.mask)
|
||||
}
|
||||
|
||||
pub fn is_saturated(&self) -> bool {
|
||||
self.table.len() < self.occupied.len() * 5
|
||||
}
|
||||
|
||||
fn get_key(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
@@ -100,7 +120,7 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
addr
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self,) -> impl Iterator<Item=(&'a [u8], (u32, &'a V))> + 'b {
|
||||
pub fn iter<'b: 'a>(&'b self,) -> impl Iterator<Item=(&'a [u8], u32)> + 'b {
|
||||
let heap: &'a Heap = self.heap;
|
||||
let table: &'b [KeyValue] = &self.table;
|
||||
self.occupied
|
||||
@@ -109,23 +129,11 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
.map(move |bucket: usize| {
|
||||
let kv = table[bucket];
|
||||
let addr = kv.value_addr;
|
||||
let v: &V = heap.get_mut_ref::<V>(addr);
|
||||
(heap.get_slice(kv.key), (addr, v))
|
||||
(heap.get_slice(kv.key), addr)
|
||||
})
|
||||
// .map(move |addr: u32| (heap.get_mut_ref::<V>(addr)) )
|
||||
}
|
||||
|
||||
pub fn values_mut<'b: 'a>(&'b self,) -> impl Iterator<Item=&'a mut V> + 'b {
|
||||
let heap: &'a Heap = self.heap;
|
||||
let table: &'b [KeyValue] = &self.table;
|
||||
self.occupied
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(move |bucket: usize| table[bucket].value_addr)
|
||||
.map(move |addr: u32| heap.get_mut_ref::<V>(addr))
|
||||
}
|
||||
|
||||
pub fn get_or_create<S: AsRef<[u8]>>(&mut self, key: S) -> &mut V {
|
||||
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(&mut self, key: S) -> &mut V {
|
||||
let entry = self.lookup(key.as_ref());
|
||||
match entry {
|
||||
Entry::Occupied(addr) => {
|
||||
@@ -141,8 +149,9 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
|
||||
pub fn lookup<S: AsRef<[u8]>>(&self, key: S) -> Entry {
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let mut bucket = self.bucket(key_bytes);
|
||||
let mut probe = self.probe(key_bytes);
|
||||
loop {
|
||||
let bucket = probe.next();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
return Entry::Vacant(bucket);
|
||||
@@ -150,7 +159,6 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
if self.get_key(kv.key) == key_bytes {
|
||||
return Entry::Occupied(kv.value_addr);
|
||||
}
|
||||
bucket = (bucket + 1) & self.mask;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -183,14 +191,11 @@ mod tests {
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let heap = Heap::with_capacity(2_000_000);
|
||||
let mut hash_map: HashMap<TestValue> = HashMap::new(18, &heap);
|
||||
let mut hash_map: HashMap = HashMap::new(18, &heap);
|
||||
{
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc");
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 3u32;
|
||||
|
||||
}
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd");
|
||||
@@ -205,10 +210,18 @@ mod tests {
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd");
|
||||
assert_eq!(v.val, 4u32);
|
||||
}
|
||||
let mut iter_values = hash_map.values_mut();
|
||||
assert_eq!(iter_values.next().unwrap().val, 3u32);
|
||||
assert_eq!(iter_values.next().unwrap().val, 4u32);
|
||||
assert!(!iter_values.next().is_some());
|
||||
let mut iter_values = hash_map.iter();
|
||||
{
|
||||
let (_, addr) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 3u32);
|
||||
}
|
||||
{
|
||||
let (_, addr) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 4u32);
|
||||
}
|
||||
assert!(iter_values.next().is_none());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use std::cell::UnsafeCell;
|
||||
use std::mem;
|
||||
use common::allocate_vec;
|
||||
use std::ptr;
|
||||
|
||||
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
|
||||
@@ -46,11 +45,6 @@ impl Heap {
|
||||
pub fn capacity(&self,) -> u32 {
|
||||
self.inner().capacity()
|
||||
}
|
||||
|
||||
/// Return the amount of memory that has been allocated so far.
|
||||
pub fn len(&self,) -> u32 {
|
||||
self.inner().len()
|
||||
}
|
||||
|
||||
/// Return amount of free space, in bytes.
|
||||
pub fn num_free_bytes(&self,) -> u32 {
|
||||
@@ -86,8 +80,14 @@ impl Heap {
|
||||
pub fn set<Item>(&self, addr: u32, val: &Item) {
|
||||
self.inner().set(addr, val);
|
||||
}
|
||||
|
||||
/// Returns a mutable reference for an object at a given Item.
|
||||
|
||||
/// Returns a reference to an `Item` at a given `addr`.
|
||||
#[cfg(test)]
|
||||
pub fn get_ref<Item>(&self, addr: u32) -> &Item {
|
||||
self.inner().get_mut_ref(addr)
|
||||
}
|
||||
|
||||
/// Returns a mutable reference to an `Item` at a given `addr`.
|
||||
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.inner().get_mut_ref(addr)
|
||||
}
|
||||
@@ -100,81 +100,58 @@ impl Heap {
|
||||
|
||||
struct InnerHeap {
|
||||
buffer: Vec<u8>,
|
||||
buffer_len: u32,
|
||||
used: u32,
|
||||
next_heap: Option<Box<InnerHeap>>,
|
||||
has_been_resized: bool,
|
||||
}
|
||||
|
||||
|
||||
impl InnerHeap {
|
||||
|
||||
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
|
||||
let buffer: Vec<u8> = allocate_vec(num_bytes);
|
||||
let buffer: Vec<u8> = vec![0u8; num_bytes];
|
||||
InnerHeap {
|
||||
buffer: buffer,
|
||||
buffer_len: num_bytes as u32,
|
||||
next_heap: None,
|
||||
used: 0u32,
|
||||
has_been_resized: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.used = 0u32;
|
||||
self.next_heap = None;
|
||||
}
|
||||
|
||||
pub fn capacity(&self,) -> u32 {
|
||||
self.buffer.len() as u32
|
||||
}
|
||||
|
||||
pub fn len(&self,) -> u32 {
|
||||
self.used
|
||||
}
|
||||
|
||||
// Returns the number of free bytes. If the buffer
|
||||
// has reached it's capacity and overflowed to another buffer, return 0.
|
||||
pub fn num_free_bytes(&self,) -> u32 {
|
||||
if self.next_heap.is_some() {
|
||||
if self.has_been_resized {
|
||||
0u32
|
||||
}
|
||||
else {
|
||||
self.buffer_len - self.used
|
||||
(self.buffer.len() as u32) - self.used
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
|
||||
let addr = self.used;
|
||||
self.used += num_bytes as u32;
|
||||
if self.used <= self.buffer_len {
|
||||
addr
|
||||
let buffer_len = self.buffer.len();
|
||||
if self.used > buffer_len as u32 {
|
||||
self.buffer.resize(buffer_len * 2, 0u8);
|
||||
self.has_been_resized = true
|
||||
}
|
||||
else {
|
||||
if self.next_heap.is_none() {
|
||||
warn!("Exceeded heap size. The margin was apparently unsufficient. The segment will be committed right after indexing this very last document.");
|
||||
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
|
||||
}
|
||||
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
|
||||
}
|
||||
|
||||
|
||||
addr
|
||||
}
|
||||
|
||||
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap.as_ref().unwrap().get_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
&self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
&self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
|
||||
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
&mut self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
&mut self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
|
||||
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
|
||||
@@ -188,40 +165,23 @@ impl InnerHeap {
|
||||
}
|
||||
|
||||
fn get_mut(&mut self, addr: u32) -> *mut u8 {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().get_mut(addr - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
let addr_isize = addr as isize;
|
||||
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
|
||||
}
|
||||
let addr_isize = addr as isize;
|
||||
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
|
||||
}
|
||||
|
||||
|
||||
|
||||
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
|
||||
let v_ptr = v_ptr_u8 as *mut Item;
|
||||
unsafe { &mut *v_ptr }
|
||||
}
|
||||
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
|
||||
let v_ptr = v_ptr_u8 as *mut Item;
|
||||
unsafe { &mut *v_ptr }
|
||||
}
|
||||
|
||||
fn set<Item>(&mut self, addr: u32, val: &Item) {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().set(addr - self.buffer_len, val);
|
||||
}
|
||||
else {
|
||||
let v_ptr: *const Item = val as *const Item;
|
||||
let v_ptr_u8: *const u8 = v_ptr as *const u8;
|
||||
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
|
||||
unsafe {
|
||||
let dest_ptr: *mut u8 = self.get_mut(addr);
|
||||
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
|
||||
}
|
||||
let v_ptr: *const Item = val as *const Item;
|
||||
let v_ptr_u8: *const u8 = v_ptr as *const u8;
|
||||
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
|
||||
unsafe {
|
||||
let dest_ptr: *mut u8 = self.get_mut(addr);
|
||||
ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -18,10 +18,10 @@ fn test_unrolled_linked_list() {
|
||||
ks.push(2);
|
||||
ks.push(3);
|
||||
for k in (1..5).map(|k| k * 100) {
|
||||
let mut hashmap: HashMap<ExpUnrolledLinkedList> = HashMap::new(10, &heap);
|
||||
let mut hashmap: HashMap = HashMap::new(10, &heap);
|
||||
for j in 0..k {
|
||||
for i in 0..500 {
|
||||
let mut list = hashmap.get_or_create(i.to_string());
|
||||
let mut list: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string());
|
||||
list.push(i*j, &heap);
|
||||
}
|
||||
}
|
||||
|
||||
src/datastruct/stream_dictionary.rs (new file, 465 lines)
@@ -0,0 +1,465 @@
|
||||
#![allow(should_implement_trait)]
|
||||
|
||||
use std::cmp::max;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
use fst;
|
||||
use fst::raw::Fst;
|
||||
use common::VInt;
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use std::marker::PhantomData;
|
||||
use common::CountingWriter;
|
||||
use std::cmp::Ordering;
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use std::str;
|
||||
use fst::raw::Node;
|
||||
use fst::raw::CompiledAddr;
|
||||
|
||||
const BLOCK_SIZE: usize = 1024;
|
||||
|
||||
fn convert_fst_error(e: fst::Error) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
}
|
||||
|
||||
pub struct StreamDictionaryBuilder<W: Write, V: BinarySerializable + Clone + Default> {
|
||||
write: CountingWriter<W>,
|
||||
block_index: fst::MapBuilder<Vec<u8>>,
|
||||
last_key: Vec<u8>,
|
||||
len: usize,
|
||||
_phantom_: PhantomData<V>,
|
||||
}
|
||||
|
||||
fn common_prefix_length(left: &[u8], right: &[u8]) -> usize {
|
||||
left.iter().cloned()
|
||||
.zip(right.iter().cloned())
|
||||
.take_while(|&(b1, b2)| b1 == b2)
|
||||
.count()
|
||||
}
|
||||
|
||||
|
||||
|
||||
fn fill_last<'a>(fst: &'a Fst, mut node: Node<'a>, buffer: &mut Vec<u8>) {
|
||||
loop {
|
||||
if let Some(transition) = node.transitions().last() {
|
||||
buffer.push(transition.inp);
|
||||
node = fst.node(transition.addr);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn strictly_previous_key<B: AsRef<[u8]>>(fst_map: &fst::Map, key_as_ref: B) -> (Vec<u8>, u64) {
|
||||
let key = key_as_ref.as_ref();
|
||||
let fst = fst_map.as_fst();
|
||||
let mut node = fst.root();
|
||||
let mut node_stack: Vec<Node> = vec!(node.clone());
|
||||
|
||||
// first check the longest prefix.
|
||||
for &b in &key[..key.len() - 1] {
|
||||
node = match node.find_input(b) {
|
||||
None => {
|
||||
break;
|
||||
},
|
||||
Some(i) => {
|
||||
fst.node(node.transition_addr(i))
|
||||
},
|
||||
};
|
||||
node_stack.push(node);
|
||||
}
|
||||
|
||||
let len_node_stack = node_stack.len();
|
||||
for i in (1..len_node_stack).rev() {
|
||||
let cur_node = &node_stack[i];
|
||||
let b: u8 = key[i];
|
||||
let last_transition_opt = cur_node
|
||||
.transitions()
|
||||
.take_while(|transition| transition.inp < b)
|
||||
.last();
|
||||
|
||||
if let Some(last_transition) = last_transition_opt {
|
||||
let mut result_buffer = Vec::from(&key[..i]);
|
||||
result_buffer.push(last_transition.inp);
|
||||
let mut result = Vec::from(&key[..i]);
|
||||
result.push(last_transition.inp);
|
||||
let fork_node = fst.node(last_transition.addr);
|
||||
fill_last(fst, fork_node, &mut result);
|
||||
let val = fst_map.get(&result).unwrap();
|
||||
return (result, val);
|
||||
}
|
||||
else if cur_node.is_final() {
|
||||
// the previous key is a prefix
|
||||
let result_buffer = Vec::from(&key[..i]);
|
||||
let val = fst_map.get(&result_buffer).unwrap();
|
||||
return (result_buffer, val);
|
||||
}
|
||||
}
|
||||
|
||||
return (vec!(), 0);
|
||||
}
|
||||
|
||||
|
||||
impl<W: Write, V: BinarySerializable + Clone + Default> StreamDictionaryBuilder<W, V> {
|
||||
|
||||
pub fn new(write: W) -> io::Result<StreamDictionaryBuilder<W, V>> {
|
||||
let buffer: Vec<u8> = vec!();
|
||||
Ok(StreamDictionaryBuilder {
|
||||
write: CountingWriter::wrap(write),
|
||||
block_index: fst::MapBuilder::new(buffer)
|
||||
.expect("This cannot fail"),
|
||||
last_key: Vec::with_capacity(128),
|
||||
len: 0,
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
fn add_index_entry(&mut self) {
|
||||
self.block_index.insert(&self.last_key, self.write.written_bytes() as u64).unwrap();
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()>{
|
||||
self.insert_key(key)?;
|
||||
self.insert_value(value)
|
||||
}
|
||||
|
||||
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()>{
|
||||
if self.len % BLOCK_SIZE == 0 {
|
||||
self.add_index_entry();
|
||||
}
|
||||
self.len += 1;
|
||||
let common_len = common_prefix_length(key, &self.last_key);
|
||||
VInt(common_len as u64).serialize(&mut self.write)?;
|
||||
self.last_key.truncate(common_len);
|
||||
self.last_key.extend_from_slice(&key[common_len..]);
|
||||
VInt((key.len() - common_len) as u64).serialize(&mut self.write)?;
|
||||
self.write.write_all(&key[common_len..])?;
|
||||
Ok(())
|
||||
}
|
||||
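`insert_key` front-codes the sorted keys: each entry stores the length of the prefix shared with the previous key, then only the new suffix. A standalone re-implementation for illustration (not the actual writer, and values are omitted):

    fn front_code(keys: &[&[u8]]) -> Vec<(usize, Vec<u8>)> {
        let mut last: Vec<u8> = Vec::new();
        let mut out = Vec::new();
        for key in keys {
            let common = last.iter()
                .zip(key.iter())
                .take_while(|&(a, b)| a == b)
                .count();
            out.push((common, key[common..].to_vec()));
            last = key.to_vec();
        }
        out
    }
    // front_code(&[&b"doc000001"[..], &b"doc000002"[..], &b"doc000010"[..]])
    // -> common/suffix pairs (0, "doc000001"), (8, "2"), (7, "10")
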
|
||||
pub fn insert_value(&mut self, value: &V) -> io::Result<()>{
|
||||
value.serialize(&mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> io::Result<W> {
|
||||
self.add_index_entry();
|
||||
let (mut w, split_len) = self.write.finish()?;
|
||||
let fst_write = self.block_index
|
||||
.into_inner()
|
||||
.map_err(convert_fst_error)?;
|
||||
w.write(&fst_write)?;
|
||||
(split_len as u64).serialize(&mut w)?;
|
||||
w.flush()?;
|
||||
Ok(w)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
fn stream_before<'a, V: 'a + Clone + Default + BinarySerializable>(stream_dictionary: &'a StreamDictionary<V>, target_key: &[u8]) -> StreamDictionaryStreamer<'a, V> {
|
||||
let (prev_key, offset) = strictly_previous_key(&stream_dictionary.fst_index, target_key.as_ref());
|
||||
let offset: usize = offset as usize;
|
||||
StreamDictionaryStreamer {
|
||||
cursor: &stream_dictionary.stream_data.as_slice()[offset..],
|
||||
current_key: Vec::from(prev_key),
|
||||
current_value: V::default(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct StreamDictionary<V> where V:BinarySerializable + Default + Clone {
|
||||
stream_data: ReadOnlySource,
|
||||
fst_index: fst::Map,
|
||||
_phantom_: PhantomData<V>,
|
||||
}
|
||||
|
||||
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
|
||||
Ok(fst::Map::from(match source {
|
||||
ReadOnlySource::Anonymous(data) => try!(Fst::from_shared_bytes(data.data, data.start, data.len).map_err(convert_fst_error)),
|
||||
ReadOnlySource::Mmap(mmap_readonly) => try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)),
|
||||
}))
|
||||
}
|
||||
|
||||
|
||||
impl<V: BinarySerializable + Clone + Default> StreamDictionary<V> {
|
||||
|
||||
pub fn from_source(source: ReadOnlySource) -> io::Result<StreamDictionary<V>> {
|
||||
let total_len = source.len();
|
||||
let length_offset = total_len - 8;
|
||||
let split_len: usize = {
|
||||
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
|
||||
u64::deserialize(&mut split_len_buffer)? as usize
|
||||
};
|
||||
let stream_data = source.slice(0, split_len);
|
||||
let fst_data = source.slice(split_len, length_offset);
|
||||
let fst_index = open_fst_index(fst_data)?;
|
||||
|
||||
Ok(StreamDictionary {
|
||||
stream_data: stream_data,
|
||||
fst_index: fst_index,
|
||||
_phantom_: PhantomData
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get<K: AsRef<[u8]>>(&self, target_key: K) -> Option<V> {
|
||||
let mut streamer = stream_before(self, target_key.as_ref());
|
||||
while let Some((iter_key, iter_val)) = streamer.next() {
|
||||
match iter_key.cmp(target_key.as_ref()) {
|
||||
Ordering::Less => {}
|
||||
Ordering::Equal => {
|
||||
let val: V = (*iter_val).clone();
|
||||
return Some(val);
|
||||
}
|
||||
Ordering::Greater => {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
return None;
|
||||
}
|
||||
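`get` therefore performs an fst lookup to locate the enclosing block, then decodes forward until it reaches or passes the target key. A hypothetical lookup mirroring the unit test at the bottom of this file (assuming a dictionary built from `doc{:0>6}` keys):

    let dict: StreamDictionary<u32> = StreamDictionary::from_source(source)?;
    assert_eq!(dict.get(b"doc000123"), Some(123u32));
    assert_eq!(dict.get(b"missing-key"), None);
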
|
||||
pub fn range(&self) -> StreamDictionaryStreamerBuilder<V> {
|
||||
let data: &[u8] = &self.stream_data;
|
||||
StreamDictionaryStreamerBuilder {
|
||||
stream_dictionary: &self,
|
||||
offset_from: 0,
|
||||
offset_to: (data.as_ptr() as usize) + data.len(),
|
||||
current_key: vec!(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stream(&self) -> StreamDictionaryStreamer<V> {
|
||||
StreamDictionaryStreamer {
|
||||
cursor: &*self.stream_data,
|
||||
current_key: Vec::with_capacity(128),
|
||||
current_value: V::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StreamDictionaryStreamerBuilder<'a, V: 'a + BinarySerializable + Clone + Default> {
|
||||
stream_dictionary: &'a StreamDictionary<V>,
|
||||
offset_from: usize,
|
||||
offset_to: usize,
|
||||
current_key: Vec<u8>,
|
||||
}
|
||||
|
||||
|
||||
/// Returns offset information for the first
|
||||
/// key in the stream matching a given predicate.
|
||||
///
|
||||
/// returns (start offset, the data required to load the value)
|
||||
fn get_offset<'a, V, P: Fn(&[u8])->bool>(predicate: P, mut streamer: StreamDictionaryStreamer<V>) -> (usize, Vec<u8>)
|
||||
where V: 'a + BinarySerializable + Clone + Default {
|
||||
let mut prev: &[u8] = streamer.cursor;
|
||||
|
||||
let mut prev_data: Vec<u8> = streamer.current_key.clone();
|
||||
|
||||
while let Some((iter_key, _)) = streamer.next() {
|
||||
if !predicate(iter_key) {
|
||||
return (prev.as_ptr() as usize, prev_data);
|
||||
}
|
||||
prev = streamer.cursor;
|
||||
prev_data.clear();
|
||||
prev_data.extend_from_slice(iter_key);
|
||||
}
|
||||
return (prev.as_ptr() as usize, prev_data);
|
||||
}
|
||||
|
||||
impl<'a, V: 'a + BinarySerializable + Clone + Default> StreamDictionaryStreamerBuilder<'a, V> {
|
||||
pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> StreamDictionaryStreamerBuilder<'a, V> {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(&self.stream_dictionary, target_key.as_ref());
|
||||
let smaller_than = |k: &[u8]| { k.lt(target_key) };
|
||||
let (offset_before, current_key) = get_offset(smaller_than, streamer);
|
||||
self.current_key = current_key;
|
||||
self.offset_from = offset_before;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamDictionaryStreamerBuilder<'a, V> {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.stream_dictionary, target_key.as_ref());
|
||||
let smaller_than = |k: &[u8]| { k.le(target_key) };
|
||||
let (offset_before, current_key) = get_offset(smaller_than, streamer);
|
||||
self.current_key = current_key;
|
||||
self.offset_from = offset_before;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> StreamDictionaryStreamerBuilder<'a, V> {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.stream_dictionary, target_key.as_ref());
|
||||
let smaller_than = |k: &[u8]| { k.le(target_key) };
|
||||
let (offset_before, _) = get_offset(smaller_than, streamer);
|
||||
self.offset_to = offset_before;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> StreamDictionaryStreamerBuilder<'a, V> {
|
||||
let target_key = bound.as_ref();
|
||||
let streamer = stream_before(self.stream_dictionary, target_key.as_ref());
|
||||
let smaller_than = |k: &[u8]| { k.lt(target_key) };
|
||||
let (offset_before, _) = get_offset(smaller_than, streamer);
|
||||
self.offset_to = offset_before;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn into_stream(self) -> StreamDictionaryStreamer<'a, V> {
|
||||
let data: &[u8] = &self.stream_dictionary.stream_data.as_slice()[..];
|
||||
let origin = data.as_ptr() as usize;
|
||||
let start = self.offset_from - origin;
|
||||
let stop = max(self.offset_to - origin, start);
|
||||
StreamDictionaryStreamer {
|
||||
cursor: &data[start..stop],
|
||||
current_key: self.current_key,
|
||||
current_value: V::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StreamDictionaryStreamer<'a, V: BinarySerializable> {
|
||||
cursor: &'a [u8],
|
||||
current_key: Vec<u8>,
|
||||
current_value: V,
|
||||
}
|
||||
|
||||
impl<'a, V: BinarySerializable> StreamDictionaryStreamer<'a, V> {
|
||||
|
||||
pub fn next(&mut self) -> Option<(&[u8], &V)> {
|
||||
if self.cursor.len() == 0 {
|
||||
return None;
|
||||
}
|
||||
let common_length: usize = VInt::deserialize(&mut self.cursor).unwrap().0 as usize;
|
||||
let new_length: usize = common_length + VInt::deserialize(&mut self.cursor).unwrap().0 as usize;
|
||||
self.current_key.reserve(new_length);
|
||||
unsafe {
|
||||
self.current_key.set_len(new_length);
|
||||
}
|
||||
self.cursor.read_exact(&mut self.current_key[common_length..new_length]).unwrap();
|
||||
self.current_value = V::deserialize(&mut self.cursor).unwrap();
|
||||
Some((&self.current_key, &self.current_value))
|
||||
}
|
||||
|
||||
|
||||
pub fn key(&self) -> &[u8] {
|
||||
&self.current_key
|
||||
}
|
||||
|
||||
pub fn value(&self) -> &V {
|
||||
&self.current_value
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use std::str;
|
||||
use directory::ReadOnlySource;
|
||||
use super::CountingWriter;
|
||||
use std::io::Write;
|
||||
use super::{BLOCK_SIZE, StreamDictionary, StreamDictionaryBuilder};
|
||||
|
||||
#[test]
|
||||
fn test_stream_dictionary() {
|
||||
let ids: Vec<_> = (0u32..10_000u32)
|
||||
.map(|i| (format!("doc{:0>6}", i), i))
|
||||
.collect();
|
||||
let buffer: Vec<u8> = {
|
||||
let mut stream_dictionary_builder = StreamDictionaryBuilder::new(vec!()).unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
stream_dictionary_builder.insert(id.as_bytes(), i).unwrap();
|
||||
}
|
||||
stream_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let source = ReadOnlySource::from(buffer);
|
||||
let stream_dictionary: StreamDictionary<u32> = StreamDictionary::from_source(source).unwrap();
|
||||
{
|
||||
let mut streamer = stream_dictionary.stream();
|
||||
let mut i = 0;
|
||||
while let Some((streamer_k, streamer_v)) = streamer.next() {
|
||||
let &(ref key, ref v) = &ids[i];
|
||||
assert_eq!(streamer_k, key.as_bytes());
|
||||
assert_eq!(streamer_v, v);
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let &(ref key, ref _v) = &ids[2047];
|
||||
stream_dictionary.get(key.as_bytes());
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_stream_range() {
|
||||
let ids: Vec<_> = (0u32..10_000u32)
|
||||
.map(|i| (format!("doc{:0>6}", i), i))
|
||||
.collect();
|
||||
let buffer: Vec<u8> = {
|
||||
let mut stream_dictionary_builder = StreamDictionaryBuilder::new(vec!()).unwrap();
|
||||
for &(ref id, ref i) in &ids {
|
||||
stream_dictionary_builder.insert(id.as_bytes(), i).unwrap();
|
||||
}
|
||||
stream_dictionary_builder.finish().unwrap()
|
||||
};
|
||||
let source = ReadOnlySource::from(buffer);
|
||||
|
||||
let stream_dictionary: StreamDictionary<u32> = StreamDictionary::from_source(source).unwrap();
|
||||
{
|
||||
for i in (0..20).chain((BLOCK_SIZE - 10..BLOCK_SIZE + 10)) {
|
||||
let &(ref target_key, _) = &ids[i];
|
||||
let mut streamer = stream_dictionary
|
||||
.range()
|
||||
.ge(target_key.as_bytes())
|
||||
.into_stream();
|
||||
for j in 0..3 {
|
||||
let (streamer_k, streamer_v) = streamer.next().unwrap();
|
||||
let &(ref key, ref v) = &ids[i + j];
|
||||
assert_eq!(str::from_utf8(streamer_k).unwrap(), key);
|
||||
assert_eq!(streamer_v, v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
for i in (0..20).chain((BLOCK_SIZE - 10..BLOCK_SIZE + 10)) {
|
||||
let &(ref target_key, _) = &ids[i];
|
||||
let mut streamer = stream_dictionary
|
||||
.range()
|
||||
.gt(target_key.as_bytes())
|
||||
.into_stream();
|
||||
for j in 0..3 {
|
||||
let (streamer_k, streamer_v) = streamer.next().unwrap();
|
||||
let &(ref key, ref v) = &ids[i + j + 1];
|
||||
assert_eq!(streamer_k, key.as_bytes());
|
||||
assert_eq!(streamer_v, v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
for i in (0..20).chain((BLOCK_SIZE - 10..BLOCK_SIZE + 10)) {
|
||||
for j in 0..3 {
|
||||
let &(ref fst_key, _) = &ids[i];
|
||||
let &(ref last_key, _) = &ids[i + 3];
|
||||
let mut streamer = stream_dictionary
|
||||
.range()
|
||||
.ge(fst_key.as_bytes())
|
||||
.lt(last_key.as_bytes())
|
||||
.into_stream();
|
||||
for _ in 0..(j + 1) {
|
||||
assert!(streamer.next().is_some());
|
||||
}
|
||||
assert!(streamer.next().is_some());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,4 +1,5 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
use serde_json;
|
||||
use directory::error::{OpenReadError, DeleteError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use std::result;
|
||||
@@ -7,7 +8,6 @@ use Directory;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::collections::HashSet;
|
||||
use std::io::Write;
|
||||
use rustc_serialize::json;
|
||||
use core::MANAGED_FILEPATH;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
@@ -74,7 +74,7 @@ impl ManagedDirectory {
|
||||
match directory.atomic_read(&MANAGED_FILEPATH) {
|
||||
Ok(data) => {
|
||||
let managed_files_json = String::from_utf8_lossy(&data);
|
||||
let managed_files: HashSet<PathBuf> = json::decode(&managed_files_json)
|
||||
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
|
||||
.map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?;
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
@@ -204,8 +204,8 @@ impl ManagedDirectory {
|
||||
.expect("Managed file lock poisoned");
|
||||
managed_paths = meta_informations_rlock.managed_paths.clone();
|
||||
}
|
||||
let mut w = vec!();
|
||||
try!(write!(&mut w, "{}\n", json::as_pretty_json(&managed_paths)));
|
||||
let mut w = try!(serde_json::to_vec(&managed_paths));
|
||||
try!(write!(&mut w, "\n"));
|
||||
self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -53,7 +53,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadE
|
||||
|
||||
}
|
||||
|
||||
#[derive(Default,Clone,Debug,RustcDecodable,RustcEncodable)]
|
||||
#[derive(Default,Clone,Debug,Serialize,Deserialize)]
|
||||
pub struct CacheCounters {
|
||||
// Number of time the cache prevents to call `mmap`
|
||||
pub hit: usize,
|
||||
@@ -65,7 +65,7 @@ pub struct CacheCounters {
|
||||
pub miss_weak: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
|
||||
#[derive(Clone,Debug,Serialize,Deserialize)]
|
||||
pub struct CacheInfo {
|
||||
pub counters: CacheCounters,
|
||||
pub mmapped: Vec<PathBuf>,
|
||||
|
||||
@@ -79,3 +79,10 @@ impl Clone for ReadOnlySource {
|
||||
self.slice(0, self.len())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<u8>> for ReadOnlySource {
|
||||
fn from(data: Vec<u8>) -> ReadOnlySource {
|
||||
let shared_data = SharedVecSlice::from(data);
|
||||
ReadOnlySource::Anonymous(shared_data)
|
||||
}
|
||||
}
|
||||
@@ -35,3 +35,9 @@ impl SharedVecSlice {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<u8>> for SharedVecSlice {
|
||||
fn from(data: Vec<u8>) -> SharedVecSlice {
|
||||
SharedVecSlice::new(Arc::new(data))
|
||||
}
|
||||
}
|
||||
src/error.rs (20 lines changed)
@@ -10,9 +10,8 @@ use std::sync::PoisonError;
|
||||
use directory::error::{OpenReadError, OpenWriteError, OpenDirectoryError};
|
||||
use query;
|
||||
use schema;
|
||||
|
||||
|
||||
|
||||
use fastfield::FastFieldNotAvailableError;
|
||||
use serde_json;
|
||||
|
||||
|
||||
/// Generic tantivy error.
|
||||
@@ -38,7 +37,14 @@ pub enum Error {
|
||||
ErrorInThread(String),
|
||||
/// An Error appeared related to the lack of a field.
|
||||
SchemaError(String),
|
||||
|
||||
/// Tried to access a fastfield reader for a field not configured accordingly.
|
||||
FastFieldError(FastFieldNotAvailableError)
|
||||
}
|
||||
|
||||
impl From<FastFieldNotAvailableError> for Error {
|
||||
fn from(fastfield_error: FastFieldNotAvailableError) -> Error {
|
||||
Error::FastFieldError(fastfield_error)
|
||||
}
|
||||
}
|
||||
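With this conversion in place, a missing fast field can bubble up into tantivy's main `Result` through `?`. A hypothetical helper (the function itself is an assumption, not part of the diff):

    fn open_fast_field(reader: &SegmentReader, field: Field) -> Result<U64FastFieldReader> {
        let ff_reader = reader.get_fast_field_reader(field)?;
        Ok(ff_reader)
    }
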
|
||||
impl From<io::Error> for Error {
|
||||
@@ -95,3 +101,9 @@ impl From<OpenDirectoryError> for Error {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for Error {
|
||||
fn from(error: serde_json::Error) -> Error {
|
||||
Error::IOError(error.into())
|
||||
}
|
||||
}
|
||||
@@ -6,6 +6,9 @@ use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use common::HasLen;
|
||||
|
||||
/// Write a delete BitSet
|
||||
///
|
||||
/// where `delete_bitset` is the set of deleted `DocId`.
|
||||
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
|
||||
let max_doc = delete_bitset.capacity();
|
||||
let mut byte = 0u8;
|
||||
@@ -29,14 +32,16 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io:
|
||||
writer.flush()
|
||||
}
|
||||
|
||||
/// Set of deleted `DocId`s.
|
||||
#[derive(Clone)]
|
||||
pub struct DeleteBitSet {
|
||||
data: ReadOnlySource,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl DeleteBitSet {
|
||||
|
||||
impl DeleteBitSet {
|
||||
/// Opens a delete bitset given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
|
||||
let num_deleted: usize = data
|
||||
.as_slice()
|
||||
@@ -49,6 +54,7 @@ impl DeleteBitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty delete bit set.
|
||||
pub fn empty() -> DeleteBitSet {
|
||||
DeleteBitSet {
|
||||
data: ReadOnlySource::empty(),
|
||||
@@ -56,10 +62,12 @@ impl DeleteBitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff the segment has some deleted documents.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.len() > 0
|
||||
}
|
||||
|
||||
/// Returns true iff the document is deleted.
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
if self.len == 0 {
|
||||
false
|
||||
|
||||
src/fastfield/error.rs (new file, 26 lines)
@@ -0,0 +1,26 @@
|
||||
use std::result;
|
||||
use schema::FieldEntry;
|
||||
|
||||
/// FastFieldNotAvailableError is returned when the
|
||||
/// user requested a fast field reader, and the field was not
|
||||
/// defined in the schema as a fast field.
|
||||
#[derive(Debug)]
|
||||
pub struct FastFieldNotAvailableError {
|
||||
field_name: String,
|
||||
}
|
||||
|
||||
impl FastFieldNotAvailableError {
|
||||
|
||||
/// Creates a `FastFieldNotAvailable` error.
|
||||
/// `field_entry` is the configuration of the field
|
||||
/// for which fast fields are not available.
|
||||
pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError {
|
||||
FastFieldNotAvailableError {
|
||||
field_name: field_entry.name().to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Result when trying to access a fast field reader.
|
||||
pub type Result<R> = result::Result<R, FastFieldNotAvailableError>;
|
||||
@@ -1,23 +1,39 @@
|
||||
/// Fast field module
|
||||
///
|
||||
/// Fast fields are the equivalent of `DocValues` in `Lucene`.
|
||||
/// Fast fields are stored in column-oriented fashion and allow fast
|
||||
/// random access given a `DocId`.
|
||||
///
|
||||
/// Their performance is comparable to that of an array lookup.
|
||||
/// They are useful when a field is required for all or most of
|
||||
/// the `DocSet` : for instance for scoring, grouping, filtering, or facetting.
|
||||
///
|
||||
/// Currently only u32 fastfield are supported.
|
||||
//! # Fast fields
|
||||
//!
|
||||
//! Fast fields are the equivalent of `DocValues` in `Lucene`.
|
||||
//! Fast fields are `tantivy`'s non-compressed, column-oriented storage.
|
||||
//!
|
||||
//! It is designed for the fast random access of some document
|
||||
//! fields given a document id.
|
||||
//!
|
||||
//! `FastField` are useful when a field is required for all or most of
|
||||
//! the `DocSet`: for instance for scoring, grouping, filtering, or faceting.
|
||||
//!
|
||||
//!
|
||||
//! Fields have to be declared as `FAST` in the schema.
|
||||
//! Currently only 64-bit integers (signed or unsigned) are
|
||||
//! supported.
|
||||
//!
|
||||
//! They are stored in a bitpacked fashion so that their
|
||||
//! memory usage is directly linear with the amplitude of the
|
||||
//! values stored.
|
||||
//!
|
||||
//! Read access performance is comparable to that of an array lookup.
|
||||
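A back-of-the-envelope sketch of the bitpacking claim above: the bits spent per value depend only on the amplitude of the column, so a constant column costs almost nothing. This illustrates the idea, not the exact on-disk format:

    fn bits_per_value(min_value: u64, max_value: u64) -> u32 {
        let amplitude = max_value - min_value;
        // number of bits needed to represent any value in [0, amplitude]
        64 - amplitude.leading_zeros()
    }
    // bits_per_value(100_000, 100_000) == 0   (constant column)
    // bits_per_value(0, 9_999)         == 14
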
|
||||
mod reader;
|
||||
mod writer;
|
||||
mod serializer;
|
||||
pub mod delete;
|
||||
mod error;
|
||||
mod delete;
|
||||
|
||||
pub use self::writer::{U32FastFieldsWriter, U32FastFieldWriter};
|
||||
pub use self::reader::{U32FastFieldsReader, U32FastFieldReader};
|
||||
pub use self::delete::write_delete_bitset;
|
||||
pub use self::delete::DeleteBitSet;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
pub use self::reader::{FastFieldsReader, U64FastFieldReader, I64FastFieldReader};
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::error::{Result, FastFieldNotAvailableError};
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@@ -30,6 +46,7 @@ mod tests {
|
||||
use schema::FAST;
|
||||
use test::Bencher;
|
||||
use test;
|
||||
use fastfield::FastFieldReader;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::XorShiftRng;
|
||||
@@ -37,7 +54,7 @@ mod tests {
|
||||
lazy_static! {
|
||||
static ref SCHEMA: Schema = {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
schema_builder.add_u32_field("field", FAST);
|
||||
schema_builder.add_u64_field("field", FAST);
|
||||
schema_builder.build()
|
||||
};
|
||||
static ref FIELD: Field = {
|
||||
@@ -45,15 +62,15 @@ mod tests {
|
||||
};
|
||||
}
|
||||
|
||||
fn add_single_field_doc(fast_field_writers: &mut U32FastFieldsWriter, field: Field, value: u32) {
|
||||
fn add_single_field_doc(fast_field_writers: &mut FastFieldsWriter, field: Field, value: u64) {
|
||||
let mut doc = Document::default();
|
||||
doc.add_u32(field, value);
|
||||
doc.add_u64(field, value);
|
||||
fast_field_writers.add_document(&doc);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield() {
|
||||
let test_fastfield = U32FastFieldReader::from(vec!(100,200,300));
|
||||
let test_fastfield = U64FastFieldReader::from(vec!(100,200,300));
|
||||
assert_eq!(test_fastfield.get(0), 100);
|
||||
assert_eq!(test_fastfield.get(1), 200);
|
||||
assert_eq!(test_fastfield.get(2), 300);
|
||||
@@ -66,23 +83,23 @@ mod tests {
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 2u32);
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 2u64);
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 20 as usize);
|
||||
assert_eq!(source.len(), 31 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 13u32);
|
||||
assert_eq!(fast_field_reader.get(1), 14u32);
|
||||
assert_eq!(fast_field_reader.get(2), 2u32);
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 13u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14u64);
|
||||
assert_eq!(fast_field_reader.get(2), 2u64);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,35 +110,35 @@ mod tests {
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 777u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 215u32);
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 777u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u64);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 215u64);
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 45 as usize);
|
||||
assert_eq!(source.len(), 56 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4u32);
|
||||
assert_eq!(fast_field_reader.get(1), 14_082_001u32);
|
||||
assert_eq!(fast_field_reader.get(2), 3_052u32);
|
||||
assert_eq!(fast_field_reader.get(3), 9002u32);
|
||||
assert_eq!(fast_field_reader.get(4), 15_001u32);
|
||||
assert_eq!(fast_field_reader.get(5), 777u32);
|
||||
assert_eq!(fast_field_reader.get(6), 1_002u32);
|
||||
assert_eq!(fast_field_reader.get(7), 1_501u32);
|
||||
assert_eq!(fast_field_reader.get(8), 215u32);
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
|
||||
assert_eq!(fast_field_reader.get(2), 3_052u64);
|
||||
assert_eq!(fast_field_reader.get(3), 9002u64);
|
||||
assert_eq!(fast_field_reader.get(4), 15_001u64);
|
||||
assert_eq!(fast_field_reader.get(5), 777u64);
|
||||
assert_eq!(fast_field_reader.get(6), 1_002u64);
|
||||
assert_eq!(fast_field_reader.get(7), 1_501u64);
|
||||
assert_eq!(fast_field_reader.get(8), 215u64);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -134,30 +151,123 @@ mod tests {
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for _ in 0..10_000 {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u64);
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 18 as usize);
|
||||
assert_eq!(source.len(), 29 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
for doc in 0..10_000 {
|
||||
assert_eq!(fast_field_reader.get(doc), 100_000u32);
|
||||
assert_eq!(fast_field_reader.get(doc), 100_000u64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_permutation() -> Vec<u32> {
|
||||
#[test]
|
||||
fn test_intfastfield_large_numbers() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
// forcing the amplitude to be high
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 0u64);
|
||||
for i in 0u64..10_000u64 {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 5_000_000_000_000_000_000u64 + i);
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 80037 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 0u64);
|
||||
for doc in 1..10_001 {
|
||||
assert_eq!(fast_field_reader.get(doc), 5_000_000_000_000_000_000u64 + doc as u64 - 1u64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_signed_intfastfield() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
|
||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
for i in -100i64..10_000i64 {
|
||||
let mut doc = Document::default();
|
||||
doc.add_i64(i64_field, i);
|
||||
fast_field_writers.add_document(&doc);
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 17704 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: I64FastFieldReader = fast_field_readers.open_reader(i64_field).unwrap();
|
||||
assert_eq!(fast_field_reader.min_value(), -100i64);
|
||||
assert_eq!(fast_field_reader.max_value(), 9_999i64);
|
||||
for (doc, i) in (-100i64..10_000i64).enumerate() {
|
||||
assert_eq!(fast_field_reader.get(doc as u32), i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_signed_intfastfield_default_val() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
let doc = Document::default();
|
||||
fast_field_writers.add_document(&doc);
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: I64FastFieldReader = fast_field_readers.open_reader(i64_field).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0u32), 0i64);
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_permutation() -> Vec<u64> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, 4];
|
||||
let mut rng = XorShiftRng::from_seed(*seed);
|
||||
let mut permutation: Vec<u32> = (0u32..1_000_000u32).collect();
|
||||
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
|
||||
rng.shuffle(&mut permutation);
|
||||
permutation
|
||||
}
|
||||
@@ -171,7 +281,7 @@ mod tests {
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
}
|
||||
@@ -180,10 +290,11 @@ mod tests {
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
let mut a = 0u32;
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
let mut a = 0u64;
|
||||
for _ in 0..n {
|
||||
println!("i {}=> {} {}", a, fast_field_reader.get(a as u32), permutation[a as usize]);
|
||||
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
|
||||
a = fast_field_reader.get(a as u32);
|
||||
}
|
||||
@@ -195,7 +306,7 @@ mod tests {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u32;
|
||||
let mut a = 0u64;
|
||||
for i in (0u32..n).step_by(7) {
|
||||
a ^= permutation[i as usize];
|
||||
}
|
||||
@@ -208,7 +319,7 @@ mod tests {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
a = permutation[a as usize];
|
||||
}
|
||||
@@ -224,7 +335,7 @@ mod tests {
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
}
|
||||
@@ -233,11 +344,11 @@ mod tests {
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u32;
|
||||
let mut a = 0u64;
|
||||
for i in (0u32..n).step_by(7) {
|
||||
a ^= fast_field_reader.get(i);
|
||||
}
|
||||
@@ -254,7 +365,7 @@ mod tests {
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
}
|
||||
@@ -263,13 +374,13 @@ mod tests {
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader: U64FastFieldReader = fast_field_readers.open_reader(*FIELD).unwrap();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
for _ in 0u32..n {
|
||||
a = fast_field_reader.get(a);
|
||||
a = fast_field_reader.get(a) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
use std::io;
|
||||
use std::collections::HashMap;
|
||||
use std::ops::Deref;
|
||||
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use DocId;
|
||||
@@ -10,83 +8,123 @@ use std::path::Path;
|
||||
use schema::FAST;
|
||||
use directory::{WritePtr, RAMDirectory, Directory};
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::U32FastFieldsWriter;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use common::bitpacker::compute_num_bits;
|
||||
use common::bitpacker::BitUnpacker;
|
||||
use schema::FieldType;
|
||||
use common;
|
||||
|
||||
/// Trait for accessing a fastfield.
|
||||
///
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
pub trait FastFieldReader: Sized {
|
||||
|
||||
/// Type of the value stored in the fastfield.
|
||||
type ValueType;
|
||||
|
||||
/// Returns the value associated with the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
fn get(&self, doc: DocId) -> Self::ValueType;
|
||||
|
||||
|
||||
lazy_static! {
|
||||
static ref U32_FAST_FIELD_EMPTY: ReadOnlySource = {
|
||||
let u32_fast_field = U32FastFieldReader::from(Vec::new());
|
||||
u32_fast_field._data.clone()
|
||||
};
|
||||
/// Opens a fast field given a source.
|
||||
fn open(source: ReadOnlySource) -> Self;
|
||||
|
||||
/// Returns true iff the given field_type makes
|
||||
/// it possible to access the field values via a
|
||||
/// fastfield.
|
||||
fn is_enabled(field_type: &FieldType) -> bool;
|
||||
}
|
||||
|
||||
pub struct U32FastFieldReader {
|
||||
/// FastFieldReader for unsigned 64-bits integers.
|
||||
pub struct U64FastFieldReader {
|
||||
_data: ReadOnlySource,
|
||||
bit_unpacker: BitUnpacker,
|
||||
min_val: u32,
|
||||
max_val: u32,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
}
|
||||
|
||||
impl U32FastFieldReader {
|
||||
unsafe impl Send for U64FastFieldReader {}
|
||||
unsafe impl Sync for U64FastFieldReader {}
|
||||
|
||||
pub fn empty() -> U32FastFieldReader {
|
||||
U32FastFieldReader::open(U32_FAST_FIELD_EMPTY.clone())
|
||||
impl U64FastFieldReader {
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take possible
/// deleted documents into account, and should be considered
/// a lower bound on the actual minimum value.
|
||||
pub fn min_value(&self,) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take possible
/// deleted documents into account, and should be considered
/// an upper bound on the actual maximum value.
|
||||
pub fn max_value(&self,) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldReader for U64FastFieldReader {
|
||||
type ValueType = u64;
|
||||
|
||||
fn get(&self, doc: DocId) -> u64 {
|
||||
self.min_value + self.bit_unpacker.get(doc as usize)
|
||||
}
|
||||
|
||||
pub fn min_val(&self,) -> u32 {
|
||||
self.min_val
|
||||
}
|
||||
|
||||
pub fn max_val(&self,) -> u32 {
|
||||
self.max_val
|
||||
fn is_enabled(field_type: &FieldType) -> bool {
|
||||
match field_type {
|
||||
&FieldType::U64(ref integer_options) =>
|
||||
integer_options.is_fast(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Opens a new fast field reader given a read only source.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the data is corrupted.
|
||||
pub fn open(data: ReadOnlySource) -> U32FastFieldReader {
|
||||
let min_val;
|
||||
let amplitude;
|
||||
let max_val;
|
||||
fn open(data: ReadOnlySource) -> U64FastFieldReader {
|
||||
let min_value: u64;
|
||||
let max_value: u64;
|
||||
let bit_unpacker: BitUnpacker;
|
||||
|
||||
{
|
||||
let mut cursor = data.as_slice();
|
||||
min_val = u32::deserialize(&mut cursor).unwrap();
|
||||
amplitude = u32::deserialize(&mut cursor).unwrap();
|
||||
max_val = min_val + amplitude;
|
||||
let mut cursor: &[u8] = data.as_slice();
|
||||
min_value = u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
|
||||
let amplitude = u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
|
||||
max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
bit_unpacker = BitUnpacker::new(cursor, num_bits as usize)
|
||||
}
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = {
|
||||
let data_arr = &(data.deref()[8..]);
|
||||
BitUnpacker::new(data_arr, num_bits as usize)
|
||||
};
|
||||
U32FastFieldReader {
|
||||
|
||||
U64FastFieldReader {
|
||||
_data: data,
|
||||
bit_unpacker: bit_unpacker,
|
||||
min_val: min_val,
|
||||
max_val: max_val,
|
||||
min_value: min_value,
|
||||
max_value: max_value,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, doc: DocId) -> u32 {
|
||||
self.min_val + self.bit_unpacker.get(doc as usize)
|
||||
}
|
||||
}
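The reader above reconstructs each value as `min_value + bitpacked_delta`: the slice header stores `min_value` and an `amplitude` (max - min), and `compute_num_bits(amplitude)` decides how many bits each delta occupies. A minimal standalone sketch of that arithmetic, with a hypothetical `num_bits_for` helper standing in for the real bit-packer (illustration only, not tantivy's on-disk layout):

    // Illustration: deltas kept in a plain Vec instead of a bit-packed buffer.
    fn num_bits_for(amplitude: u64) -> u8 {
        (64 - amplitude.leading_zeros()) as u8
    }

    fn get(min_value: u64, deltas: &[u64], doc: usize) -> u64 {
        // same arithmetic as `U64FastFieldReader::get`
        min_value + deltas[doc]
    }

    fn main() {
        let vals = [13u64, 14, 2];
        let min_value = *vals.iter().min().unwrap();              // 2
        let amplitude = *vals.iter().max().unwrap() - min_value;  // 12
        let deltas: Vec<u64> = vals.iter().map(|v| v - min_value).collect();
        assert_eq!(num_bits_for(amplitude), 4);                   // 12 fits in 4 bits
        assert_eq!(get(min_value, &deltas, 0), 13);
        assert_eq!(get(min_value, &deltas, 2), 2);
    }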
|
||||
|
||||
|
||||
impl From<Vec<u32>> for U32FastFieldReader {
|
||||
fn from(vals: Vec<u32>) -> U32FastFieldReader {
|
||||
impl From<Vec<u64>> for U64FastFieldReader {
|
||||
fn from(vals: Vec<u64>) -> U64FastFieldReader {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_u32_field("field", FAST);
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&schema);
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
for val in vals {
|
||||
let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
|
||||
fast_field_writer.add_val(val);
|
||||
@@ -95,29 +133,103 @@ impl From<Vec<u32>> for U32FastFieldReader {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
fast_field_readers.get_field(field).unwrap()
|
||||
let fast_field_readers = FastFieldsReader::open(source).unwrap();
|
||||
fast_field_readers.open_reader(field).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct U32FastFieldsReader {
|
||||
/// FastFieldReader for signed 64-bits integers.
|
||||
pub struct I64FastFieldReader {
|
||||
underlying: U64FastFieldReader,
|
||||
}
|
||||
|
||||
unsafe impl Send for I64FastFieldReader {}
|
||||
unsafe impl Sync for I64FastFieldReader {}
|
||||
|
||||
impl I64FastFieldReader {
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take possible
/// deleted documents into account, and should be considered
/// a lower bound on the actual minimum value.
|
||||
pub fn min_value(&self,) -> i64 {
|
||||
common::u64_to_i64(self.underlying.min_value())
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take possible
/// deleted documents into account, and should be considered
/// an upper bound on the actual maximum value.
|
||||
pub fn max_value(&self,) -> i64 {
|
||||
common::u64_to_i64(self.underlying.max_value())
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldReader for I64FastFieldReader {
|
||||
type ValueType = i64;
|
||||
|
||||
fn get(&self, doc: DocId) -> i64 {
|
||||
common::u64_to_i64(self.underlying.get(doc))
|
||||
}
|
||||
|
||||
/// Opens a new fast field reader given a read only source.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the data is corrupted.
|
||||
fn open(data: ReadOnlySource) -> I64FastFieldReader {
|
||||
I64FastFieldReader {
|
||||
underlying: U64FastFieldReader::open(data)
|
||||
}
|
||||
}
|
||||
|
||||
fn is_enabled(field_type: &FieldType) -> bool {
|
||||
match field_type {
|
||||
&FieldType::I64(ref integer_options) => {
|
||||
if integer_options.is_fast() {
|
||||
true
|
||||
}
|
||||
else {
|
||||
false
|
||||
}
|
||||
},
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
}
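The i64 reader simply wraps the u64 reader and converts through `common::u64_to_i64`, so the same delta/bit-packing machinery is reused for signed values. A small sketch of an order-preserving mapping of that kind (the XOR-with-sign-bit trick is an assumption here; the exact implementation in `common` is not shown in this diff):

    // Flip the sign bit: i64::MIN maps to 0, i64::MAX maps to u64::MAX,
    // and ordering is preserved, which keeps min/max and deltas meaningful.
    fn i64_to_u64(val: i64) -> u64 {
        (val as u64) ^ (1u64 << 63)
    }

    fn u64_to_i64(val: u64) -> i64 {
        (val ^ (1u64 << 63)) as i64
    }

    fn main() {
        for &v in &[i64::min_value(), -100, 0, 1, i64::max_value()] {
            assert_eq!(u64_to_i64(i64_to_u64(v)), v); // round-trips
        }
        assert!(i64_to_u64(-1) < i64_to_u64(0));      // order preserved
    }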
|
||||
|
||||
|
||||
|
||||
/// The `FastFieldsReader` is the data structure containing
/// all of the fast fields' data.
///
/// It contains a mapping that associates these fields to
/// the proper slice in the fast field reader file.
|
||||
pub struct FastFieldsReader {
|
||||
source: ReadOnlySource,
|
||||
field_offsets: HashMap<Field, (u32, u32)>,
|
||||
}
|
||||
|
||||
impl U32FastFieldsReader {
|
||||
pub fn open(source: ReadOnlySource) -> io::Result<U32FastFieldsReader> {
|
||||
impl FastFieldsReader {
|
||||
|
||||
/// Opens the `FastFieldsReader` file
|
||||
///
|
||||
/// When opening the fast field reader,
/// the list of field offsets is read (as a footer of the
/// data file).
|
||||
pub fn open(source: ReadOnlySource) -> io::Result<FastFieldsReader> {
|
||||
let header_offset;
|
||||
let field_offsets: Vec<(Field, u32)>;
|
||||
{
|
||||
let buffer = source.as_slice();
|
||||
{
|
||||
let mut cursor = buffer;
|
||||
header_offset = try!(u32::deserialize(&mut cursor));
|
||||
header_offset = u32::deserialize(&mut cursor)?;
|
||||
}
|
||||
{
|
||||
let mut cursor = &buffer[header_offset as usize..];
|
||||
field_offsets = try!(Vec::deserialize(&mut cursor));
|
||||
field_offsets = Vec::deserialize(&mut cursor)?;
|
||||
}
|
||||
}
|
||||
let mut end_offsets: Vec<u32> = field_offsets
|
||||
@@ -130,26 +242,26 @@ impl U32FastFieldsReader {
|
||||
let (field, start_offset) = *field_start_offsets;
|
||||
field_offsets_map.insert(field, (start_offset, *stop_offset));
|
||||
}
|
||||
Ok(U32FastFieldsReader {
|
||||
Ok(FastFieldsReader {
|
||||
field_offsets: field_offsets_map,
|
||||
source: source,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the u32 fast value reader if the field
|
||||
/// is a u32 field indexed as "fast".
|
||||
|
||||
/// Returns the u64 fast value reader if the field
|
||||
/// is a u64 field indexed as "fast".
|
||||
///
|
||||
/// Return None if the field is not a u32 field
|
||||
/// Return None if the field is not a u64 field
|
||||
/// indexed with the fast option.
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn get_field(&self, field: Field) -> Option<U32FastFieldReader> {
|
||||
pub fn open_reader<FFReader: FastFieldReader>(&self, field: Field) -> Option<FFReader> {
|
||||
self.field_offsets
|
||||
.get(&field)
|
||||
.map(|&(start, stop)| {
|
||||
let field_source = self.source.slice(start as usize, stop as usize);
|
||||
U32FastFieldReader::open(field_source)
|
||||
FFReader::open(field_source)
|
||||
})
|
||||
}
|
||||
}
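With the generic `open_reader`, the caller now picks the concrete reader type. A hedged usage sketch from inside the crate (assuming `FastFieldsReader`, `U64FastFieldReader` and the `FastFieldReader` trait are re-exported from the `fastfield` module, as the other files in this diff suggest):

    use directory::ReadOnlySource;
    use fastfield::{FastFieldReader, FastFieldsReader, U64FastFieldReader};
    use schema::Field;

    /// Returns the fast field value of the first document, if the field
    /// was declared as a FAST u64 field.
    fn first_value(source: ReadOnlySource, field: Field) -> Option<u64> {
        let readers = FastFieldsReader::open(source).expect("corrupted fast field file");
        readers
            .open_reader::<U64FastFieldReader>(field)
            .map(|reader| reader.get(0))
    }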
|
||||
|
||||
@@ -14,13 +14,13 @@ use std::io::{self, Write, Seek, SeekFrom};
|
||||
/// the serializer.
|
||||
/// The serializer expects to receive the following calls.
|
||||
///
|
||||
/// * `new_u32_fast_field(...)`
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `new_u32_fast_field(...)`
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
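The call sequence above is easiest to see as a usage sketch, mirroring the unit tests earlier in this diff (a hedged example using the crate-internal API; error handling is shortened):

    use std::path::Path;
    use directory::{Directory, RAMDirectory, WritePtr};
    use fastfield::FastFieldSerializer;
    use schema::Field;

    fn write_one_fast_field() -> std::io::Result<()> {
        let mut directory = RAMDirectory::create();
        let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
        let mut serializer = FastFieldSerializer::new(write)?;
        // one open/close pair per field, values pushed in DocId order
        serializer.new_u64_fast_field(Field(0), 2u64, 14u64)?; // min, max
        for &val in &[13u64, 14u64, 2u64] {
            serializer.add_val(val)?;
        }
        serializer.close_field()?;
        serializer.close()?; // finalizes the file; per-field offsets end up in the footer
        Ok(())
    }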
@@ -29,7 +29,7 @@ pub struct FastFieldSerializer {
|
||||
write: WritePtr,
|
||||
written_size: usize,
|
||||
fields: Vec<(Field, u32)>,
|
||||
min_value: u32,
|
||||
min_value: u64,
|
||||
field_open: bool,
|
||||
bit_packer: BitPacker,
|
||||
}
|
||||
@@ -50,8 +50,8 @@ impl FastFieldSerializer {
|
||||
})
|
||||
}
|
||||
|
||||
/// Start serializing a new u32 fast field
|
||||
pub fn new_u32_fast_field(&mut self, field: Field, min_value: u32, max_value: u32) -> io::Result<()> {
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field(&mut self, field: Field, min_value: u64, max_value: u64) -> io::Result<()> {
|
||||
if self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
|
||||
}
|
||||
@@ -68,14 +68,14 @@ impl FastFieldSerializer {
|
||||
}
|
||||
|
||||
|
||||
/// Pushes a new value to the currently open u32 fast field.
|
||||
pub fn add_val(&mut self, val: u32) -> io::Result<()> {
|
||||
let val_to_write: u32 = val - self.min_value;
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer.write(val_to_write, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Close the u32 fast field.
|
||||
/// Close the u64 fast field.
|
||||
pub fn close_field(&mut self,) -> io::Result<()> {
|
||||
if !self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
|
||||
|
||||
@@ -3,47 +3,85 @@ use fastfield::FastFieldSerializer;
|
||||
use std::io;
|
||||
use schema::Value;
|
||||
use DocId;
|
||||
use common;
|
||||
use schema::FieldType;
|
||||
|
||||
pub struct U32FastFieldsWriter {
|
||||
field_writers: Vec<U32FastFieldWriter>,
|
||||
/// The `FastFieldsWriter` groups all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
field_writers: Vec<IntFastFieldWriter>,
|
||||
}
|
||||
|
||||
impl U32FastFieldsWriter {
|
||||
impl FastFieldsWriter {
|
||||
|
||||
pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter {
|
||||
let u32_fields: Vec<Field> = schema.fields()
|
||||
/// Creates all the `FastFieldWriter`s required by the schema.
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let field_writers: Vec<IntFastFieldWriter> = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_u32_fast())
|
||||
.map(|(field_id, _)| Field(field_id as u8))
|
||||
.flat_map(|(field_id, field_entry)| {
|
||||
let field = Field(field_id as u32);
|
||||
match field_entry.field_type() {
|
||||
&FieldType::I64(ref int_options) => {
|
||||
if int_options.is_fast() {
|
||||
let mut fast_field_writer = IntFastFieldWriter::new(field);
|
||||
fast_field_writer.set_val_if_missing(common::i64_to_u64(0i64));
|
||||
Some(fast_field_writer)
|
||||
}
|
||||
else {
|
||||
None
|
||||
}
|
||||
}
|
||||
&FieldType::U64(ref int_options) => {
|
||||
if int_options.is_fast() {
|
||||
Some(IntFastFieldWriter::new(field))
|
||||
}
|
||||
else {
|
||||
None
|
||||
}
|
||||
}
|
||||
_ => None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
U32FastFieldsWriter::new(u32_fields)
|
||||
FastFieldsWriter {
|
||||
field_writers: field_writers,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new(fields: Vec<Field>) -> U32FastFieldsWriter {
|
||||
U32FastFieldsWriter {
|
||||
|
||||
/// Returns a `FastFieldsWriter`
|
||||
/// with an `IntFastFieldWriter` for each
/// of the fields given in argument.
|
||||
pub fn new(fields: Vec<Field>) -> FastFieldsWriter {
|
||||
FastFieldsWriter {
|
||||
field_writers: fields
|
||||
.into_iter()
|
||||
.map(U32FastFieldWriter::new)
|
||||
.map(IntFastFieldWriter::new)
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut U32FastFieldWriter> {
|
||||
/// Get the `FastFieldWriter` associated with a field.
|
||||
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.field_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field == field)
|
||||
}
|
||||
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
for field_writer in &mut self.field_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes all of the `FastFieldWriter`s by pushing them in
|
||||
/// order to the fast field serializer.
|
||||
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
|
||||
for field_writer in &self.field_writers {
|
||||
try!(field_writer.serialize(serializer));
|
||||
field_writer.serialize(serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -56,23 +94,49 @@ impl U32FastFieldsWriter {
|
||||
for field_writer in &mut self.field_writers {
|
||||
field_writer.fill_val_up_to(doc);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
pub struct U32FastFieldWriter {
|
||||
/// Fast field writer for ints.
|
||||
/// The fast field writer just keeps the values in memory.
|
||||
///
|
||||
/// Only when the segment writer is closed and
/// persisted on disk is the fast field writer
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier because the values are
/// bitpacked and the number of bits required for bitpacking
/// can only be known once we have seen all of the values.
///
/// Both u64 and i64 use the same writer:
/// i64 values are simply remapped to `0..2^64 - 1`
/// using `common::i64_to_u64`.
|
||||
pub struct IntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u32>,
|
||||
vals: Vec<u64>,
|
||||
val_if_missing: u64,
|
||||
}
|
||||
|
||||
impl U32FastFieldWriter {
|
||||
pub fn new(field: Field) -> U32FastFieldWriter {
|
||||
U32FastFieldWriter {
|
||||
impl IntFastFieldWriter {
|
||||
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field) -> IntFastFieldWriter {
|
||||
IntFastFieldWriter {
|
||||
field: field,
|
||||
vals: Vec::new(),
|
||||
val_if_missing: 0u64,
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the default value.
|
||||
///
|
||||
/// This default value is recorded for a document
/// if it does not have any value for this field.
|
||||
fn set_val_if_missing(&mut self, val_if_missing: u64) {
|
||||
self.val_if_missing = val_if_missing;
|
||||
}
|
||||
|
||||
/// Ensures all of the fast field writers have
/// reached `doc` (included).
|
||||
///
|
||||
@@ -80,42 +144,68 @@ impl U32FastFieldWriter {
|
||||
fn fill_val_up_to(&mut self, doc: DocId) {
|
||||
let target = doc as usize + 1;
|
||||
debug_assert!(self.vals.len() <= target);
|
||||
let val_if_missing = self.val_if_missing;
|
||||
while self.vals.len() < target {
|
||||
self.add_val(0u32)
|
||||
self.add_val(val_if_missing);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_val(&mut self, val: u32) {
|
||||
|
||||
/// Records a new value.
|
||||
///
|
||||
/// The n-th value being recorded is implicitly
/// associated with the document whose `DocId` is n.
/// (Well, `n-1` actually, because of 0-indexing.)
|
||||
pub fn add_val(&mut self, val: u64) {
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
fn extract_val(&self, doc: &Document) -> u32 {
|
||||
|
||||
/// Extracts the value associated with the fast field for
/// this document.
///
/// i64 values are remapped to u64 using the logic
/// in `common::i64_to_u64`.
///
/// If the value is missing, then the default value is used
/// instead.
/// If the document has more than one value for the given field,
/// only the first one is taken into account.
|
||||
fn extract_val(&self, doc: &Document) -> u64 {
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => {
|
||||
match *v {
|
||||
Value::U32(ref val) => { *val }
|
||||
_ => { panic!("Expected a u32field, got {:?} ", v) }
|
||||
Value::U64(ref val) => { *val },
|
||||
Value::I64(ref val) => common::i64_to_u64(*val),
|
||||
_ => { panic!("Expected a u64field, got {:?} ", v) }
|
||||
}
|
||||
},
|
||||
None => {
|
||||
0u32
|
||||
self.val_if_missing
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Extracts the fast field value from the document
/// (or uses the default value) and records it.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
let val = self.extract_val(doc);
|
||||
self.add_val(val);
|
||||
}
|
||||
|
||||
/// Pushes the fast field values to the `FastFieldSerializer`.
|
||||
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
|
||||
let zero = 0;
|
||||
let min = *self.vals.iter().min().unwrap_or(&zero);
|
||||
let max = *self.vals.iter().max().unwrap_or(&min);
|
||||
try!(serializer.new_u32_fast_field(self.field, min, max));
|
||||
serializer.new_u64_fast_field(self.field, min, max)?;
|
||||
for &val in &self.vals {
|
||||
try!(serializer.add_val(val));
|
||||
serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ use Index;
|
||||
use Searcher;
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
|
||||
fn check_index_content(searcher: &Searcher, vals: &HashSet<u32>) {
|
||||
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
assert!(searcher.segment_readers().len() < 20);
|
||||
assert_eq!(searcher.num_docs() as usize, vals.len());
|
||||
}
|
||||
@@ -17,19 +17,19 @@ fn test_indexing() {
|
||||
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
let id_field = schema_builder.add_u32_field("id", U32_INDEXED);
|
||||
let multiples_field = schema_builder.add_u32_field("multiples", U32_INDEXED);
|
||||
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
|
||||
let universe = Range::new(0u32, 20u32);
|
||||
let universe = Range::new(0u64, 20u64);
|
||||
let mut rng = thread_rng();
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
|
||||
|
||||
let mut committed_docs: HashSet<u32> = HashSet::new();
|
||||
let mut uncommitted_docs: HashSet<u32> = HashSet::new();
|
||||
let mut committed_docs: HashSet<u64> = HashSet::new();
|
||||
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
|
||||
|
||||
for _ in 0..200 {
|
||||
let random_val = universe.ind_sample(&mut rng);
|
||||
@@ -45,15 +45,15 @@ fn test_indexing() {
|
||||
else {
|
||||
if committed_docs.remove(&random_val) ||
|
||||
uncommitted_docs.remove(&random_val) {
|
||||
let doc_id_term = Term::from_field_u32(id_field, random_val);
|
||||
let doc_id_term = Term::from_field_u64(id_field, random_val);
|
||||
index_writer.delete_term(doc_id_term);
|
||||
}
|
||||
else {
|
||||
uncommitted_docs.insert(random_val);
|
||||
let mut doc = Document::new();
|
||||
doc.add_u32(id_field, random_val);
|
||||
for i in 1u32..10u32 {
|
||||
doc.add_u32(multiples_field, random_val * i);
|
||||
doc.add_u64(id_field, random_val);
|
||||
for i in 1u64..10u64 {
|
||||
doc.add_u64(multiples_field, random_val * i);
|
||||
}
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
|
||||
@@ -170,7 +170,7 @@ impl NextBlock {
|
||||
}
|
||||
}
|
||||
}
|
||||
*next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone()); // TODO fix
|
||||
*next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone());
|
||||
return Some(next_block)
|
||||
}
|
||||
}
|
||||
@@ -280,10 +280,10 @@ mod tests {
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let make_op = |i: usize| {
|
||||
let field = Field(1u8);
|
||||
let field = Field(1u32);
|
||||
DeleteOperation {
|
||||
opstamp: i as u64,
|
||||
term: Term::from_field_u32(field, i as u32)
|
||||
term: Term::from_field_u64(field, i as u64)
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ use datastruct::stacker::Heap;
|
||||
use directory::FileProtection;
|
||||
use Error;
|
||||
use Directory;
|
||||
use fastfield::delete::write_delete_bitset;
|
||||
use fastfield::write_delete_bitset;
|
||||
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
use futures::Canceled;
|
||||
use futures::Future;
|
||||
@@ -36,9 +36,9 @@ use std::thread;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
// in the heap goes below MARGIN_IN_BYTES.
|
||||
pub const MARGIN_IN_BYTES: u32 = 10_000_000u32;
|
||||
pub const MARGIN_IN_BYTES: u32 = 1_000_000u32;
|
||||
|
||||
// We impose the memory per thread to be at least 30 MB.
|
||||
// We impose the memory per thread to be at least 3 MB.
|
||||
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
|
||||
|
||||
// Add document will block if the number of docs waiting in the queue to be indexed reaches PIPELINE_MAX_SIZE_IN_DOCS
|
||||
@@ -120,7 +120,9 @@ pub fn open_index_writer(
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let stamper = Stamper::new(index.opstamp());
|
||||
let current_opstamp = index.opstamp();
|
||||
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
|
||||
let segment_updater = SegmentUpdater::new(index.clone(),
|
||||
stamper.clone(),
|
||||
@@ -143,7 +145,7 @@ pub fn open_index_writer(
|
||||
|
||||
delete_queue: delete_queue,
|
||||
|
||||
committed_opstamp: index.opstamp(),
|
||||
committed_opstamp: current_opstamp,
|
||||
stamper: stamper,
|
||||
|
||||
generation: 0,
|
||||
@@ -196,10 +198,6 @@ pub fn compute_deleted_bitset(
|
||||
Ok(might_have_changed)
|
||||
}
|
||||
|
||||
|
||||
// TODO skip delete operation before teh
|
||||
// last delete opstamp
|
||||
|
||||
/// Advance delete for the given segment up
|
||||
/// to the target opstamp.
|
||||
pub fn advance_deletes(
|
||||
@@ -268,11 +266,24 @@ fn index_documents(heap: &mut Heap,
|
||||
let mut segment_writer = SegmentWriter::for_segment(heap, segment.clone(), &schema)?;
|
||||
for doc in document_iterator {
|
||||
try!(segment_writer.add_document(&doc, &schema));
|
||||
// There are two possible conditions for closing the segment.
// One is that the memory arena dedicated to the segment is
// getting full.
|
||||
if segment_writer.is_buffer_full() {
|
||||
info!("Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc());
|
||||
break;
|
||||
}
|
||||
// The second is that the term dictionary hash table
// is reaching saturation.
//
// Tantivy does not resize its hashtable. When it reaches
// capacity, we simply stop indexing new documents.
|
||||
if segment_writer.is_termdic_saturated() {
|
||||
info!("Term dic saturated, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc());
|
||||
break;
|
||||
}
|
||||
}
|
||||
let num_docs = segment_writer.max_doc();
|
||||
|
||||
@@ -345,6 +356,11 @@ impl IndexWriter {
|
||||
result
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
self.segment_updater.new_segment()
|
||||
}
|
||||
|
||||
/// Spawns a new worker thread for indexing.
|
||||
/// The thread consumes documents from the pipeline.
|
||||
///
|
||||
@@ -418,6 +434,12 @@ impl IndexWriter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
|
||||
let delete_cursor = self.delete_queue.cursor();
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
|
||||
self.segment_updater.add_segment(self.generation, segment_entry);
|
||||
}
|
||||
|
||||
/// Detects and removes the files that
|
||||
/// are not used by the index anymore.
|
||||
pub fn garbage_collect_files(&mut self) -> Result<()> {
|
||||
|
||||
@@ -6,17 +6,17 @@ use core::SerializableSegment;
|
||||
use schema::FieldValue;
|
||||
use indexer::SegmentSerializer;
|
||||
use postings::PostingsSerializer;
|
||||
use fastfield::U32FastFieldReader;
|
||||
use fastfield::U64FastFieldReader;
|
||||
use itertools::Itertools;
|
||||
use postings::Postings;
|
||||
use postings::DocSet;
|
||||
use core::TermIterator;
|
||||
use fastfield::delete::DeleteBitSet;
|
||||
use fastfield::DeleteBitSet;
|
||||
use schema::{Schema, Field};
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::FastFieldReader;
|
||||
use store::StoreWriter;
|
||||
use std::cmp::{min, max};
|
||||
use common::allocate_vec;
|
||||
|
||||
pub struct IndexMerger {
|
||||
schema: Schema,
|
||||
@@ -32,7 +32,7 @@ struct DeltaPositionComputer {
|
||||
impl DeltaPositionComputer {
|
||||
fn new() -> DeltaPositionComputer {
|
||||
DeltaPositionComputer {
|
||||
buffer: allocate_vec(512)
|
||||
buffer: vec![0u32; 512]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,32 +50,34 @@ impl DeltaPositionComputer {
|
||||
}
|
||||
|
||||
|
||||
fn compute_min_max_val(u32_reader: &U32FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u32, u32)> {
|
||||
fn compute_min_max_val(u64_reader: &U64FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u64, u64)> {
|
||||
if max_doc == 0 {
|
||||
None
|
||||
}
|
||||
else if !delete_bitset.has_deletes() {
|
||||
// no deleted documents,
|
||||
// we can use the previous min_val, max_val.
|
||||
Some((u32_reader.min_val(), u32_reader.max_val()))
|
||||
Some((u64_reader.min_value(), u64_reader.max_value()))
|
||||
}
|
||||
else {
|
||||
// some deleted documents,
|
||||
// we need to recompute the max / min
|
||||
(0..max_doc)
|
||||
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
|
||||
.map(|doc_id| u32_reader.get(doc_id))
|
||||
.map(|doc_id| u64_reader.get(doc_id))
|
||||
.minmax()
|
||||
.into_option()
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option<U32FastFieldReader> {
|
||||
fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option<U64FastFieldReader> {
|
||||
segment_reader.get_fieldnorms_reader(field)
|
||||
}
|
||||
|
||||
fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option<U32FastFieldReader> {
|
||||
segment_reader.get_fast_field_reader(field)
|
||||
fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option<U64FastFieldReader> {
|
||||
segment_reader
|
||||
.fast_fields_reader()
|
||||
.open_reader(field)
|
||||
}
|
||||
|
||||
impl IndexMerger {
|
||||
@@ -103,7 +105,7 @@ impl IndexMerger {
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.map(|(field_id, _)| Field(field_id as u8))
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
self.generic_write_fast_field(fieldnorm_fastfields, &extract_fieldnorm_reader, fast_field_serializer)
|
||||
}
|
||||
@@ -113,37 +115,37 @@ impl IndexMerger {
|
||||
.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_u32_fast())
|
||||
.map(|(field_id, _)| Field(field_id as u8))
|
||||
.filter(|&(_, field_entry)| field_entry.is_int_fast())
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
self.generic_write_fast_field(fast_fields, &extract_fast_field_reader, fast_field_serializer)
|
||||
}
|
||||
|
||||
|
||||
// used both to merge field norms and regular u32 fast fields.
|
||||
// used both to merge field norms and regular u64 fast fields.
|
||||
fn generic_write_fast_field(&self,
|
||||
fields: Vec<Field>,
|
||||
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U32FastFieldReader>,
|
||||
field_reader_extractor: &Fn(&SegmentReader, Field) -> Option<U64FastFieldReader>,
|
||||
fast_field_serializer: &mut FastFieldSerializer) -> Result<()> {
|
||||
|
||||
for field in fields {
|
||||
|
||||
let mut u32_readers = vec!();
|
||||
let mut min_val = u32::max_value();
|
||||
let mut max_val = u32::min_value();
|
||||
let mut u64_readers = vec!();
|
||||
let mut min_val = u64::max_value();
|
||||
let mut max_val = u64::min_value();
|
||||
|
||||
for reader in &self.readers {
|
||||
match field_reader_extractor(reader, field) {
|
||||
Some(u32_reader) => {
|
||||
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u32_reader, reader.max_doc(), reader.delete_bitset()) {
|
||||
Some(u64_reader) => {
|
||||
if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset()) {
|
||||
// the segment has some non-deleted documents
|
||||
min_val = min(min_val, seg_min_val);
|
||||
max_val = max(max_val, seg_max_val);
|
||||
u32_readers.push((reader.max_doc(), u32_reader, reader.delete_bitset()));
|
||||
u64_readers.push((reader.max_doc(), u64_reader, reader.delete_bitset()));
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let error_msg = format!("Failed to find a u32_reader for field {:?}", field);
|
||||
let error_msg = format!("Failed to find a u64_reader for field {:?}", field);
|
||||
error!("{}", error_msg);
|
||||
return Err(Error::SchemaError(error_msg))
|
||||
}
|
||||
@@ -151,7 +153,7 @@ impl IndexMerger {
|
||||
|
||||
}
|
||||
|
||||
if u32_readers.is_empty() {
|
||||
if u64_readers.is_empty() {
|
||||
// we have actually zero documents.
|
||||
min_val = 0;
|
||||
max_val = 0;
|
||||
@@ -159,11 +161,11 @@ impl IndexMerger {
|
||||
|
||||
assert!(min_val <= max_val);
|
||||
|
||||
try!(fast_field_serializer.new_u32_fast_field(field, min_val, max_val));
|
||||
for (max_doc, u32_reader, delete_bitset) in u32_readers {
|
||||
try!(fast_field_serializer.new_u64_fast_field(field, min_val, max_val));
|
||||
for (max_doc, u64_reader, delete_bitset) in u64_readers {
|
||||
for doc_id in 0..max_doc {
|
||||
if !delete_bitset.is_deleted(doc_id) {
|
||||
let val = u32_reader.get(doc_id);
|
||||
let val = u64_reader.get(doc_id);
|
||||
try!(fast_field_serializer.add_val(val));
|
||||
}
|
||||
}
|
||||
@@ -199,6 +201,8 @@ impl IndexMerger {
|
||||
}
|
||||
merged_doc_id_map.push(segment_local_map);
|
||||
}
|
||||
|
||||
let mut last_field: Option<Field> = None;
|
||||
|
||||
while merged_terms.advance() {
|
||||
// Create the total list of doc ids
|
||||
@@ -229,15 +233,20 @@ impl IndexMerger {
|
||||
|
||||
// We can now serialize this postings, by pushing each document to the
|
||||
// postings serializer.
|
||||
|
||||
for (segment_ord, mut segment_postings) in segment_postings {
|
||||
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
|
||||
while segment_postings.advance() {
|
||||
if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] {
|
||||
if !term_written {
|
||||
let current_field = term.field();
|
||||
if last_field != Some(current_field) {
|
||||
postings_serializer.new_field(current_field);
|
||||
last_field = Some(current_field);
|
||||
}
|
||||
|
||||
// we make sure to only write the term iff
|
||||
// there is at least one document.
|
||||
postings_serializer.new_term(&term)?;
|
||||
postings_serializer.new_term(term.as_slice())?;
|
||||
term_written = true;
|
||||
}
|
||||
let delta_positions: &[u32] =
|
||||
@@ -295,6 +304,7 @@ mod tests {
|
||||
use query::TermQuery;
|
||||
use schema::{Field, FieldValue};
|
||||
use core::Index;
|
||||
use fastfield::U64FastFieldReader;
|
||||
use Searcher;
|
||||
use DocAddress;
|
||||
use collector::tests::FastFieldTestCollector;
|
||||
@@ -311,8 +321,8 @@ mod tests {
|
||||
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let score_fieldtype = schema::U32Options::default().set_fast();
|
||||
let score_field = schema_builder.add_u32_field("score", score_fieldtype);
|
||||
let score_fieldtype = schema::IntOptions::default().set_fast();
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
{
|
||||
@@ -322,19 +332,19 @@ mod tests {
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "af b");
|
||||
doc.add_u32(score_field, 3);
|
||||
doc.add_u64(score_field, 3);
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c");
|
||||
doc.add_u32(score_field, 5);
|
||||
doc.add_u64(score_field, 5);
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c d");
|
||||
doc.add_u32(score_field, 7);
|
||||
doc.add_u64(score_field, 7);
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().expect("committed");
|
||||
@@ -345,13 +355,13 @@ mod tests {
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "af b");
|
||||
doc.add_u32(score_field, 11);
|
||||
doc.add_u64(score_field, 11);
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c g");
|
||||
doc.add_u32(score_field, 13);
|
||||
doc.add_u64(score_field, 13);
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().expect("Commit failed");
|
||||
@@ -417,7 +427,7 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn search_term(searcher: &Searcher, term: Term) -> Vec<u32> {
|
||||
fn search_term(searcher: &Searcher, term: Term) -> Vec<u64> {
|
||||
let mut collector = FastFieldTestCollector::for_field(Field(1));
|
||||
let term_query = TermQuery::new(term, SegmentPostingsOption::NoFreq);
|
||||
searcher.search(&term_query, &mut collector).unwrap();
|
||||
@@ -432,27 +442,29 @@ mod tests {
|
||||
.set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let score_fieldtype = schema::U32Options::default().set_fast();
|
||||
let score_field = schema_builder.add_u32_field("score", score_fieldtype);
|
||||
let score_fieldtype = schema::IntOptions::default().set_fast();
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
|
||||
let empty_vec = Vec::<u64>::new();
|
||||
|
||||
{ // a first commit
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
text_field => "a b d",
|
||||
score_field => 1
|
||||
score_field => 1u64
|
||||
));
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
text_field => "b c",
|
||||
score_field => 2
|
||||
score_field => 2u64
|
||||
));
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
text_field => "c d",
|
||||
score_field => 3
|
||||
score_field => 3u64
|
||||
));
|
||||
index_writer.commit().expect("committed");
|
||||
index.load_searchers().unwrap();
|
||||
@@ -469,24 +481,24 @@ mod tests {
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
text_field => "a d e",
|
||||
score_field => 4_000
|
||||
score_field => 4_000u64
|
||||
));
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
text_field => "e f",
|
||||
score_field => 5_000
|
||||
score_field => 5_000u64
|
||||
));
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "a"));
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "f"));
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
text_field => "f g",
|
||||
score_field => 6_000
|
||||
score_field => 6_000u64
|
||||
));
|
||||
index_writer.add_document(
|
||||
doc!(
|
||||
text_field => "g h",
|
||||
score_field => 7_000
|
||||
score_field => 7_000u64
|
||||
));
|
||||
index_writer.commit().expect("committed");
|
||||
index.load_searchers().unwrap();
|
||||
@@ -498,21 +510,21 @@ mod tests {
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
|
||||
|
||||
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(score_field_reader.min_val(), 1);
|
||||
assert_eq!(score_field_reader.max_val(), 3);
|
||||
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 1);
|
||||
assert_eq!(score_field_reader.max_value(), 3);
|
||||
|
||||
let score_field_reader = searcher.segment_reader(1).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(score_field_reader.min_val(), 4000);
|
||||
assert_eq!(score_field_reader.max_val(), 7000);
|
||||
let score_field_reader: U64FastFieldReader = searcher.segment_reader(1).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 4000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
|
||||
{ // merging the segments
|
||||
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
|
||||
@@ -525,16 +537,16 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 3);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
|
||||
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(score_field_reader.min_val(), 3);
|
||||
assert_eq!(score_field_reader.max_val(), 7000);
|
||||
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
|
||||
{
|
||||
// test a commit with only deletes
|
||||
@@ -547,16 +559,16 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
|
||||
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(score_field_reader.min_val(), 3);
|
||||
assert_eq!(score_field_reader.max_val(), 7000);
|
||||
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
|
||||
{ // Test merging a single segment in order to remove deletes.
|
||||
let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed.");
|
||||
@@ -570,16 +582,16 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 2);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!());
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), empty_vec);
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000));
|
||||
assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000));
|
||||
let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(score_field_reader.min_val(), 6000);
|
||||
assert_eq!(score_field_reader.max_val(), 7000);
|
||||
let score_field_reader: U64FastFieldReader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 6000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
}
|
||||
|
||||
{ // Test removing all docs
|
||||
|
||||
@@ -172,7 +172,6 @@ impl SegmentManager {
|
||||
// ... and we make sure the target segment entry
|
||||
// can be garbage collected.
|
||||
registers_lock.writing.remove(&after_merge_segment_id);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ use indexer::SegmentEntry;
|
||||
use indexer::SegmentSerializer;
|
||||
use Result;
|
||||
use futures_cpupool::CpuFuture;
|
||||
use rustc_serialize::json;
|
||||
use serde_json;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use schema::Schema;
|
||||
use std::borrow::BorrowMut;
|
||||
@@ -77,10 +77,10 @@ pub fn save_metas(segment_metas: Vec<SegmentMeta>,
|
||||
schema: schema,
|
||||
opstamp: opstamp,
|
||||
};
|
||||
let mut w = vec!();
|
||||
try!(write!(&mut w, "{}\n", json::as_pretty_json(&metas)));
|
||||
let mut w = try!(serde_json::to_vec_pretty(&metas));
|
||||
try!(write!(&mut w, "\n"));
|
||||
let res = directory.atomic_write(&META_FILEPATH, &w[..])?;
|
||||
debug!("Saved metas {}", json::as_pretty_json(&metas));
|
||||
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
|
||||
Ok(res)
|
||||
|
||||
}
|
||||
@@ -309,8 +309,6 @@ impl SegmentUpdater {
|
||||
let merging_join_handle = thread::spawn(move || {
|
||||
|
||||
// first we need to apply deletes to our segment.
|
||||
info!("Start merge: {:?}", segment_ids_vec);
|
||||
|
||||
let merged_segment = segment_updater_clone.new_segment();
|
||||
let merged_segment_id = merged_segment.id();
|
||||
let merge_result = perform_merge(&segment_ids_vec, &segment_updater_clone, merged_segment, target_opstamp);
|
||||
@@ -374,12 +372,25 @@ impl SegmentUpdater {
|
||||
self.run_async(move |segment_updater| {
|
||||
debug!("End merge {:?}", after_merge_segment_entry.meta());
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
let mut _file_protection_opt = None;
|
||||
if let Some(delete_operation) = delete_cursor.get() {
|
||||
let committed_opstamp = segment_updater.0.index.opstamp();
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let segment = segment_updater.0.index.segment(after_merge_segment_entry.meta().clone());
|
||||
// TODO check unwrap
|
||||
advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp).unwrap();
|
||||
match advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp) {
|
||||
Ok(file_protection_opt_res) => {
|
||||
_file_protection_opt = file_protection_opt_res;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Merge of {:?} was cancelled (advancing deletes failed): {:?}", before_merge_segment_ids, e);
|
||||
// ... cancel merge
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
}
|
||||
segment_updater.cancel_merge(&before_merge_segment_ids, after_merge_segment_entry.segment_id());
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
|
||||
|
||||
@@ -5,19 +5,15 @@ use schema::Schema;
|
||||
use schema::Term;
|
||||
use core::Segment;
|
||||
use core::SerializableSegment;
|
||||
use postings::PostingsWriter;
|
||||
use fastfield::U32FastFieldsWriter;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use schema::Field;
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldValue;
|
||||
use schema::FieldType;
|
||||
use schema::TextIndexingOptions;
|
||||
use postings::SpecializedPostingsWriter;
|
||||
use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use datastruct::stacker::Heap;
|
||||
use indexer::index_writer::MARGIN_IN_BYTES;
|
||||
use super::operation::AddOperation;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
|
||||
|
||||
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||
@@ -28,46 +24,25 @@ use super::operation::AddOperation;
|
||||
pub struct SegmentWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
max_doc: DocId,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
|
||||
multifield_postings: MultiFieldPostingsWriter<'a>,
|
||||
segment_serializer: SegmentSerializer,
|
||||
fast_field_writers: U32FastFieldsWriter,
|
||||
fieldnorms_writer: U32FastFieldsWriter,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FastFieldsWriter,
|
||||
doc_opstamps: Vec<u64>,
|
||||
}
|
||||
|
||||
|
||||
fn create_fieldnorms_writer(schema: &Schema) -> U32FastFieldsWriter {
|
||||
let u32_fields: Vec<Field> = schema.fields()
|
||||
fn create_fieldnorms_writer(schema: &Schema) -> FastFieldsWriter {
|
||||
let u64_fields: Vec<Field> = schema.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_indexed())
|
||||
.map(|(field_id, _)| Field(field_id as u8))
|
||||
.map(|(field_id, _)| Field(field_id as u32))
|
||||
.collect();
|
||||
U32FastFieldsWriter::new(u32_fields)
|
||||
FastFieldsWriter::new(u64_fields)
|
||||
}
|
||||
|
||||
|
||||
fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box<PostingsWriter + 'a> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
match text_options.get_indexing_options() {
|
||||
TextIndexingOptions::TokenizedWithFreq => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
|
||||
}
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
}
|
||||
_ => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::U32(_) => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a> SegmentWriter<'a> {
|
||||
|
||||
@@ -84,18 +59,14 @@ impl<'a> SegmentWriter<'a> {
|
||||
mut segment: Segment,
|
||||
schema: &Schema) -> Result<SegmentWriter<'a>> {
|
||||
let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
|
||||
let mut per_field_postings_writers: Vec<Box<PostingsWriter + 'a>> = Vec::new();
|
||||
for field_entry in schema.fields() {
|
||||
let postings_writer: Box<PostingsWriter + 'a> = posting_from_field_entry(field_entry, heap);
|
||||
per_field_postings_writers.push(postings_writer);
|
||||
}
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, heap);
|
||||
Ok(SegmentWriter {
|
||||
heap: heap,
|
||||
max_doc: 0,
|
||||
per_field_postings_writers: per_field_postings_writers,
|
||||
multifield_postings: multifield_postings,
|
||||
fieldnorms_writer: create_fieldnorms_writer(schema),
|
||||
segment_serializer: segment_serializer,
|
||||
fast_field_writers: U32FastFieldsWriter::from_schema(schema),
|
||||
fast_field_writers: FastFieldsWriter::from_schema(schema),
|
||||
doc_opstamps: Vec::with_capacity(1_000),
|
||||
})
|
||||
}
|
||||
@@ -104,15 +75,11 @@ impl<'a> SegmentWriter<'a> {
|
||||
///
|
||||
/// Finalize consumes the `SegmentWriter`, so that it cannot
|
||||
/// be used afterwards.
|
||||
pub fn finalize(mut self) -> Result<Vec<u64>> {
|
||||
for per_field_postings_writer in &mut self.per_field_postings_writers {
|
||||
per_field_postings_writer.close(self.heap);
|
||||
}
|
||||
write(&self.per_field_postings_writers,
|
||||
pub fn finalize(self) -> Result<Vec<u64>> {
|
||||
write(&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
self.segment_serializer,
|
||||
self.heap)?;
|
||||
self.segment_serializer)?;
|
||||
Ok(self.doc_opstamps)
|
||||
}
|
||||
|
||||
@@ -127,6 +94,15 @@ impl<'a> SegmentWriter<'a> {
|
||||
self.heap.num_free_bytes() <= MARGIN_IN_BYTES
|
||||
}
|
||||
|
||||
|
||||
/// Return true if the term dictionary hashmap is reaching capacity.
|
||||
/// It is one of the conditions that trigger a `SegmentWriter` to
|
||||
/// be finalized.
|
||||
pub(crate) fn is_termdic_saturated(&self,) -> bool {
|
||||
self.multifield_postings.is_termdic_saturated()
|
||||
}
|
||||
|
||||
|
||||
/// Indexes a new document
|
||||
///
|
||||
/// As a user, you should rather use `IndexWriter`'s add_document.
|
||||
@@ -135,33 +111,40 @@ impl<'a> SegmentWriter<'a> {
|
||||
let doc = &add_operation.document;
|
||||
self.doc_opstamps.push(add_operation.opstamp);
|
||||
for (field, field_values) in doc.get_sorted_field_values() {
|
||||
let field_posting_writer: &mut Box<PostingsWriter> = &mut self.per_field_postings_writers[field.0 as usize];
|
||||
let field_options = schema.get_field_entry(field);
|
||||
match *field_options.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
let num_tokens: u32 =
|
||||
let num_tokens: u32 =
|
||||
if text_options.get_indexing_options().is_tokenized() {
|
||||
field_posting_writer.index_text(doc_id, field, &field_values, self.heap)
|
||||
self.multifield_postings.index_text(doc_id, field, &field_values)
|
||||
}
|
||||
else {
|
||||
let num_field_values = field_values.len() as u32;
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_text(field, field_value.value().text());
|
||||
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
|
||||
self.multifield_postings.suscribe(doc_id, &term);
|
||||
}
|
||||
num_field_values
|
||||
};
|
||||
self.fieldnorms_writer
|
||||
.get_field_writer(field)
|
||||
.map(|field_norms_writer| {
|
||||
field_norms_writer.add_val(num_tokens as u32)
|
||||
field_norms_writer.add_val(num_tokens as u64)
|
||||
});
|
||||
}
|
||||
FieldType::U32(ref u32_options) => {
|
||||
if u32_options.is_indexed() {
|
||||
FieldType::U64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_u32(field_value.field(), field_value.value().u32_value());
|
||||
field_posting_writer.suscribe(doc_id, 0, &term, self.heap);
|
||||
let term = Term::from_field_u64(field_value.field(), field_value.value().u64_value());
|
||||
self.multifield_postings.suscribe(doc_id, &term);
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::I64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_i64(field_value.field(), field_value.value().i64_value());
|
||||
self.multifield_postings.suscribe(doc_id, &term);
|
||||
}
|
||||
}
|
||||
}
|
||||
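The match above now covers text, u64, and i64 values. Below is a hedged sketch of a document exercising all three arms, based on the `doc!` usage in the tests of this changeset; the field names (`title_field`, `count_field`, `delta_field`) are illustrative assumptions, not names from this diff.

    // Sketch only: index one document carrying a tokenized text field,
    // an indexed u64 field and an indexed i64 field.
    let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
    let delta: i64 = -3;
    index_writer.add_document(doc!(
        title_field => "hello tantivy",
        count_field => 7u64,
        delta_field => delta
    ));
    index_writer.commit().unwrap();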
@@ -204,28 +187,27 @@ impl<'a> SegmentWriter<'a> {
|
||||
}
|
||||
|
||||
// This method is used as a trick to workaround the borrow checker
|
||||
fn write<'a>(per_field_postings_writers: &[Box<PostingsWriter + 'a>],
|
||||
fast_field_writers: &U32FastFieldsWriter,
|
||||
fieldnorms_writer: &U32FastFieldsWriter,
|
||||
mut serializer: SegmentSerializer,
|
||||
heap: &'a Heap,) -> Result<()> {
|
||||
for per_field_postings_writer in per_field_postings_writers {
|
||||
try!(per_field_postings_writer.serialize(serializer.get_postings_serializer(), heap));
|
||||
}
|
||||
fn write<'a>(
|
||||
multifield_postings: &MultiFieldPostingsWriter,
|
||||
fast_field_writers: &FastFieldsWriter,
|
||||
fieldnorms_writer: &FastFieldsWriter,
|
||||
mut serializer: SegmentSerializer) -> Result<()> {
|
||||
|
||||
try!(multifield_postings.serialize(serializer.get_postings_serializer()));
|
||||
try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
|
||||
try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
|
||||
try!(serializer.close());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl<'a> SerializableSegment for SegmentWriter<'a> {
|
||||
fn write(&self, serializer: SegmentSerializer) -> Result<u32> {
|
||||
let max_doc = self.max_doc;
|
||||
write(&self.per_field_postings_writers,
|
||||
write(&self.multifield_postings,
|
||||
&self.fast_field_writers,
|
||||
&self.fieldnorms_writer,
|
||||
serializer,
|
||||
self.heap)?;
|
||||
serializer)?;
|
||||
Ok(max_doc)
|
||||
}
|
||||
}
|
||||
|
||||
101
src/lib.rs
@@ -9,6 +9,7 @@
|
||||
|
||||
#![cfg_attr(test, feature(test))]
|
||||
#![cfg_attr(test, feature(step_by))]
|
||||
|
||||
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
|
||||
|
||||
#![warn(missing_docs)]
|
||||
@@ -24,6 +25,9 @@
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
|
||||
#[macro_use]
|
||||
extern crate serde_derive;
|
||||
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
|
||||
@@ -34,10 +38,11 @@ extern crate byteorder;
|
||||
extern crate memmap;
|
||||
extern crate regex;
|
||||
extern crate tempfile;
|
||||
extern crate rustc_serialize;
|
||||
extern crate atomicwrites;
|
||||
extern crate tempdir;
|
||||
extern crate serde;
|
||||
extern crate bincode;
|
||||
extern crate serde_json;
|
||||
extern crate time;
|
||||
extern crate lz4;
|
||||
extern crate uuid;
|
||||
@@ -91,13 +96,12 @@ pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
mod core;
|
||||
mod compression;
|
||||
mod fastfield;
|
||||
mod store;
|
||||
pub mod store;
|
||||
mod indexer;
|
||||
mod common;
|
||||
pub mod common;
|
||||
mod error;
|
||||
mod analyzer;
|
||||
mod datastruct;
|
||||
pub mod datastruct;
|
||||
|
||||
|
||||
|
||||
@@ -112,15 +116,16 @@ pub mod postings;
|
||||
/// Schema
|
||||
pub mod schema;
|
||||
|
||||
pub mod fastfield;
|
||||
|
||||
|
||||
pub use directory::Directory;
|
||||
pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher};
|
||||
pub use core::{Index, Segment, SegmentComponent, SegmentId, SegmentMeta, Searcher};
|
||||
pub use indexer::IndexWriter;
|
||||
pub use schema::{Term, Document};
|
||||
pub use core::SegmentReader;
|
||||
pub use self::common::TimerTree;
|
||||
|
||||
|
||||
pub use postings::DocSet;
|
||||
pub use postings::Postings;
|
||||
pub use postings::SegmentPostingsOption;
|
||||
@@ -200,8 +205,10 @@ mod tests {
|
||||
use schema::*;
|
||||
use DocSet;
|
||||
use IndexWriter;
|
||||
use fastfield::{FastFieldReader, U64FastFieldReader, I64FastFieldReader};
|
||||
use Postings;
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
@@ -437,26 +444,50 @@ mod tests {
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_indexed_u32() {
|
||||
fn test_indexed_u64() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_u32_field("text", U32_INDEXED);
|
||||
let field = schema_builder.add_u64_field("value", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.add_document(
|
||||
doc!(field=>1)
|
||||
doc!(field=>1u64)
|
||||
);
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let term = Term::from_field_u32(field, 1u32);
|
||||
let term = Term::from_field_u64(field, 1u64);
|
||||
let mut postings = searcher.segment_reader(0).read_postings(&term, SegmentPostingsOption::NoFreq).unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_indexed_i64() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let value_field = schema_builder.add_i64_field("value", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
let negative_val = -1i64;
|
||||
index_writer.add_document(
|
||||
doc!(value_field => negative_val)
|
||||
);
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let term = Term::from_field_i64(value_field, negative_val);
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
.read_postings(&term, SegmentPostingsOption::NoFreq).unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_postings2() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
@@ -630,4 +661,52 @@ mod tests {
|
||||
assert_eq!(values.len(), 1);
|
||||
assert_eq!(values[0].text(), "short");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wrong_fast_field_type() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
|
||||
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let stored_int_field = schema_builder.add_u64_field("text", INT_STORED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
|
||||
{
|
||||
let document = doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64);
|
||||
index_writer.add_document(document);
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(text_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(stored_int_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.get_fast_field_reader::<U64FastFieldReader>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
}
|
||||
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.get_fast_field_reader::<I64FastFieldReader>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,21 +17,21 @@ mod docset;
|
||||
mod segment_postings_option;
|
||||
|
||||
pub use self::docset::{SkipResult, DocSet};
|
||||
pub use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
|
||||
use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
|
||||
pub use self::serializer::PostingsSerializer;
|
||||
pub use self::postings_writer::PostingsWriter;
|
||||
pub use self::postings_writer::SpecializedPostingsWriter;
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
pub use self::term_info::TermInfo;
|
||||
pub use self::postings::Postings;
|
||||
|
||||
#[cfg(test)]
|
||||
pub use self::vec_postings::VecPostings;
|
||||
pub use self::segment_postings::SegmentPostings;
|
||||
pub use self::segment_postings::{SegmentPostings, BlockSegmentPostings};
|
||||
pub use self::intersection::IntersectionDocSet;
|
||||
pub use self::freq_handler::FreqHandler;
|
||||
pub use self::segment_postings_option::SegmentPostingsOption;
|
||||
pub use common::HasLen;
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -43,6 +43,7 @@ mod tests {
|
||||
use core::Index;
|
||||
use std::iter;
|
||||
use datastruct::stacker::Heap;
|
||||
use fastfield::FastFieldReader;
|
||||
use query::TermQuery;
|
||||
use schema::Field;
|
||||
use test::Bencher;
|
||||
@@ -58,8 +59,8 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut segment = index.new_segment();
|
||||
let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap();
|
||||
let term = Term::from_field_text(text_field, "abc");
|
||||
posting_serializer.new_term(&term).unwrap();
|
||||
posting_serializer.new_field(text_field);
|
||||
posting_serializer.new_term("abc".as_bytes()).unwrap();
|
||||
for doc_id in 0u32..3u32 {
|
||||
let positions = vec!(1,2,3,2);
|
||||
posting_serializer.write_doc(doc_id, 2, &positions).unwrap();
|
||||
@@ -67,7 +68,7 @@ mod tests {
|
||||
posting_serializer.close_term().unwrap();
|
||||
posting_serializer.close().unwrap();
|
||||
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
|
||||
assert_eq!(read.len(), 13);
|
||||
assert!(read.len() <= 16);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -119,7 +120,7 @@ mod tests {
|
||||
assert_eq!(fieldnorm_reader.get(0), 8 + 5);
|
||||
assert_eq!(fieldnorm_reader.get(1), 2);
|
||||
for i in 2 .. 1000 {
|
||||
assert_eq!(fieldnorm_reader.get(i), i + 1);
|
||||
assert_eq!(fieldnorm_reader.get(i), (i + 1) as u64);
|
||||
}
|
||||
}
|
||||
{
|
||||
@@ -266,16 +267,26 @@ mod tests {
|
||||
};
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_block_segment_postings(b: &mut Bencher) {
|
||||
let searcher = INDEX.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
b.iter(|| {
|
||||
let mut block_segment_postings = segment_reader.read_block_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
|
||||
while block_segment_postings.advance() {}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_segment_postings(b: &mut Bencher) {
|
||||
let searcher = INDEX.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
b.iter(|| {
|
||||
let mut segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
|
||||
while segment_postings.advance() {}
|
||||
let mut block_segment_postings = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap();
|
||||
while block_segment_postings.advance() {}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[bench]
|
||||
fn bench_segment_intersection(b: &mut Bencher) {
|
||||
|
||||
@@ -5,9 +5,129 @@ use postings::PostingsSerializer;
|
||||
use std::io;
|
||||
use postings::Recorder;
|
||||
use analyzer::SimpleTokenizer;
|
||||
use schema::Field;
|
||||
use Result;
|
||||
use schema::{Schema, Field};
|
||||
use analyzer::StreamingIterator;
|
||||
use std::marker::PhantomData;
|
||||
use schema::extract_field_from_term_bytes;
|
||||
use std::ops::DerefMut;
|
||||
use datastruct::stacker::{HashMap, Heap};
|
||||
use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use schema::TextIndexingOptions;
|
||||
|
||||
fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box<PostingsWriter + 'a> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
match text_options.get_indexing_options() {
|
||||
TextIndexingOptions::TokenizedWithFreq => {
|
||||
SpecializedPostingsWriter::<TermFrequencyRecorder>::new_boxed(heap)
|
||||
}
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => {
|
||||
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed(heap)
|
||||
}
|
||||
_ => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
FieldType::I64(_) => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct MultiFieldPostingsWriter<'a> {
|
||||
heap: &'a Heap,
|
||||
term_index: HashMap<'a>,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
|
||||
}
|
||||
|
||||
impl<'a> MultiFieldPostingsWriter<'a> {
|
||||
|
||||
/// Create a new `MultiFieldPostingsWriter` given
|
||||
/// a schema and a heap.
|
||||
pub fn new(schema: &Schema, heap: &'a Heap) -> MultiFieldPostingsWriter<'a> {
|
||||
let capacity = heap.capacity();
|
||||
let hashmap_size = hashmap_size_in_bits(capacity);
|
||||
let term_index = HashMap::new(hashmap_size, heap);
|
||||
|
||||
let mut per_field_postings_writers: Vec<_> = vec!();
|
||||
for field_entry in schema.fields() {
|
||||
let field_entry = posting_from_field_entry(&field_entry, heap);
|
||||
per_field_postings_writers.push(field_entry);
|
||||
}
|
||||
MultiFieldPostingsWriter {
|
||||
heap: heap,
|
||||
term_index: term_index,
|
||||
per_field_postings_writers: per_field_postings_writers
|
||||
}
|
||||
}
|
||||
|
||||
pub fn index_text(&mut self,
|
||||
doc: DocId,
|
||||
field: Field,
|
||||
field_values: &[&FieldValue])
|
||||
-> u32 {
|
||||
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
|
||||
postings_writer.index_text(&mut self.term_index, doc, field, field_values, self.heap)
|
||||
}
|
||||
|
||||
pub fn suscribe(&mut self, doc: DocId, term: &Term) {
|
||||
let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
|
||||
postings_writer.suscribe(&mut self.term_index, doc, 0u32, term, self.heap)
|
||||
}
|
||||
|
||||
|
||||
/// Serialize the inverted index.
|
||||
/// It pushes all terms, one field at a time, towards the
|
||||
/// postings serializer.
|
||||
pub fn serialize(&self, serializer: &mut PostingsSerializer) -> Result<()> {
|
||||
let mut term_offsets: Vec<(&[u8], u32)> = self.term_index
|
||||
.iter()
|
||||
.collect();
|
||||
term_offsets.sort_by_key(|&(k, _v)| k);
|
||||
|
||||
let mut offsets: Vec<(Field, usize)> = vec!();
|
||||
let term_offsets_it = term_offsets
|
||||
.iter()
|
||||
.map(|&(ref key, _)| {
|
||||
extract_field_from_term_bytes(&key)
|
||||
})
|
||||
.enumerate();
|
||||
|
||||
let mut prev_field = Field(u32::max_value());
|
||||
for (offset, field) in term_offsets_it {
|
||||
if field != prev_field {
|
||||
offsets.push((field, offset));
|
||||
prev_field = field;
|
||||
}
|
||||
}
|
||||
offsets.push((Field(0), term_offsets.len()));
|
||||
for i in 0..(offsets.len() - 1) {
|
||||
let (field, start) = offsets[i];
|
||||
let (_, stop) = offsets[i+1];
|
||||
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
|
||||
postings_writer.serialize(
|
||||
field,
|
||||
&term_offsets[start..stop],
|
||||
serializer,
|
||||
self.heap)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return true iff the term dictionary is saturated.
|
||||
pub fn is_termdic_saturated(&self) -> bool {
|
||||
self.term_index.is_saturated()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// The `PostingsWriter` is in charge of receiving documents
|
||||
/// and building a `Segment` in anonymous memory.
|
||||
@@ -21,17 +141,15 @@ pub trait PostingsWriter {
|
||||
/// * term - the term
|
||||
/// * heap - heap used to store the postings informations as well as the terms
|
||||
/// in the hashmap.
|
||||
fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap);
|
||||
fn suscribe(&mut self, term_index: &mut HashMap, doc: DocId, pos: u32, term: &Term, heap: &Heap);
|
||||
|
||||
/// Serializes the postings on disk.
|
||||
/// The actual serialization format is handled by the `PostingsSerializer`.
|
||||
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
|
||||
|
||||
/// Closes all of the currently open `Recorder`'s.
|
||||
fn close(&mut self, heap: &Heap);
|
||||
|
||||
fn serialize(&self, field: Field, term_addrs: &[(&[u8], u32)], serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>;
|
||||
|
||||
/// Tokenize a text and suscribe all of its tokens.
|
||||
fn index_text<'a>(&mut self,
|
||||
term_index: &mut HashMap,
|
||||
doc_id: DocId,
|
||||
field: Field,
|
||||
field_values: &[&'a FieldValue],
|
||||
@@ -39,14 +157,15 @@ pub trait PostingsWriter {
|
||||
-> u32 {
|
||||
let mut pos = 0u32;
|
||||
let mut num_tokens: u32 = 0u32;
|
||||
let mut term = Term::allocate(field, 100);
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
for field_value in field_values {
|
||||
let mut tokens = SimpleTokenizer.tokenize(field_value.value().text());
|
||||
// right now num_tokens and pos are redundant, but it should
|
||||
// change when we get proper analyzers
|
||||
while let Some(token) = tokens.next() {
|
||||
term.set_text(token);
|
||||
self.suscribe(doc_id, pos, &term, heap);
|
||||
self.suscribe(term_index, doc_id, pos, &term, heap);
|
||||
pos += 1u32;
|
||||
num_tokens += 1u32;
|
||||
}
|
||||
@@ -61,7 +180,8 @@ pub trait PostingsWriter {
|
||||
/// The `SpecializedPostingsWriter` is just here to remove dynamic
|
||||
/// dispatch to the recorder information.
|
||||
pub struct SpecializedPostingsWriter<'a, Rec: Recorder + 'static> {
|
||||
term_index: HashMap<'a, Rec>,
|
||||
heap: &'a Heap,
|
||||
_recorder_type: PhantomData<Rec>,
|
||||
}
|
||||
|
||||
/// Given a `Heap` size, computes a relevant size for the `HashMap`.
|
||||
@@ -81,9 +201,10 @@ fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
|
||||
impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
/// constructor
|
||||
pub fn new(heap: &'a Heap) -> SpecializedPostingsWriter<'a, Rec> {
|
||||
let capacity = heap.capacity();
|
||||
let hashmap_size = hashmap_size_in_bits(capacity);
|
||||
SpecializedPostingsWriter { term_index: HashMap::new(hashmap_size, heap) }
|
||||
SpecializedPostingsWriter {
|
||||
heap: heap,
|
||||
_recorder_type: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
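The body of `hashmap_size_in_bits` is not shown in this changeset. The following is only a hypothetical sketch of such a heuristic; the per-term byte budget and the rounding rule are assumptions, not tantivy's actual constants.

    // Hypothetical sketch: budget a fixed number of bytes per term entry,
    // then express the bucket count as a number of bits (a power of two).
    fn hashmap_size_in_bits(heap_capacity: u32) -> usize {
        let assumed_bytes_per_term = 100u32; // assumption
        let target_buckets = (heap_capacity / assumed_bytes_per_term).max(1);
        // smallest number of bits whose power of two covers target_buckets
        (32 - target_buckets.leading_zeros()) as usize
    }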
/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
|
||||
@@ -93,16 +214,9 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
|
||||
}
|
||||
|
||||
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
|
||||
fn close(&mut self, heap: &Heap) {
|
||||
for recorder in self.term_index.values_mut() {
|
||||
recorder.close_doc(heap);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
fn suscribe(&mut self, doc: DocId, position: u32, term: &Term, heap: &Heap) {
|
||||
let mut recorder = self.term_index.get_or_create(term);
|
||||
|
||||
fn suscribe(&mut self, term_index: &mut HashMap, doc: DocId, position: u32, term: &Term, heap: &Heap) {
|
||||
let recorder: &mut Rec = term_index.get_or_create(term);
|
||||
let current_doc = recorder.current_doc();
|
||||
if current_doc != doc {
|
||||
if current_doc != u32::max_value() {
|
||||
@@ -113,19 +227,18 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'
|
||||
recorder.record_position(position, heap);
|
||||
}
|
||||
|
||||
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()> {
|
||||
let mut term_offsets: Vec<(&[u8], (u32, &Rec))> = self.term_index
|
||||
.iter()
|
||||
.collect();
|
||||
term_offsets.sort_by_key(|&(k, _v)| k);
|
||||
let mut term = Term::allocate(Field(0), 100);
|
||||
for (term_bytes, (addr, recorder)) in term_offsets {
|
||||
// sadly we are required to copy the data
|
||||
term.set_content(term_bytes);
|
||||
try!(serializer.new_term(&term));
|
||||
fn serialize(&self,
|
||||
field: Field,
|
||||
term_addrs: &[(&[u8], u32)],
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap) -> io::Result<()> {
|
||||
serializer.new_field(field);
|
||||
for &(term_bytes, addr) in term_addrs {
|
||||
let recorder: &mut Rec = self.heap.get_mut_ref(addr);
|
||||
try!(serializer.new_term(&term_bytes));
|
||||
try!(recorder.serialize(addr, serializer, heap));
|
||||
try!(serializer.close_term());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -76,6 +76,7 @@ impl Recorder for NothingRecorder {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Recorder encoding document ids, and term frequencies
|
||||
#[repr(C, packed)]
|
||||
pub struct TermFrequencyRecorder {
|
||||
@@ -95,6 +96,7 @@ impl HeapAllocable for TermFrequencyRecorder {
|
||||
}
|
||||
|
||||
impl Recorder for TermFrequencyRecorder {
|
||||
|
||||
fn current_doc(&self) -> DocId {
|
||||
self.current_doc
|
||||
}
|
||||
@@ -114,22 +116,28 @@ impl Recorder for TermFrequencyRecorder {
|
||||
self.current_tf = 0;
|
||||
}
|
||||
|
||||
|
||||
fn serialize(&self,
|
||||
self_addr: u32,
|
||||
serializer: &mut PostingsSerializer,
|
||||
heap: &Heap)
|
||||
-> io::Result<()> {
|
||||
let mut doc_iter = self.stack.iter(self_addr, heap);
|
||||
|
||||
// the last document has not been closed...
|
||||
// its term freq is self.current_tf.
|
||||
let mut doc_iter = self.stack
|
||||
.iter(self_addr, heap)
|
||||
.chain(Some(self.current_tf).into_iter());
|
||||
|
||||
loop {
|
||||
if let Some(doc) = doc_iter.next() {
|
||||
if let Some(term_freq) = doc_iter.next() {
|
||||
try!(serializer.write_doc(doc, term_freq, &EMPTY_ARRAY));
|
||||
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
break;
|
||||
return Ok(());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -188,7 +196,8 @@ impl Recorder for TFAndPositionRecorder {
|
||||
}
|
||||
}
|
||||
None => {
|
||||
panic!("This should never happen. Pleasee report the bug.");
|
||||
// the last document has not been closed...
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,11 +2,93 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder};
|
||||
use DocId;
|
||||
use postings::{Postings, FreqHandler, DocSet, HasLen};
|
||||
use std::num::Wrapping;
|
||||
use fastfield::delete::DeleteBitSet;
|
||||
use fastfield::DeleteBitSet;
|
||||
|
||||
|
||||
const EMPTY_DATA: [u8; 0] = [0u8; 0];
|
||||
|
||||
|
||||
pub struct BlockSegmentPostings<'a> {
|
||||
num_binpacked_blocks: usize,
|
||||
num_vint_docs: usize,
|
||||
block_decoder: BlockDecoder,
|
||||
freq_handler: FreqHandler,
|
||||
remaining_data: &'a [u8],
|
||||
doc_offset: DocId,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl<'a> BlockSegmentPostings<'a> {
|
||||
|
||||
pub fn from_data(len: usize, data: &'a [u8], freq_handler: FreqHandler) -> BlockSegmentPostings<'a> {
|
||||
let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK;
|
||||
let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
|
||||
BlockSegmentPostings {
|
||||
num_binpacked_blocks: num_binpacked_blocks,
|
||||
num_vint_docs: num_vint_docs,
|
||||
block_decoder: BlockDecoder::new(),
|
||||
freq_handler: freq_handler,
|
||||
remaining_data: data,
|
||||
doc_offset: 0,
|
||||
len: len,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reset(&mut self, len: usize, data: &'a [u8]) {
|
||||
let num_binpacked_blocks: usize = (len as usize) / NUM_DOCS_PER_BLOCK;
|
||||
let num_vint_docs = (len as usize) - NUM_DOCS_PER_BLOCK * num_binpacked_blocks;
|
||||
self.num_binpacked_blocks = num_binpacked_blocks;
|
||||
self.num_vint_docs = num_vint_docs;
|
||||
self.remaining_data = data;
|
||||
self.doc_offset = 0;
|
||||
self.len = len;
|
||||
}
|
||||
|
||||
pub fn docs(&self) -> &[DocId] {
|
||||
self.block_decoder.output_array()
|
||||
}
|
||||
|
||||
pub fn freq_handler(&self) -> &FreqHandler {
|
||||
&self.freq_handler
|
||||
}
|
||||
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.num_binpacked_blocks > 0 {
|
||||
self.remaining_data = self.block_decoder.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
|
||||
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
self.num_binpacked_blocks -= 1;
|
||||
true
|
||||
}
|
||||
else {
|
||||
if self.num_vint_docs > 0 {
|
||||
self.remaining_data = self.block_decoder.uncompress_vint_sorted(self.remaining_data, self.doc_offset, self.num_vint_docs);
|
||||
self.freq_handler.read_freq_vint(self.remaining_data, self.num_vint_docs);
|
||||
self.num_vint_docs = 0;
|
||||
true
|
||||
}
|
||||
else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> BlockSegmentPostings<'static> {
|
||||
BlockSegmentPostings {
|
||||
num_binpacked_blocks: 0,
|
||||
num_vint_docs: 0,
|
||||
block_decoder: BlockDecoder::new(),
|
||||
freq_handler: FreqHandler::new_without_freq(),
|
||||
remaining_data: &EMPTY_DATA,
|
||||
doc_offset: 0,
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
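A short usage sketch of the new block-level cursor, modeled on the `bench_block_segment_postings` benchmark earlier in this diff; the searcher and term setup are assumed.

    // Sketch only: iterate a posting list one block of doc ids at a time.
    let segment_reader = searcher.segment_reader(0);
    let mut block_postings = segment_reader
        .read_block_postings(&term, SegmentPostingsOption::NoFreq)
        .unwrap();
    while block_postings.advance() {
        let _docs: &[DocId] = block_postings.docs(); // one decoded block per advance()
        // process the whole block here
    }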
/// `SegmentPostings` represents the inverted list or postings associated to
|
||||
/// a term in a `Segment`.
|
||||
///
|
||||
@@ -14,28 +96,14 @@ const EMPTY_DATA: [u8; 0] = [0u8; 0];
|
||||
/// Positions on the other hand, are optionally entirely decoded upfront.
|
||||
pub struct SegmentPostings<'a> {
|
||||
len: usize,
|
||||
doc_offset: u32,
|
||||
block_decoder: BlockDecoder,
|
||||
freq_handler: FreqHandler,
|
||||
remaining_data: &'a [u8],
|
||||
cur: Wrapping<usize>,
|
||||
block_cursor: BlockSegmentPostings<'a>,
|
||||
cur_block_len: usize,
|
||||
delete_bitset: DeleteBitSet,
|
||||
}
|
||||
|
||||
impl<'a> SegmentPostings<'a> {
|
||||
fn load_next_block(&mut self) {
|
||||
let num_remaining_docs = self.len - self.cur.0;
|
||||
if num_remaining_docs >= NUM_DOCS_PER_BLOCK {
|
||||
self.remaining_data = self.block_decoder
|
||||
.uncompress_block_sorted(self.remaining_data, self.doc_offset);
|
||||
self.remaining_data = self.freq_handler.read_freq_block(self.remaining_data);
|
||||
self.doc_offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
} else {
|
||||
self.remaining_data = self.block_decoder
|
||||
.uncompress_vint_sorted(self.remaining_data, self.doc_offset, num_remaining_docs);
|
||||
self.freq_handler.read_freq_vint(self.remaining_data, num_remaining_docs);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Reads a Segment postings from an &[u8]
|
||||
///
|
||||
@@ -43,39 +111,29 @@ impl<'a> SegmentPostings<'a> {
|
||||
/// * `data` - data array. The complete data is not necessarily used.
|
||||
/// * `freq_handler` - the freq handler is in charge of decoding
|
||||
/// frequencies and/or positions
|
||||
pub fn from_data(len: u32,
|
||||
data: &'a [u8],
|
||||
delete_bitset: &'a DeleteBitSet,
|
||||
freq_handler: FreqHandler) -> SegmentPostings<'a> {
|
||||
pub fn from_block_postings(
|
||||
segment_block_postings: BlockSegmentPostings<'a>,
|
||||
delete_bitset: DeleteBitSet) -> SegmentPostings<'a> {
|
||||
SegmentPostings {
|
||||
len: len as usize,
|
||||
doc_offset: 0,
|
||||
block_decoder: BlockDecoder::new(),
|
||||
freq_handler: freq_handler,
|
||||
remaining_data: data,
|
||||
len: segment_block_postings.len,
|
||||
block_cursor: segment_block_postings,
|
||||
cur: Wrapping(usize::max_value()),
|
||||
delete_bitset: delete_bitset.clone(),
|
||||
cur_block_len: 0,
|
||||
delete_bitset: delete_bitset,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns an empty segment postings object
|
||||
pub fn empty() -> SegmentPostings<'static> {
|
||||
let empty_block_cursor = BlockSegmentPostings::empty();
|
||||
SegmentPostings {
|
||||
len: 0,
|
||||
doc_offset: 0,
|
||||
block_decoder: BlockDecoder::new(),
|
||||
freq_handler: FreqHandler::new_without_freq(),
|
||||
remaining_data: &EMPTY_DATA,
|
||||
block_cursor: empty_block_cursor,
|
||||
delete_bitset: DeleteBitSet::empty(),
|
||||
cur: Wrapping(usize::max_value()),
|
||||
cur_block_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Index within a block is used as an address when
|
||||
/// interacting with the `FreqHandler`
|
||||
fn index_within_block(&self) -> usize {
|
||||
self.cur.0 % NUM_DOCS_PER_BLOCK
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -84,13 +142,16 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self) -> bool {
|
||||
loop {
|
||||
loop {
|
||||
self.cur += Wrapping(1);
|
||||
if self.cur.0 >= self.len {
|
||||
return false;
|
||||
}
|
||||
if self.index_within_block() == 0 {
|
||||
self.load_next_block();
|
||||
if self.cur.0 == self.cur_block_len {
|
||||
self.cur = Wrapping(0);
|
||||
if !self.block_cursor.advance() {
|
||||
self.cur_block_len = 0;
|
||||
self.cur = Wrapping(usize::max_value());
|
||||
return false;
|
||||
}
|
||||
self.cur_block_len = self.block_cursor.docs().len();
|
||||
}
|
||||
if !self.delete_bitset.is_deleted(self.doc()) {
|
||||
return true;
|
||||
@@ -100,7 +161,7 @@ impl<'a> DocSet for SegmentPostings<'a> {
|
||||
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
self.block_decoder.output(self.index_within_block())
|
||||
self.block_cursor.docs()[self.cur.0]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -112,10 +173,10 @@ impl<'a> HasLen for SegmentPostings<'a> {
|
||||
|
||||
impl<'a> Postings for SegmentPostings<'a> {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.freq_handler.freq(self.index_within_block())
|
||||
self.block_cursor.freq_handler().freq(self.cur.0)
|
||||
}
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
self.freq_handler.positions(self.index_within_block())
|
||||
self.block_cursor.freq_handler().positions(self.cur.0)
|
||||
}
|
||||
}
|
||||
|
||||
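For comparison, the document-at-a-time `SegmentPostings` API wrapped around the block cursor above is used as in the tests of this changeset (searcher and field setup assumed):

    // Sketch only: read the postings of a single term and walk it as a DocSet.
    let term = Term::from_field_u64(value_field, 1u64);
    let mut postings = searcher
        .segment_reader(0)
        .read_postings(&term, SegmentPostingsOption::NoFreq)
        .unwrap();
    while postings.advance() {
        let _doc: DocId = postings.doc();
        // term_freq() and positions() become available when frequencies and
        // positions are requested instead of NoFreq.
    }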
@@ -1,7 +1,6 @@
|
||||
use Result;
|
||||
use datastruct::FstMapBuilder;
|
||||
use datastruct::TermDictionaryBuilder;
|
||||
use super::TermInfo;
|
||||
use schema::Term;
|
||||
use schema::Field;
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
@@ -30,7 +29,7 @@ use common::BinarySerializable;
|
||||
///
|
||||
/// The serializer expects to receive the following calls
|
||||
/// in this order :
|
||||
///
|
||||
/// * `set_field(...)`
|
||||
/// * `new_term(...)`
|
||||
/// * `write_doc(...)`
|
||||
/// * `write_doc(...)`
|
||||
@@ -41,6 +40,8 @@ use common::BinarySerializable;
|
||||
/// * `write_doc(...)`
|
||||
/// * ...
|
||||
/// * `close_term()`
|
||||
/// * `set_field(...)`
|
||||
/// * ...
|
||||
/// * `close()`
|
||||
///
|
||||
/// Terms have to be pushed in a lexicographically-sorted order.
|
||||
@@ -49,7 +50,7 @@ use common::BinarySerializable;
|
||||
/// A description of the serialization format is
|
||||
/// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html).
|
||||
pub struct PostingsSerializer {
|
||||
terms_fst_builder: FstMapBuilder<WritePtr, TermInfo>,
|
||||
terms_fst_builder: TermDictionaryBuilder<WritePtr, TermInfo>,
|
||||
postings_write: WritePtr,
|
||||
positions_write: WritePtr,
|
||||
written_bytes_postings: usize,
|
||||
@@ -73,7 +74,7 @@ impl PostingsSerializer {
|
||||
positions_write: WritePtr,
|
||||
schema: Schema)
|
||||
-> Result<PostingsSerializer> {
|
||||
let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
|
||||
let terms_fst_builder = TermDictionaryBuilder::new(terms_write)?;
|
||||
Ok(PostingsSerializer {
|
||||
terms_fst_builder: terms_fst_builder,
|
||||
postings_write: postings_write,
|
||||
@@ -105,17 +106,28 @@ impl PostingsSerializer {
|
||||
segment.schema())
|
||||
}
|
||||
|
||||
fn load_indexing_options(&mut self, field: Field) {
|
||||
/// Must be called before starting pushing terms of
|
||||
/// a given field.
|
||||
///
|
||||
/// Loads the indexing options for the given field.
|
||||
pub fn new_field(&mut self, field: Field) {
|
||||
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
|
||||
self.text_indexing_options = match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options.get_indexing_options(),
|
||||
FieldType::U32(ref u32_options) => {
|
||||
if u32_options.is_indexed() {
|
||||
FieldType::U64(ref int_options) => {
|
||||
if int_options.is_indexed() {
|
||||
TextIndexingOptions::Unindexed
|
||||
} else {
|
||||
TextIndexingOptions::Untokenized
|
||||
}
|
||||
}
|
||||
FieldType::I64(ref int_options) => {
|
||||
if int_options.is_indexed() {
|
||||
TextIndexingOptions::Unindexed
|
||||
} else {
|
||||
TextIndexingOptions::Untokenized
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@@ -123,12 +135,11 @@ impl PostingsSerializer {
|
||||
/// * term - the term. It needs to come after the previous term according
|
||||
/// to the lexicographical order.
|
||||
/// * doc_freq - return the number of document containing the term.
|
||||
pub fn new_term(&mut self, term: &Term) -> io::Result<()> {
|
||||
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
|
||||
if self.term_open {
|
||||
panic!("Called new_term, while the previous term was not closed.");
|
||||
}
|
||||
self.term_open = true;
|
||||
self.load_indexing_options(term.field());
|
||||
self.doc_ids.clear();
|
||||
self.last_doc_id_encoded = 0;
|
||||
self.term_freqs.clear();
|
||||
@@ -138,7 +149,7 @@ impl PostingsSerializer {
|
||||
postings_offset: self.written_bytes_postings as u32,
|
||||
positions_offset: self.written_bytes_positions as u32,
|
||||
};
|
||||
self.terms_fst_builder.insert_key(term.as_slice())
|
||||
self.terms_fst_builder.insert_key(term)
|
||||
}
|
||||
|
||||
/// Finish the serialization for this term postings.
|
||||
|
||||
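A minimal sketch of the serializer call protocol after this change, mirroring the serializer test earlier in this diff (segment and schema setup assumed):

    let mut serializer = PostingsSerializer::open(&mut segment).unwrap();
    serializer.new_field(text_field);               // once per field, before its terms
    serializer.new_term("abc".as_bytes()).unwrap(); // terms in lexicographic order
    for doc_id in 0u32..3u32 {
        let positions = vec![1, 2, 3, 2];
        serializer.write_doc(doc_id, 2, &positions).unwrap();
    }
    serializer.close_term().unwrap();
    serializer.close().unwrap();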
@@ -23,7 +23,7 @@ mod tests {
|
||||
use collector::tests::TestCollector;
|
||||
use Index;
|
||||
use schema::*;
|
||||
use fastfield::{U32FastFieldReader};
|
||||
use fastfield::{U64FastFieldReader};
|
||||
use postings::SegmentPostingsOption;
|
||||
|
||||
fn abs_diff(left: f32, right: f32) -> f32 {
|
||||
@@ -102,7 +102,7 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d")),]);
|
||||
assert_eq!(matching_docs(&boolean_query), Vec::new());
|
||||
assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -111,7 +111,7 @@ mod tests {
|
||||
let occurs = vec!(Occur::Should, Occur::Should);
|
||||
let occur_filter = OccurFilter::new(&occurs);
|
||||
|
||||
let left_fieldnorms = U32FastFieldReader::from(vec!(100,200,300));
|
||||
let left_fieldnorms = U64FastFieldReader::from(vec!(100,200,300));
|
||||
|
||||
let left = VecPostings::from(vec!(1, 2, 3));
|
||||
let left_scorer = TermScorer {
|
||||
@@ -120,7 +120,7 @@ mod tests {
|
||||
postings: left,
|
||||
};
|
||||
|
||||
let right_fieldnorms = U32FastFieldReader::from(vec!(15,25,35));
|
||||
let right_fieldnorms = U64FastFieldReader::from(vec!(15,25,35));
|
||||
let right = VecPostings::from(vec!(1, 3, 8));
|
||||
|
||||
let right_scorer = TermScorer {
|
||||
|
||||
@@ -60,11 +60,14 @@ mod tests {
|
||||
searcher.search(&phrase_query, &mut test_collector).expect("search should succeed");
|
||||
test_collector.docs()
|
||||
};
|
||||
|
||||
let empty_vec = Vec::<u32>::new();
|
||||
|
||||
assert_eq!(test_query(vec!("a", "b", "c")), vec!(2, 4));
|
||||
assert_eq!(test_query(vec!("a", "b")), vec!(1, 2, 3, 4));
|
||||
assert_eq!(test_query(vec!("b", "b")), vec!(0, 1));
|
||||
assert_eq!(test_query(vec!("g", "ewrwer")), vec!());
|
||||
assert_eq!(test_query(vec!("g", "a")), vec!());
|
||||
assert_eq!(test_query(vec!("g", "ewrwer")), empty_vec);
|
||||
assert_eq!(test_query(vec!("g", "a")), empty_vec);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -10,8 +10,21 @@ fn literal<I>(input: I) -> ParseResult<UserInputAST, I>
|
||||
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
|
||||
phrase.or(word)
|
||||
};
|
||||
let field = many1(letter());
|
||||
let term_query = (field, char(':'), term_val()).map(|(field_name, _, phrase)| {
|
||||
|
||||
let negative_numbers =
|
||||
(char('-'), many1(satisfy(|c: char| c.is_numeric())))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
|
||||
let field =
|
||||
(
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_'))
|
||||
)
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2));
|
||||
|
||||
let term_val_with_field = negative_numbers.or(term_val());
|
||||
|
||||
let term_query = (field, char(':'), term_val_with_field).map(|(field_name, _, phrase)| {
|
||||
UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase: phrase,
|
||||
|
||||
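The extended `literal` parser above accepts field names containing digits or underscores as well as negative integer values. A hedged sketch of the query strings it is meant to handle, reusing the schema of the query-parser tests further down:

    let query_parser = make_query_parser();
    assert!(query_parser.parse_query("signed:-2324").is_ok());   // negative integer value
    assert!(query_parser.parse_query("unsigned:2324").is_ok());  // plain integer value
    assert!(query_parser.parse_query("notindexed_u64:23424").is_err()); // parses, but the field is not indexed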
@@ -10,7 +10,9 @@ use postings::SegmentPostingsOption;
|
||||
use query::PhraseQuery;
|
||||
use analyzer::SimpleTokenizer;
|
||||
use analyzer::StreamingIterator;
|
||||
use schema::Term;
|
||||
use schema::{Term, FieldType};
|
||||
use std::str::FromStr;
|
||||
use std::num::ParseIntError;
|
||||
|
||||
|
||||
|
||||
@@ -22,18 +24,25 @@ pub enum QueryParserError {
|
||||
/// `FieldDoesNotExist(field_name: String)`
|
||||
/// The query references a field that is not in the schema
|
||||
FieldDoesNotExist(String),
|
||||
/// `ExpectedU32(field_name: String, field_value: String)`
|
||||
/// The query contains a term for a `u32`-field, but the value
|
||||
/// is not a u32.
|
||||
ExpectedU32(String, String),
|
||||
/// The query contains a term for a `u64`-field, but the value
|
||||
/// is not a u64.
|
||||
ExpectedInt(ParseIntError),
|
||||
/// It is forbidden queries that are only "excluding". (e.g. -title:pop)
|
||||
AllButQueryForbidden,
|
||||
/// If no default field is declared, running a query without any
|
||||
/// field specified is forbidden.
|
||||
NoDefaultFieldDeclared,
|
||||
/// The field searched for is not declared
|
||||
/// as indexed in the schema.
|
||||
FieldNotIndexed(String),
|
||||
}
|
||||
|
||||
|
||||
impl From<ParseIntError> for QueryParserError {
|
||||
fn from(err: ParseIntError) -> QueryParserError {
|
||||
QueryParserError::ExpectedInt(err)
|
||||
}
|
||||
}
|
||||
|
||||
/// Tantivy's Query parser
|
||||
///
|
||||
@@ -122,7 +131,7 @@ impl QueryParser {
|
||||
fn compute_logical_ast(&self,
|
||||
user_input_ast: UserInputAST)
|
||||
-> Result<LogicalAST, QueryParserError> {
|
||||
let (occur, ast) = try!(self.compute_logical_ast_with_occur(user_input_ast));
|
||||
let (occur, ast) = self.compute_logical_ast_with_occur(user_input_ast)?;
|
||||
if occur == Occur::MustNot {
|
||||
return Err(QueryParserError::AllButQueryForbidden);
|
||||
}
|
||||
@@ -133,25 +142,51 @@ impl QueryParser {
|
||||
field: Field,
|
||||
phrase: &str)
|
||||
-> Result<Option<LogicalLiteral>, QueryParserError> {
|
||||
let mut token_iter = self.analyzer.tokenize(phrase);
|
||||
let mut tokens: Vec<Term> = Vec::new();
|
||||
loop {
|
||||
if let Some(token) = token_iter.next() {
|
||||
let text = token.to_string();
|
||||
// TODO Handle u32
|
||||
let term = Term::from_field_text(field, &text);
|
||||
tokens.push(term);
|
||||
} else {
|
||||
break;
|
||||
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
if !field_type.is_indexed() {
|
||||
let field_name = field_entry.name().to_string();
|
||||
return Err(QueryParserError::FieldNotIndexed(field_name));
|
||||
}
|
||||
match field_type {
|
||||
&FieldType::I64(_) => {
|
||||
let val: i64 = i64::from_str(phrase)?;
|
||||
let term = Term::from_field_i64(field, val);
|
||||
return Ok(Some(LogicalLiteral::Term(term)));
|
||||
}
|
||||
&FieldType::U64(_) => {
|
||||
let val: u64 = u64::from_str(phrase)?;
|
||||
let term = Term::from_field_u64(field, val);
|
||||
return Ok(Some(LogicalLiteral::Term(term)));
|
||||
}
|
||||
&FieldType::Str(ref str_options) => {
|
||||
let mut terms: Vec<Term> = Vec::new();
|
||||
if str_options.get_indexing_options().is_tokenized() {
|
||||
let mut token_iter = self.analyzer.tokenize(phrase);
|
||||
loop {
|
||||
if let Some(token) = token_iter.next() {
|
||||
let term = Term::from_field_text(field, token);
|
||||
terms.push(term);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
terms.push(Term::from_field_text(field, phrase));
|
||||
}
|
||||
if terms.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
else if terms.len() == 1 {
|
||||
return Ok(Some(LogicalLiteral::Term(terms.into_iter().next().unwrap())))
|
||||
} else {
|
||||
return Ok(Some(LogicalLiteral::Phrase(terms)))
|
||||
}
|
||||
}
|
||||
}
|
||||
if tokens.is_empty() {
|
||||
Ok(None)
|
||||
} else if tokens.len() == 1 {
|
||||
Ok(Some(LogicalLiteral::Term(tokens.into_iter().next().unwrap())))
|
||||
} else {
|
||||
Ok(Some(LogicalLiteral::Phrase(tokens)))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
fn default_occur(&self) -> Occur {
|
||||
@@ -209,23 +244,23 @@ impl QueryParser {
|
||||
asts.push(LogicalAST::Leaf(box ast));
|
||||
}
|
||||
}
|
||||
let result_ast = if asts.len() == 0 {
|
||||
// this should never happen
|
||||
return Err(QueryParserError::SyntaxError);
|
||||
} else if asts.len() == 1 {
|
||||
asts[0].clone()
|
||||
} else {
|
||||
LogicalAST::Clause(asts.into_iter()
|
||||
.map(|ast| (Occur::Should, ast))
|
||||
.collect())
|
||||
};
|
||||
let result_ast =
|
||||
if asts.len() == 0 {
|
||||
// this should never happen
|
||||
return Err(QueryParserError::SyntaxError);
|
||||
} else if asts.len() == 1 {
|
||||
asts[0].clone()
|
||||
} else {
|
||||
LogicalAST::Clause(asts.into_iter()
|
||||
.map(|ast| (Occur::Should, ast))
|
||||
.collect())
|
||||
};
|
||||
Ok((Occur::Should, result_ast))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Compose two occur values.
|
||||
fn compose_occur(left: Occur, right: Occur) -> Occur {
|
||||
match left {
|
||||
@@ -270,16 +305,23 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use schema::{SchemaBuilder, TEXT};
|
||||
use schema::{SchemaBuilder, Term, TEXT, STRING, STORED, INT_INDEXED};
|
||||
use query::Query;
|
||||
use schema::Field;
|
||||
use super::QueryParser;
|
||||
use super::QueryParserError;
|
||||
use super::super::logical_ast::*;
|
||||
|
||||
|
||||
fn make_query_parser() -> QueryParser {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
schema_builder.add_i64_field("signed", INT_INDEXED);
|
||||
schema_builder.add_u64_field("unsigned", INT_INDEXED);
|
||||
schema_builder.add_text_field("notindexed_text", STORED);
|
||||
schema_builder.add_text_field("notindexed_u64", STORED);
|
||||
schema_builder.add_text_field("notindexed_i64", STORED);
|
||||
schema_builder.add_text_field("nottokenized", STRING);
|
||||
let schema = schema_builder.build();
|
||||
let default_fields = vec![title, text];
|
||||
QueryParser::new(schema, default_fields)
|
||||
@@ -310,47 +352,105 @@ mod test {
|
||||
let query_parser = make_query_parser();
|
||||
assert!(query_parser.parse_query("toto").is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_nonindexed_field_yields_error() {
|
||||
let query_parser = make_query_parser();
|
||||
|
||||
let is_not_indexed_err = |query: &str| {
|
||||
let result: Result<Box<Query>, QueryParserError> = query_parser.parse_query(query);
|
||||
if let Err(QueryParserError::FieldNotIndexed(field_name)) = result {
|
||||
Some(field_name.clone())
|
||||
}
|
||||
else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
is_not_indexed_err("notindexed_text:titi"),
|
||||
Some(String::from("notindexed_text"))
|
||||
);
|
||||
assert_eq!(
|
||||
is_not_indexed_err("notindexed_u64:23424"),
|
||||
Some(String::from("notindexed_u64"))
|
||||
);
|
||||
assert_eq!(
|
||||
is_not_indexed_err("notindexed_i64:-234324"),
|
||||
Some(String::from("notindexed_i64"))
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_untokenized() {
|
||||
test_parse_query_to_logical_ast_helper("nottokenized:\"wordone wordtwo\"",
|
||||
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, 101, 32, 119, 111, 114, 100, 116, 119, 111])",
|
||||
false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_ints() {
|
||||
let query_parser = make_query_parser();
|
||||
assert!(query_parser.parse_query("signed:2324").is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"22\"").is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"-2234\"").is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"-9999999999999\"").is_ok());
|
||||
assert!(query_parser.parse_query("signed:\"a\"").is_err());
|
||||
assert!(query_parser.parse_query("signed:\"2a\"").is_err());
|
||||
assert!(query_parser.parse_query("signed:\"18446744073709551615\"").is_err());
|
||||
assert!(query_parser.parse_query("unsigned:\"2\"").is_ok());
|
||||
assert!(query_parser.parse_query("unsigned:\"-2\"").is_err());
|
||||
assert!(query_parser.parse_query("unsigned:\"18446744073709551615\"").is_ok());
|
||||
test_parse_query_to_logical_ast_helper("unsigned:2324",
|
||||
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
|
||||
false);
|
||||
|
||||
test_parse_query_to_logical_ast_helper("signed:-2324",
|
||||
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
|
||||
false);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_disjunction() {
|
||||
test_parse_query_to_logical_ast_helper("title:toto",
|
||||
"Term([0, 116, 111, 116, 111])",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto",
|
||||
"Term([0, 116, 111, 116, 111])",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto -titi",
|
||||
"(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, \
|
||||
105, 116, 105]) Term([1, 116, 105, 116, 105])))",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) -(Term([0, 0, 0, 0, 116, \
|
||||
105, 116, 105]) Term([0, 0, 0, 1, 116, 105, 116, 105])))",
|
||||
false);
|
||||
assert_eq!(parse_query_to_logical_ast("-title:toto", false).err().unwrap(),
|
||||
QueryParserError::AllButQueryForbidden);
|
||||
test_parse_query_to_logical_ast_helper("title:a b",
|
||||
"(Term([0, 97]) (Term([0, 98]) Term([1, 98])))",
|
||||
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) Term([0, 0, 0, 1, 98])))",
|
||||
false);
|
||||
test_parse_query_to_logical_ast_helper("title:\"a b\"",
|
||||
"\"[Term([0, 97]), Term([0, 98])]\"",
|
||||
"\"[Term([0, 0, 0, 0, 97]), Term([0, 0, 0, 0, 98])]\"",
|
||||
false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_conjunction() {
|
||||
test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 116, 111, 116, 111])", true);
|
||||
test_parse_query_to_logical_ast_helper("title:toto", "Term([0, 0, 0, 0, 116, 111, 116, 111])", true);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto",
|
||||
"Term([0, 116, 111, 116, 111])",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
true);
|
||||
test_parse_query_to_logical_ast_helper("+title:toto -titi",
|
||||
"(+Term([0, 116, 111, 116, 111]) -(Term([0, 116, \
|
||||
105, 116, 105]) Term([1, 116, 105, 116, 105])))",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) -(Term([0, 0, 0, 0, 116, \
|
||||
105, 116, 105]) Term([0, 0, 0, 1, 116, 105, 116, 105])))",
|
||||
true);
|
||||
assert_eq!(parse_query_to_logical_ast("-title:toto", true).err().unwrap(),
|
||||
QueryParserError::AllButQueryForbidden);
|
||||
test_parse_query_to_logical_ast_helper("title:a b",
|
||||
"(+Term([0, 97]) +(Term([0, 98]) Term([1, 98])))",
|
||||
"(+Term([0, 0, 0, 0, 97]) +(Term([0, 0, 0, 0, 98]) Term([0, 0, 0, 1, 98])))",
|
||||
true);
|
||||
test_parse_query_to_logical_ast_helper("title:\"a b\"",
|
||||
"\"[Term([0, 97]), Term([0, 98])]\"",
|
||||
"\"[Term([0, 0, 0, 0, 97]), Term([0, 0, 0, 0, 98])]\"",
|
||||
true);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,10 +30,8 @@ impl<'a> Scorer for Box<Scorer + 'a> {
|
||||
}
|
||||
|
||||
fn collect(&mut self, collector: &mut Collector) {
|
||||
let scorer = self.deref_mut();
|
||||
while scorer.advance() {
|
||||
collector.collect(scorer.doc(), scorer.score());
|
||||
}
|
||||
let scorer: &mut Scorer = self.deref_mut();
|
||||
scorer.collect(collector);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,11 +14,12 @@ mod tests {
|
||||
use query::Scorer;
|
||||
use query::term_query::TermScorer;
|
||||
use query::Query;
|
||||
use fastfield::U32FastFieldReader;
|
||||
use fastfield::U64FastFieldReader;
|
||||
use query::TermQuery;
|
||||
use Index;
|
||||
use schema::*;
|
||||
use postings::SegmentPostingsOption;
|
||||
use fastfield::FastFieldReader;
|
||||
|
||||
fn abs_diff(left: f32, right: f32) -> f32 {
|
||||
(right - left).abs()
|
||||
@@ -55,7 +56,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_term_scorer() {
|
||||
let left_fieldnorms = U32FastFieldReader::from(vec!(10, 4));
|
||||
let left_fieldnorms = U64FastFieldReader::from(vec!(10, 4));
|
||||
assert_eq!(left_fieldnorms.get(0), 10);
|
||||
assert_eq!(left_fieldnorms.get(1), 4);
|
||||
let left = VecPostings::from(vec!(1));
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
use Score;
|
||||
use DocId;
|
||||
use fastfield::U32FastFieldReader;
|
||||
use fastfield::U64FastFieldReader;
|
||||
use postings::DocSet;
|
||||
use query::Scorer;
|
||||
use postings::Postings;
|
||||
use fastfield::FastFieldReader;
|
||||
|
||||
pub struct TermScorer<TPostings> where TPostings: Postings {
|
||||
pub idf: Score,
|
||||
pub fieldnorm_reader_opt: Option<U32FastFieldReader>,
|
||||
pub fieldnorm_reader_opt: Option<U64FastFieldReader>,
|
||||
pub postings: TPostings,
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ use itertools::Itertools;
|
||||
|
||||
/// Documents are really just a list of `(field, value)` couples.
|
||||
/// In this list, one field may appear more than once.
|
||||
#[derive(Debug, RustcEncodable, RustcDecodable, Default)]
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
pub struct Document {
|
||||
field_values: Vec<FieldValue>,
|
||||
}
|
||||
@@ -52,9 +52,14 @@ impl Document {
|
||||
self.add(FieldValue::new(field, value));
|
||||
}
|
||||
|
||||
/// Add a u32 field
|
||||
pub fn add_u32(&mut self, field: Field, value: u32) {
|
||||
self.add(FieldValue::new(field, Value::U32(value)));
|
||||
/// Add a u64 field
|
||||
pub fn add_u64(&mut self, field: Field, value: u64) {
|
||||
self.add(FieldValue::new(field, Value::U64(value)));
|
||||
}
|
||||
|
||||
/// Add an i64 field
|
||||
pub fn add_i64(&mut self, field: Field, value: i64) {
|
||||
self.add(FieldValue::new(field, Value::I64(value)));
|
||||
}
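A minimal sketch of how the two setters above are meant to be used together with the new integer field API; the `count` and `delta` field names are made up for this example:

```
use tantivy::schema::*;

let mut schema_builder = SchemaBuilder::default();
let count = schema_builder.add_u64_field("count", INT_INDEXED);
let delta = schema_builder.add_i64_field("delta", INT_INDEXED);
let _schema = schema_builder.build();

// one document may carry both unsigned and signed integer values
let mut doc = Document::default();
doc.add_u64(count, 4u64);
doc.add_i64(delta, -3i64);
```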
|
||||
|
||||
/// Add a field value
|
||||
|
||||
@@ -10,8 +10,8 @@ use common::BinarySerializable;
|
||||
///
|
||||
/// The field id is a `u32`, so tantivy is no longer limited to 255 fields.
|
||||
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, RustcEncodable, RustcDecodable)]
|
||||
pub struct Field(pub u8);
|
||||
#[derive(Copy, Clone, Debug, PartialEq,PartialOrd,Eq,Ord,Hash, Serialize, Deserialize)]
|
||||
pub struct Field(pub u32);
|
||||
|
||||
impl BinarySerializable for Field {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
@@ -19,7 +19,7 @@ impl BinarySerializable for Field {
|
||||
}
|
||||
|
||||
fn deserialize(reader: &mut Read) -> io::Result<Field> {
|
||||
u8::deserialize(reader).map(Field)
|
||||
u32::deserialize(reader).map(Field)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use schema::TextOptions;
|
||||
use schema::U32Options;
|
||||
use schema::IntOptions;
|
||||
|
||||
use rustc_serialize::Decodable;
|
||||
use rustc_serialize::Decoder;
|
||||
use rustc_serialize::Encodable;
|
||||
use rustc_serialize::Encoder;
|
||||
use std::fmt;
|
||||
use serde::{Serialize, Deserialize, Serializer, Deserializer};
|
||||
use serde::ser::SerializeStruct;
|
||||
use serde::de::{self, Visitor, MapAccess};
|
||||
use schema::FieldType;
|
||||
|
||||
/// A `FieldEntry` represents a field and its configuration.
|
||||
@@ -22,7 +22,7 @@ pub struct FieldEntry {
|
||||
|
||||
impl FieldEntry {
|
||||
|
||||
/// Creates a new u32 field entry in the schema, given
|
||||
/// Creates a new text field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_text(field_name: String, field_type: TextOptions) -> FieldEntry {
|
||||
FieldEntry {
|
||||
@@ -31,17 +31,27 @@ impl FieldEntry {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new u32 field entry in the schema, given
|
||||
/// Creates a new u64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_u32(field_name: String, field_type: U32Options) -> FieldEntry {
|
||||
pub fn new_u64(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::U32(field_type),
|
||||
field_type: FieldType::U64(field_type),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new i64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_i64(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::I64(field_type),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns the name of the field
|
||||
pub fn name(&self,) -> &String {
|
||||
pub fn name(&self,) -> &str {
|
||||
&self.name
|
||||
}
|
||||
|
||||
@@ -54,14 +64,16 @@ impl FieldEntry {
|
||||
pub fn is_indexed(&self,) -> bool {
|
||||
match self.field_type {
|
||||
FieldType::Str(ref options) => options.get_indexing_options().is_indexed(),
|
||||
FieldType::U32(ref options) => options.is_indexed(),
|
||||
FieldType::U64(ref options) => options.is_indexed(),
|
||||
FieldType::I64(ref options) => options.is_indexed(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff the field is a u32 fast field
|
||||
pub fn is_u32_fast(&self,) -> bool {
|
||||
/// Returns true iff the field is an int (signed or unsigned) fast field
|
||||
pub fn is_int_fast(&self,) -> bool {
|
||||
match self.field_type {
|
||||
FieldType::U32(ref options) => options.is_fast(),
|
||||
FieldType::U64(ref options) => options.is_fast(),
|
||||
FieldType::I64(ref options) => options.is_fast(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
@@ -69,7 +81,10 @@ impl FieldEntry {
|
||||
/// Returns true iff the field is stored
|
||||
pub fn is_stored(&self,) -> bool {
|
||||
match self.field_type {
|
||||
FieldType::U32(ref options) => {
|
||||
FieldType::U64(ref options) => {
|
||||
options.is_stored()
|
||||
}
|
||||
FieldType::I64(ref options) => {
|
||||
options.is_stored()
|
||||
}
|
||||
FieldType::Str(ref options) => {
|
||||
@@ -79,63 +94,99 @@ impl FieldEntry {
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for FieldEntry {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer
|
||||
{
|
||||
let mut s = serializer.serialize_struct("field_entry", 3)?;
|
||||
s.serialize_field("name", &self.name)?;
|
||||
|
||||
|
||||
impl Encodable for FieldEntry {
|
||||
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
|
||||
s.emit_struct("field_entry", 3, |s| {
|
||||
try!(s.emit_struct_field("name", 0, |s| {
|
||||
self.name.encode(s)
|
||||
}));
|
||||
match self.field_type {
|
||||
FieldType::Str(ref options) => {
|
||||
try!(s.emit_struct_field("type", 1, |s| {
|
||||
s.emit_str("text")
|
||||
}));
|
||||
try!(s.emit_struct_field("options", 2, |s| {
|
||||
options.encode(s)
|
||||
}));
|
||||
}
|
||||
FieldType::U32(ref options) => {
|
||||
try!(s.emit_struct_field("type", 1, |s| {
|
||||
s.emit_str("u32")
|
||||
}));
|
||||
try!(s.emit_struct_field("options", 2, |s| {
|
||||
options.encode(s)
|
||||
}));
|
||||
}
|
||||
match self.field_type {
|
||||
FieldType::Str(ref options) => {
|
||||
s.serialize_field("type", "text")?;
|
||||
s.serialize_field("options", options)?;
|
||||
},
|
||||
FieldType::U64(ref options) => {
|
||||
s.serialize_field("type", "u64")?;
|
||||
s.serialize_field("options", options)?;
|
||||
},
|
||||
FieldType::I64(ref options) => {
|
||||
s.serialize_field("type", "i64")?;
|
||||
s.serialize_field("options", options)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
s.end()
|
||||
}
|
||||
}
|
||||
|
||||
impl Decodable for FieldEntry {
|
||||
fn decode<D: Decoder>(d: &mut D) -> Result<Self, D::Error> {
|
||||
d.read_struct("field_entry", 3, |d| {
|
||||
let name = try!(d.read_struct_field("name", 0, |d| {
|
||||
d.read_str()
|
||||
}));
|
||||
let field_type: String = try!(d.read_struct_field("type", 1, |d| {
|
||||
d.read_str()
|
||||
}));
|
||||
d.read_struct_field("options", 2, |d| {
|
||||
match field_type.as_ref() {
|
||||
"u32" => {
|
||||
let u32_options = try!(U32Options::decode(d));
|
||||
Ok(FieldEntry::new_u32(name, u32_options))
|
||||
}
|
||||
"text" => {
|
||||
let text_options = try!(TextOptions::decode(d));
|
||||
Ok(FieldEntry::new_text(name, text_options))
|
||||
}
|
||||
_ => {
|
||||
Err(d.error(&format!("Field type {:?} unknown", field_type)))
|
||||
impl<'de> Deserialize<'de> for FieldEntry {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where D: Deserializer<'de>
|
||||
{
|
||||
#[derive(Deserialize)]
|
||||
#[serde(field_identifier, rename_all = "lowercase")]
|
||||
enum Field { Name, Type, Options };
|
||||
|
||||
const FIELDS: &'static [&'static str] = &["name", "type", "options"];
|
||||
|
||||
struct FieldEntryVisitor;
|
||||
|
||||
impl<'de> Visitor<'de> for FieldEntryVisitor {
|
||||
type Value = FieldEntry;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str("struct FieldEntry")
|
||||
}
|
||||
|
||||
fn visit_map<V>(self, mut map: V) -> Result<FieldEntry, V::Error>
|
||||
where V: MapAccess<'de>
|
||||
{
|
||||
let mut name = None;
|
||||
let mut ty = None;
|
||||
let mut field_type = None;
|
||||
while let Some(key) = map.next_key()? {
|
||||
match key {
|
||||
Field::Name => {
|
||||
if name.is_some() {
|
||||
return Err(de::Error::duplicate_field("name"));
|
||||
}
|
||||
name = Some(map.next_value()?);
|
||||
}
|
||||
Field::Type => {
|
||||
if ty.is_some() {
|
||||
return Err(de::Error::duplicate_field("type"));
|
||||
}
|
||||
ty = Some(map.next_value()?);
|
||||
}
|
||||
Field::Options => {
|
||||
match ty {
|
||||
None => return Err(de::Error::custom("The `type` field must be specified before `options`")),
|
||||
Some(ty) => {
|
||||
match ty {
|
||||
"text" => field_type = Some(FieldType::Str(map.next_value()?)),
|
||||
"u64" => field_type = Some(FieldType::U64(map.next_value()?)),
|
||||
"i64" => field_type = Some(FieldType::I64(map.next_value()?)),
|
||||
_ => return Err(de::Error::custom(format!("Unrecognised type {}", ty)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
let name = name.ok_or_else(|| de::Error::missing_field("name"))?;
|
||||
ty.ok_or_else(|| de::Error::missing_field("ty"))?;
|
||||
let field_type = field_type.ok_or_else(|| de::Error::missing_field("options"))?;
|
||||
|
||||
Ok(FieldEntry {
|
||||
name: name,
|
||||
field_type: field_type,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_struct("field_entry", FIELDS, FieldEntryVisitor)
|
||||
}
|
||||
}
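A short illustration of the constraint enforced by the visitor above, namely that `type` has to come before `options` in the JSON object (a sketch, assuming `serde_json` is in scope):

```
let ok = r#"{"name": "count", "type": "u64", "options": {"indexed": true, "fast": false, "stored": false}}"#;
assert!(serde_json::from_str::<FieldEntry>(ok).is_ok());

// `options` cannot be interpreted before `type` is known
let reordered = r#"{"name": "count", "options": {"indexed": true, "fast": false, "stored": false}, "type": "u64"}"#;
assert!(serde_json::from_str::<FieldEntry>(reordered).is_err());
```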
|
||||
|
||||
@@ -145,18 +196,31 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use schema::TEXT;
|
||||
use rustc_serialize::json;
|
||||
use serde_json;
|
||||
|
||||
#[test]
|
||||
fn test_json_serialization() {
|
||||
let field_value = FieldEntry::new_text(String::from("title"), TEXT);
|
||||
assert_eq!(format!("{}", json::as_pretty_json(&field_value)), r#"{
|
||||
|
||||
let expected = r#"{
|
||||
"name": "title",
|
||||
"type": "text",
|
||||
"options": {
|
||||
"indexing": "position",
|
||||
"stored": false
|
||||
}
|
||||
}"#);
|
||||
}"#;
|
||||
let field_value_json = serde_json::to_string_pretty(&field_value).unwrap();
|
||||
|
||||
assert_eq!(expected, &field_value_json);
|
||||
|
||||
let field_value: FieldEntry = serde_json::from_str(expected).unwrap();
|
||||
|
||||
assert_eq!("title", field_value.name);
|
||||
|
||||
match field_value.field_type {
|
||||
FieldType::Str(_) => assert!(true),
|
||||
_ => panic!("expected FieldType::Str")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use schema::TextOptions;
|
||||
use schema::U32Options;
|
||||
use schema::{TextOptions, IntOptions};
|
||||
|
||||
use rustc_serialize::json::Json;
|
||||
use serde_json::Value as JsonValue;
|
||||
use schema::Value;
|
||||
|
||||
|
||||
@@ -11,58 +10,83 @@ use schema::Value;
|
||||
pub enum ValueParsingError {
|
||||
/// Encountered a numerical value that overflows or underflows its integer type.
|
||||
OverflowError(String),
|
||||
/// The json node is not of the correct type. (e.g. 3 for a `Str` type or `"abc"` for a u32 type)
|
||||
/// The json node is not of the correct type. (e.g. 3 for a `Str` type or `"abc"` for a u64 type)
|
||||
/// Tantivy will try to autocast values.
|
||||
TypeError(String),
|
||||
}
|
||||
|
||||
|
||||
/// A `FieldType` describes the type (text, u32) of a field as well as
|
||||
/// A `FieldType` describes the type (text, u64) of a field as well as
|
||||
/// how it should be handled by tantivy.
|
||||
#[derive(Clone, Debug, RustcDecodable, RustcEncodable)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum FieldType {
|
||||
/// String field type configuration
|
||||
Str(TextOptions),
|
||||
/// U32 field type configuration
|
||||
U32(U32Options),
|
||||
/// Unsigned 64-bits integer field type configuration
|
||||
U64(IntOptions),
|
||||
/// Signed 64-bits integer field type configuration
|
||||
I64(IntOptions),
|
||||
}
|
||||
|
||||
impl FieldType {
|
||||
|
||||
|
||||
/// returns true iff the field is indexed.
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
match self {
|
||||
&FieldType::Str(ref text_options) => {
|
||||
text_options.get_indexing_options().is_indexed()
|
||||
}
|
||||
&FieldType::U64(ref int_options) => {
|
||||
int_options.is_indexed()
|
||||
}
|
||||
&FieldType::I64(ref int_options) => {
|
||||
int_options.is_indexed()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses a field value from json, given the target FieldType.
|
||||
///
|
||||
/// Tantivy will not try to cast values.
|
||||
/// For instance, if the json value is the integer `3` and the
|
||||
/// target field is a `Str`, this method will return an Error.
|
||||
pub fn value_from_json(&self, json: &Json) -> Result<Value, ValueParsingError> {
|
||||
pub fn value_from_json(&self, json: &JsonValue) -> Result<Value, ValueParsingError> {
|
||||
match *json {
|
||||
Json::String(ref field_text) => {
|
||||
JsonValue::String(ref field_text) => {
|
||||
match *self {
|
||||
FieldType::Str(_) => {
|
||||
Ok(Value::Str(field_text.clone()))
|
||||
}
|
||||
FieldType::U32(_) => {
|
||||
Err(ValueParsingError::TypeError(format!("Expected a u32 int, got {:?}", json)))
|
||||
FieldType::U64(_) | FieldType::I64(_) => {
|
||||
Err(ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)))
|
||||
}
|
||||
}
|
||||
}
|
||||
Json::U64(ref field_val_u64) => {
|
||||
JsonValue::Number(ref field_val_num) => {
|
||||
match *self {
|
||||
FieldType::U32(_) => {
|
||||
if *field_val_u64 > (u32::max_value() as u64) {
|
||||
Err(ValueParsingError::OverflowError(format!("Expected u32, but value {:?} overflows.", field_val_u64)))
|
||||
FieldType::I64(_) => {
|
||||
if let Some(field_val_i64) = field_val_num.as_i64() {
|
||||
Ok(Value::I64(field_val_i64))
|
||||
}
|
||||
else {
|
||||
Ok(Value::U32(*field_val_u64 as u32))
|
||||
Err(ValueParsingError::OverflowError(format!("Expected an i64 int, got {:?}", json)))
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
FieldType::U64(_) => {
|
||||
if let Some(field_val_u64) = field_val_num.as_u64() {
|
||||
Ok(Value::U64(field_val_u64))
|
||||
}
|
||||
else {
|
||||
Err(ValueParsingError::OverflowError(format!("Expected an u64 int, got {:?}", json)))
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) => {
|
||||
Err(ValueParsingError::TypeError(format!("Expected a string, got {:?}", json)))
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
_ => {
|
||||
Err(ValueParsingError::TypeError(format!("Expected a string or a u32, got {:?}", json)))
|
||||
Err(ValueParsingError::TypeError(format!("Json value not supported error {:?}. Expected {:?}", json, self)))
|
||||
}
|
||||
}
|
||||
}
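A short sketch of the no-autocast behaviour described above, using only the calls defined in this file (and assuming `serde_json`'s `json!` macro is in scope):

```
let u64_type = FieldType::U64(IntOptions::default());
// a non-negative number fits in a u64
assert!(u64_type.value_from_json(&json!(5)).is_ok());
// a negative number does not fit in a u64: OverflowError
assert!(u64_type.value_from_json(&json!(-5)).is_err());
// a string is never cast to an integer: TypeError
assert!(u64_type.value_from_json(&json!("5")).is_err());
```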
|
||||
|
||||
@@ -7,7 +7,7 @@ use schema::Value;
|
||||
|
||||
|
||||
/// `FieldValue` holds together a `Field` and its `Value`.
|
||||
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd, RustcEncodable, RustcDecodable)]
|
||||
#[derive(Debug, Clone, Ord, PartialEq, Eq, PartialOrd, Serialize, Deserialize)]
|
||||
pub struct FieldValue {
|
||||
field: Field,
|
||||
value: Value,
|
||||
@@ -36,15 +36,13 @@ impl FieldValue {
|
||||
|
||||
impl BinarySerializable for FieldValue {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
let mut written_size = 0;
|
||||
written_size += try!(self.field.serialize(writer));
|
||||
written_size += try!(self.value.serialize(writer));
|
||||
Ok(written_size)
|
||||
Ok(self.field.serialize(writer)? +
|
||||
self.value.serialize(writer)?)
|
||||
}
|
||||
|
||||
fn deserialize(reader: &mut Read) -> io::Result<Self> {
|
||||
let field = try!(Field::deserialize(reader));
|
||||
let value = try!(Value::deserialize(reader));
|
||||
let field = Field::deserialize(reader)?;
|
||||
let value = Value::deserialize(reader)?;
|
||||
Ok(FieldValue::new(field, value))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
use std::ops::BitOr;
|
||||
|
||||
/// Define how a U32 field should be handled by tantivy.
|
||||
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
|
||||
pub struct U32Options {
|
||||
/// Define how an int field should be handled by tantivy.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct IntOptions {
|
||||
indexed: bool,
|
||||
fast: bool,
|
||||
stored: bool,
|
||||
}
|
||||
|
||||
impl U32Options {
|
||||
impl IntOptions {
|
||||
|
||||
/// Returns true iff the value is stored.
|
||||
pub fn is_stored(&self,) -> bool {
|
||||
@@ -26,39 +26,39 @@ impl U32Options {
|
||||
self.fast
|
||||
}
|
||||
|
||||
/// Set the u32 options as stored.
|
||||
/// Set the u64 options as stored.
|
||||
///
|
||||
/// Only the fields that are set as *stored* are
|
||||
/// persisted into the Tantivy's store.
|
||||
pub fn set_stored(mut self,) -> U32Options {
|
||||
pub fn set_stored(mut self,) -> IntOptions {
|
||||
self.stored = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the u32 options as indexed.
|
||||
/// Set the u64 options as indexed.
|
||||
///
|
||||
/// Setting an integer as indexed will generate
|
||||
/// a posting list for each value taken by the integer.
|
||||
pub fn set_indexed(mut self,) -> U32Options {
|
||||
pub fn set_indexed(mut self,) -> IntOptions {
|
||||
self.indexed = true;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the u32 options as a fast field.
|
||||
/// Set the u64 options as a fast field.
|
||||
///
|
||||
/// Fast fields are designed for random access.
|
||||
/// Access time are similar to a random lookup in an array.
|
||||
/// If more than one value is associated to a fast field, only the last one is
|
||||
/// kept.
|
||||
pub fn set_fast(mut self,) -> U32Options {
|
||||
pub fn set_fast(mut self,) -> IntOptions {
|
||||
self.fast = true;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for U32Options {
|
||||
fn default() -> U32Options {
|
||||
U32Options {
|
||||
impl Default for IntOptions {
|
||||
fn default() -> IntOptions {
|
||||
IntOptions {
|
||||
fast: false,
|
||||
indexed: false,
|
||||
stored: false,
|
||||
@@ -67,40 +67,40 @@ impl Default for U32Options {
|
||||
}
|
||||
|
||||
|
||||
/// Shortcut for a u32 fast field.
|
||||
/// Shortcut for a u64 fast field.
|
||||
///
|
||||
/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED`
|
||||
pub const FAST: U32Options = U32Options {
|
||||
/// Such a shortcut can be composed as follows `STORED | FAST | INT_INDEXED`
|
||||
pub const FAST: IntOptions = IntOptions {
|
||||
indexed: false,
|
||||
stored: false,
|
||||
fast: true,
|
||||
};
|
||||
|
||||
/// Shortcut for a u32 indexed field.
|
||||
/// Shortcut for a u64 indexed field.
|
||||
///
|
||||
/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED`
|
||||
pub const U32_INDEXED: U32Options = U32Options {
|
||||
/// Such a shortcut can be composed as follows `STORED | FAST | INT_INDEXED`
|
||||
pub const INT_INDEXED: IntOptions = IntOptions {
|
||||
indexed: true,
|
||||
stored: false,
|
||||
fast: false,
|
||||
};
|
||||
|
||||
/// Shortcut for a u32 stored field.
|
||||
/// Shortcut for a u64 stored field.
|
||||
///
|
||||
/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED`
|
||||
pub const U32_STORED: U32Options = U32Options {
|
||||
/// Such a shortcut can be composed as follows `STORED | FAST | INT_INDEXED`
|
||||
pub const INT_STORED: IntOptions = IntOptions {
|
||||
indexed: false,
|
||||
stored: true,
|
||||
fast: false,
|
||||
};
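These shortcuts are meant to be combined with `|`, mirroring the text options; a small sketch (the `num_stars` field name is illustrative):

```
use tantivy::schema::*;

let mut schema_builder = SchemaBuilder::default();
// stored, indexed and fast at the same time
schema_builder.add_u64_field("num_stars", INT_STORED | INT_INDEXED | FAST);
let _schema = schema_builder.build();
```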
|
||||
|
||||
|
||||
impl BitOr for U32Options {
|
||||
impl BitOr for IntOptions {
|
||||
|
||||
type Output = U32Options;
|
||||
type Output = IntOptions;
|
||||
|
||||
fn bitor(self, other: U32Options) -> U32Options {
|
||||
let mut res = U32Options::default();
|
||||
fn bitor(self, other: IntOptions) -> IntOptions {
|
||||
let mut res = IntOptions::default();
|
||||
res.indexed = self.indexed | other.indexed;
|
||||
res.stored = self.stored | other.stored;
|
||||
res.fast = self.fast | other.fast;
|
||||
@@ -7,7 +7,7 @@ Tantivy has a very strict schema.
|
||||
The schema defines information about the fields your index contains, that is, for each field :
|
||||
|
||||
* the field name (may only contain letters `[a-zA-Z]`, number `[0-9]`, and `_`)
|
||||
* the type of the field (currently only `text` and `u32` are supported)
|
||||
* the type of the field (currently only `text` and `u64` are supported)
|
||||
* how the field should be indexed / stored.
|
||||
|
||||
This very last point is critical as it will enable / disable some of the functionality
|
||||
@@ -64,17 +64,17 @@ let schema = schema_builder.build();
|
||||
|
||||
|
||||
|
||||
## Setting a u32 field
|
||||
## Setting a u64 field
|
||||
|
||||
### Example
|
||||
|
||||
```
|
||||
use tantivy::schema::*;
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let num_stars_options = U32Options::default()
|
||||
let num_stars_options = IntOptions::default()
|
||||
.set_stored()
|
||||
.set_indexed();
|
||||
schema_builder.add_u32_field("num_stars", num_stars_options);
|
||||
schema_builder.add_u64_field("num_stars", num_stars_options);
|
||||
let schema = schema_builder.build();
|
||||
```
|
||||
|
||||
@@ -82,15 +82,15 @@ Just like for Text fields (see above),
|
||||
setting the field as stored defines whether the field will be
|
||||
returned when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called,
|
||||
and setting the field as indexed means that we will be able to perform queries such as `num_stars:10`.
|
||||
Note that unlike text fields, u32 can only be indexed in one way for the moment.
|
||||
Note that unlike text fields, u64 can only be indexed in one way for the moment.
|
||||
This may change when we start supporting range queries.
|
||||
|
||||
The `fast` option on the other hand is specific to u32 fields, and is only relevant
|
||||
The `fast` option on the other hand is specific to u64 fields, and is only relevant
|
||||
if you are implementing your own queries. This functionality is somewhat similar to Lucene's
|
||||
`DocValues`.
|
||||
|
||||
u32 that are indexed as fast will be stored in a special data structure that will
|
||||
make it possible to access the u32 value given the doc id rapidly. This is useful if the value of
|
||||
u64 that are indexed as fast will be stored in a special data structure that will
|
||||
make it possible to access the u64 value given the doc id rapidly. This is useful if the value of
|
||||
the field is required during scoring or collection for instance.
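As a rough sketch of what "fast" access looks like, the fieldnorm tests elsewhere in this change build a reader directly from a `Vec`; in a real index the reader is obtained from a segment instead, and the public paths below are assumed:

```
use tantivy::fastfield::{U64FastFieldReader, FastFieldReader};

// random access by doc id, similar to an array lookup
let fast_field_reader = U64FastFieldReader::from(vec![10u64, 4u64]);
assert_eq!(fast_field_reader.get(0), 10);
assert_eq!(fast_field_reader.get(1), 4);
```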
|
||||
|
||||
*/
|
||||
@@ -104,12 +104,12 @@ mod field_entry;
|
||||
mod field_value;
|
||||
|
||||
mod text_options;
|
||||
mod u32_options;
|
||||
mod int_options;
|
||||
mod field;
|
||||
mod value;
|
||||
mod named_field_document;
|
||||
|
||||
|
||||
pub(crate) use self::term::extract_field_from_term_bytes;
|
||||
pub use self::named_field_document::NamedFieldDocument;
|
||||
pub use self::schema::{Schema, SchemaBuilder};
|
||||
pub use self::value::Value;
|
||||
@@ -129,10 +129,10 @@ pub use self::text_options::TEXT;
|
||||
pub use self::text_options::STRING;
|
||||
pub use self::text_options::STORED;
|
||||
|
||||
pub use self::u32_options::U32Options;
|
||||
pub use self::u32_options::FAST;
|
||||
pub use self::u32_options::U32_INDEXED;
|
||||
pub use self::u32_options::U32_STORED;
|
||||
pub use self::int_options::IntOptions;
|
||||
pub use self::int_options::FAST;
|
||||
pub use self::int_options::INT_INDEXED;
|
||||
pub use self::int_options::INT_STORED;
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
use std::collections::BTreeMap;
|
||||
use schema::Value;
|
||||
use rustc_serialize::Encodable;
|
||||
use rustc_serialize::Encoder;
|
||||
|
||||
|
||||
|
||||
@@ -11,33 +9,5 @@ use rustc_serialize::Encoder;
|
||||
/// A `NamedFieldDocument` is a simple representation of a document
|
||||
/// as a `BTreeMap<String, Vec<Value>>`.
|
||||
///
|
||||
#[derive(Serialize)]
|
||||
pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value>>);
|
||||
|
||||
|
||||
impl Encodable for NamedFieldDocument {
|
||||
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
|
||||
s.emit_struct("named_field_document", self.0.len(), |s| {
|
||||
for (i, (name, vals)) in self.0.iter().enumerate() {
|
||||
try!(s.emit_struct_field(name, i, |s| {
|
||||
for (j, val) in vals.iter().enumerate() {
|
||||
try!(s.emit_seq(vals.len(), |s| {
|
||||
s.emit_seq_elt(j, |s| {
|
||||
match *val {
|
||||
Value::Str(ref text) => {
|
||||
s.emit_str(text)
|
||||
},
|
||||
Value::U32(ref val) => {
|
||||
s.emit_u32(*val)
|
||||
}
|
||||
}
|
||||
})
|
||||
}));
|
||||
}
|
||||
Ok(())
|
||||
|
||||
}));
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,19 +1,15 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use rustc_serialize::Decodable;
|
||||
use rustc_serialize::Encodable;
|
||||
use rustc_serialize::Decoder;
|
||||
use rustc_serialize::Encoder;
|
||||
use rustc_serialize::json;
|
||||
use rustc_serialize::json::Json;
|
||||
use std::collections::BTreeMap;
|
||||
use schema::field_type::ValueParsingError;
|
||||
use std::sync::Arc;
|
||||
|
||||
use serde_json::{self, Value as JsonValue, Map as JsonObject};
|
||||
use serde::{Serialize, Serializer, Deserialize, Deserializer};
|
||||
use serde::ser::SerializeSeq;
|
||||
use serde::de::{Visitor, SeqAccess};
|
||||
use super::*;
|
||||
use std::fmt;
|
||||
|
||||
const MAX_NUM_FIELDS: usize = 255;
|
||||
|
||||
/// Tantivy has a very strict schema.
|
||||
/// You need to specify in advance whether a field is indexed or not,
|
||||
/// stored or not, and RAM-based or not.
|
||||
@@ -48,7 +44,7 @@ impl SchemaBuilder {
|
||||
SchemaBuilder::default()
|
||||
}
|
||||
|
||||
/// Adds a new u32 field.
|
||||
/// Adds a new u64 field.
|
||||
/// Returns the associated field handle
|
||||
///
|
||||
/// # Caution
|
||||
@@ -58,12 +54,31 @@ impl SchemaBuilder {
|
||||
/// by the second one.
|
||||
/// The first field will get a field id
|
||||
/// but only the second one will be indexed
|
||||
pub fn add_u32_field(
|
||||
pub fn add_u64_field(
|
||||
&mut self,
|
||||
field_name_str: &str,
|
||||
field_options: U32Options) -> Field {
|
||||
field_options: IntOptions) -> Field {
|
||||
let field_name = String::from(field_name_str);
|
||||
let field_entry = FieldEntry::new_u32(field_name, field_options);
|
||||
let field_entry = FieldEntry::new_u64(field_name, field_options);
|
||||
self.add_field(field_entry)
|
||||
}
|
||||
|
||||
/// Adds a new i64 field.
|
||||
/// Returns the associated field handle
|
||||
///
|
||||
/// # Caution
|
||||
///
|
||||
/// Appending two fields with the same name
|
||||
/// will result in the shadowing of the first
|
||||
/// by the second one.
|
||||
/// The first field will get a field id
|
||||
/// but only the second one will be indexed
|
||||
pub fn add_i64_field(
|
||||
&mut self,
|
||||
field_name_str: &str,
|
||||
field_options: IntOptions) -> Field {
|
||||
let field_name = String::from(field_name_str);
|
||||
let field_entry = FieldEntry::new_i64(field_name, field_options);
|
||||
self.add_field(field_entry)
|
||||
}
|
||||
|
||||
@@ -89,8 +104,8 @@ impl SchemaBuilder {
|
||||
|
||||
/// Adds a field entry to the schema in build.
|
||||
fn add_field(&mut self, field_entry: FieldEntry) -> Field {
|
||||
let field = Field(self.fields.len() as u8);
|
||||
let field_name = field_entry.name().clone();
|
||||
let field = Field(self.fields.len() as u32);
|
||||
let field_name = field_entry.name().to_string();
|
||||
self.fields.push(field_entry);
|
||||
self.fields_map.insert(field_name, field);
|
||||
field
|
||||
@@ -100,9 +115,6 @@ impl SchemaBuilder {
|
||||
/// Finalize the creation of a `Schema`
|
||||
/// This will consume your `SchemaBuilder`
|
||||
pub fn build(self,) -> Schema {
|
||||
if self.fields.len() > MAX_NUM_FIELDS {
|
||||
panic!("There may be at most 255 fields.");
|
||||
}
|
||||
Schema(Arc::new(InnerSchema {
|
||||
fields: self.fields,
|
||||
fields_map: self.fields_map,
|
||||
@@ -159,7 +171,7 @@ impl Schema {
|
||||
}
|
||||
|
||||
/// Return the field name for a given `Field`.
|
||||
pub fn get_field_name(&self, field: Field) -> &String {
|
||||
pub fn get_field_name(&self, field: Field) -> &str {
|
||||
self.get_field_entry(field).name()
|
||||
}
|
||||
|
||||
@@ -191,7 +203,7 @@ impl Schema {
|
||||
.map(|field_val| field_val.value() )
|
||||
.cloned()
|
||||
.collect();
|
||||
field_map.insert(field_name.clone(), values);
|
||||
field_map.insert(field_name.to_string(), values);
|
||||
}
|
||||
NamedFieldDocument(field_map)
|
||||
}
|
||||
@@ -201,14 +213,12 @@ impl Schema {
|
||||
///
|
||||
/// Encoding a document cannot fail.
|
||||
pub fn to_json(&self, doc: &Document) -> String {
|
||||
json::encode(&self.to_named_doc(doc)).unwrap()
|
||||
serde_json::to_string(&self.to_named_doc(doc)).expect("doc encoding failed. This is a bug")
|
||||
}
|
||||
|
||||
/// Build a document object from a json-object.
|
||||
pub fn parse_document(&self, doc_json: &str) -> Result<Document, DocParsingError> {
|
||||
let json_node = try!(Json::from_str(doc_json));
|
||||
let some_json_obj = json_node.as_object();
|
||||
if !some_json_obj.is_some() {
|
||||
let json_obj: JsonObject<String, JsonValue> = serde_json::from_str(doc_json).map_err(|_| {
|
||||
let doc_json_sample: String =
|
||||
if doc_json.len() < 20 {
|
||||
String::from(doc_json)
|
||||
@@ -216,9 +226,9 @@ impl Schema {
|
||||
else {
|
||||
format!("{:?}...", &doc_json[0..20])
|
||||
};
|
||||
return Err(DocParsingError::NotJSONObject(doc_json_sample))
|
||||
}
|
||||
let json_obj = some_json_obj.unwrap();
|
||||
DocParsingError::NotJSON(doc_json_sample)
|
||||
})?;
|
||||
|
||||
let mut doc = Document::default();
|
||||
for (field_name, json_value) in json_obj.iter() {
|
||||
match self.get_field(field_name) {
|
||||
@@ -226,7 +236,7 @@ impl Schema {
|
||||
let field_entry = self.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
match *json_value {
|
||||
Json::Array(ref json_items) => {
|
||||
JsonValue::Array(ref json_items) => {
|
||||
for json_item in json_items {
|
||||
let value = try!(
|
||||
field_type
|
||||
@@ -262,30 +272,50 @@ impl fmt::Debug for Schema {
|
||||
}
|
||||
}
|
||||
|
||||
impl Decodable for Schema {
|
||||
fn decode<D: Decoder>(d: &mut D) -> Result <Self, D::Error> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
try!(d.read_seq(|d, num_fields| {
|
||||
for _ in 0..num_fields {
|
||||
let field_entry = try!(FieldEntry::decode(d));
|
||||
schema_builder.add_field(field_entry);
|
||||
}
|
||||
Ok(())
|
||||
}));
|
||||
Ok(schema_builder.build())
|
||||
impl Serialize for Schema {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer
|
||||
{
|
||||
let mut seq = serializer.serialize_seq(Some(self.0.fields.len()))?;
|
||||
for e in &self.0.fields {
|
||||
seq.serialize_element(e)?;
|
||||
}
|
||||
seq.end()
|
||||
}
|
||||
}
|
||||
|
||||
impl Encodable for Schema {
|
||||
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
|
||||
try!(s.emit_seq(self.0.fields.len(),
|
||||
|mut e| {
|
||||
for (ord, field) in self.0.fields.iter().enumerate() {
|
||||
try!(e.emit_seq_elt(ord, |e| field.encode(e)));
|
||||
impl<'de> Deserialize<'de> for Schema
|
||||
{
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where D: Deserializer<'de>
|
||||
{
|
||||
struct SchemaVisitor;
|
||||
|
||||
impl<'de> Visitor<'de> for SchemaVisitor
|
||||
{
|
||||
type Value = Schema;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str("struct Schema")
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
|
||||
where A: SeqAccess<'de>
|
||||
{
|
||||
let mut schema = SchemaBuilder {
|
||||
fields: Vec::with_capacity(seq.size_hint().unwrap_or(0)),
|
||||
fields_map: HashMap::with_capacity(seq.size_hint().unwrap_or(0)),
|
||||
};
|
||||
|
||||
while let Some(value) = seq.next_element()? {
|
||||
schema.add_field(value);
|
||||
}
|
||||
Ok(())
|
||||
}));
|
||||
Ok(())
|
||||
|
||||
Ok(schema.build())
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_map(SchemaVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -305,39 +335,33 @@ impl From<SchemaBuilder> for Schema {
|
||||
#[derive(Debug)]
|
||||
pub enum DocParsingError {
|
||||
/// The payload given is not valid JSON.
|
||||
NotJSON(json::ParserError),
|
||||
/// The payload given is not a JSON Object (`{...}`).
|
||||
NotJSONObject(String),
|
||||
NotJSON(String),
|
||||
/// One of the value node could not be parsed.
|
||||
ValueError(String, ValueParsingError),
|
||||
/// The json-document contains a field that is not declared in the schema.
|
||||
NoSuchFieldInSchema(String),
|
||||
}
|
||||
|
||||
impl From<json::ParserError> for DocParsingError {
|
||||
fn from(err: json::ParserError) -> DocParsingError {
|
||||
DocParsingError::NotJSON(err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use schema::*;
|
||||
use rustc_serialize::json;
|
||||
use serde_json;
|
||||
use schema::field_type::ValueParsingError;
|
||||
use schema::schema::DocParsingError::NotJSON;
|
||||
|
||||
#[test]
|
||||
pub fn test_schema_serialization() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let count_options = U32Options::default().set_stored().set_fast();
|
||||
let count_options = IntOptions::default().set_stored().set_fast();
|
||||
let popularity_options = IntOptions::default().set_stored().set_fast();
|
||||
schema_builder.add_text_field("title", TEXT);
|
||||
schema_builder.add_text_field("author", STRING);
|
||||
schema_builder.add_u32_field("count", count_options);
|
||||
schema_builder.add_u64_field("count", count_options);
|
||||
schema_builder.add_i64_field("popularity", popularity_options);
|
||||
let schema = schema_builder.build();
|
||||
let schema_json: String = format!("{}", json::as_pretty_json(&schema));
|
||||
let schema_json = serde_json::to_string_pretty(&schema).unwrap();
|
||||
let expected = r#"[
|
||||
{
|
||||
"name": "title",
|
||||
@@ -357,7 +381,16 @@ mod tests {
|
||||
},
|
||||
{
|
||||
"name": "count",
|
||||
"type": "u32",
|
||||
"type": "u64",
|
||||
"options": {
|
||||
"indexed": false,
|
||||
"fast": true,
|
||||
"stored": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "popularity",
|
||||
"type": "i64",
|
||||
"options": {
|
||||
"indexed": false,
|
||||
"fast": true,
|
||||
@@ -365,8 +398,18 @@ mod tests {
|
||||
}
|
||||
}
|
||||
]"#;
|
||||
println!("{}", schema_json);
|
||||
println!("{}", expected);
|
||||
assert_eq!(schema_json, expected);
|
||||
|
||||
let schema: Schema = serde_json::from_str(expected).unwrap();
|
||||
|
||||
let mut fields = schema.fields().iter();
|
||||
|
||||
assert_eq!("title", fields.next().unwrap().name());
|
||||
assert_eq!("author", fields.next().unwrap().name());
|
||||
assert_eq!("count", fields.next().unwrap().name());
|
||||
assert_eq!("popularity", fields.next().unwrap().name());
|
||||
}
|
||||
|
||||
|
||||
@@ -374,10 +417,10 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_document_to_json() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let count_options = U32Options::default().set_stored().set_fast();
|
||||
let count_options = IntOptions::default().set_stored().set_fast();
|
||||
schema_builder.add_text_field("title", TEXT);
|
||||
schema_builder.add_text_field("author", STRING);
|
||||
schema_builder.add_u32_field("count", count_options);
|
||||
schema_builder.add_u64_field("count", count_options);
|
||||
let schema = schema_builder.build();
|
||||
let doc_json = r#"{
|
||||
"title": "my title",
|
||||
@@ -385,6 +428,7 @@ mod tests {
|
||||
"count": 4
|
||||
}"#;
|
||||
let doc = schema.parse_document(doc_json).unwrap();
|
||||
|
||||
let doc_serdeser = schema.parse_document(&schema.to_json(&doc)).unwrap();
|
||||
assert_eq!(doc, doc_serdeser);
|
||||
}
|
||||
@@ -392,10 +436,12 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_parse_document() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let count_options = U32Options::default().set_stored().set_fast();
|
||||
let count_options = IntOptions::default().set_stored().set_fast();
|
||||
let popularity_options = IntOptions::default().set_stored().set_fast();
|
||||
let title_field = schema_builder.add_text_field("title", TEXT);
|
||||
let author_field = schema_builder.add_text_field("author", STRING);
|
||||
let count_field = schema_builder.add_u32_field("count", count_options);
|
||||
let count_field = schema_builder.add_u64_field("count", count_options);
|
||||
let popularity_field = schema_builder.add_i64_field("popularity", popularity_options);
|
||||
let schema = schema_builder.build();
|
||||
{
|
||||
let doc = schema.parse_document("{}").unwrap();
|
||||
@@ -405,32 +451,20 @@ mod tests {
|
||||
let doc = schema.parse_document(r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 4
|
||||
"count": 4,
|
||||
"popularity": 10
|
||||
}"#).unwrap();
|
||||
assert_eq!(doc.get_first(title_field).unwrap().text(), "my title");
|
||||
assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton");
|
||||
assert_eq!(doc.get_first(count_field).unwrap().u32_value(), 4);
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton"
|
||||
"count": 4
|
||||
}"#);
|
||||
match json_err {
|
||||
Err(DocParsingError::NotJSON(__)) => {
|
||||
assert!(true);
|
||||
}
|
||||
_ => {
|
||||
assert!(false);
|
||||
}
|
||||
}
|
||||
assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
|
||||
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 4,
|
||||
"popularity": 10,
|
||||
"jambon": "bayonne"
|
||||
}"#);
|
||||
match json_err {
|
||||
@@ -438,7 +472,7 @@ mod tests {
|
||||
assert_eq!(field_name, "jambon");
|
||||
}
|
||||
_ => {
|
||||
assert!(false);
|
||||
panic!("expected additional field 'jambon' to fail but didn't");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -447,6 +481,7 @@ mod tests {
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": "5",
|
||||
"popularity": "10",
|
||||
"jambon": "bayonne"
|
||||
}"#);
|
||||
match json_err {
|
||||
@@ -454,7 +489,7 @@ mod tests {
|
||||
assert!(true);
|
||||
}
|
||||
_ => {
|
||||
assert!(false);
|
||||
panic!("expected string of 5 to fail but didn't");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -462,29 +497,62 @@ mod tests {
|
||||
let json_err = schema.parse_document(r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": -5
|
||||
}"#);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => {
|
||||
assert!(true);
|
||||
}
|
||||
_ => {
|
||||
assert!(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 5000000000
|
||||
"count": -5,
|
||||
"popularity": 10
|
||||
}"#);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
assert!(true);
|
||||
}
|
||||
_ => {
|
||||
assert!(false);
|
||||
panic!("expected -5 to fail but didn't");
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 9223372036854775808,
|
||||
"popularity": 10
|
||||
}"#);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
panic!("expected 9223372036854775808 to fit into u64, but it didn't");
|
||||
}
|
||||
_ => {
|
||||
assert!(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 50,
|
||||
"popularity": 9223372036854775808
|
||||
}"#);
|
||||
match json_err {
|
||||
Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => {
|
||||
assert!(true);
|
||||
},
|
||||
_ => {
|
||||
panic!("expected 9223372036854775808 to overflow i64, but it didn't");
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(r#"{
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 50,
|
||||
}"#);
|
||||
match json_err {
|
||||
Err(NotJSON(_)) => {
|
||||
assert!(true);
|
||||
},
|
||||
_ => {
|
||||
panic!("expected invalid JSON to fail parsing, but it didn't");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,56 +1,82 @@
|
||||
use std::fmt;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::allocate_vec;
|
||||
use common;
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use super::Field;
|
||||
use std::str;
|
||||
|
||||
|
||||
/// Size (in bytes) of the buffer of an int field.
|
||||
const INT_TERM_LEN: usize = 4 + 8;
|
||||
|
||||
/// Term represents the value that the token can take.
|
||||
///
|
||||
/// It actually wraps a `Vec<u8>`.
|
||||
#[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
|
||||
pub struct Term(Vec<u8>);
|
||||
|
||||
/// Extract `field` from Term.
|
||||
pub(crate) fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
|
||||
Field(BigEndian::read_u32(&term_bytes[..4]))
|
||||
}
|
||||
|
||||
impl Term {
|
||||
|
||||
/// Pre-allocate a term buffer.
|
||||
pub fn allocate(field: Field, num_bytes: usize) -> Term {
|
||||
let mut term = Term(Vec::with_capacity(num_bytes));
|
||||
field.serialize(&mut term.0).expect("Serializing term in a Vec should never fail");
|
||||
term
|
||||
}
|
||||
|
||||
/// Set the content of the term.
|
||||
pub fn set_content(&mut self, content: &[u8]) {
|
||||
assert!(content.len() >= 4);
|
||||
self.0.resize(content.len(), 0u8);
|
||||
(&mut self.0[..]).clone_from_slice(content);
|
||||
}
|
||||
|
||||
/// Returns the field id.
|
||||
fn field_id(&self,) -> u8 {
|
||||
self.0[0]
|
||||
}
|
||||
|
||||
/// Returns the field.
|
||||
pub fn field(&self,) -> Field {
|
||||
Field(self.field_id())
|
||||
extract_field_from_term_bytes(&self.0)
|
||||
}
|
||||
|
||||
/// Builds a term given a field, and a u32-value
|
||||
/// Returns the field.
|
||||
pub fn set_field(&mut self, field: Field) {
|
||||
if self.0.len() < 4 {
|
||||
self.0.resize(4, 0u8);
|
||||
}
|
||||
BigEndian::write_u32(&mut self.0[0..4], field.0);
|
||||
}
|
||||
|
||||
/// Builds a term given a field, and a u64-value
|
||||
///
|
||||
/// Assuming the term has a field id of 1, and a u32 value of 3234,
|
||||
/// the Term will have 5 bytes.
|
||||
/// The first byte is `1`, and the 4 following bytes are that of the u32.
|
||||
pub fn from_field_u32(field: Field, val: u32) -> Term {
|
||||
const U32_TERM_LEN: usize = 1 + 4;
|
||||
let mut buffer = allocate_vec(U32_TERM_LEN);
|
||||
buffer[0] = field.0;
|
||||
// we want BigEndian here to have lexicographic order
|
||||
// match the natural order of vals.
|
||||
BigEndian::write_u32(&mut buffer[1..5], val);
|
||||
Term(buffer)
|
||||
/// Assuming the term has a field id of 1, and a u64 value of 3234,
|
||||
/// the Term will have 12 bytes.
|
||||
///
|
||||
/// The first four bytes are dedicated to storing the field id as a u32.
|
||||
/// The 8 following bytes encode the u64 value.
|
||||
pub fn from_field_u64(field: Field, val: u64) -> Term {
|
||||
let mut term = Term(vec![0u8; INT_TERM_LEN]);
|
||||
term.set_field(field);
|
||||
term.set_u64(val);
|
||||
term
|
||||
}
|
||||
|
||||
/// Sets a u64 value in the term.
|
||||
///
|
||||
/// U64 are serialized using (8-byte) BigEndian
|
||||
/// representation.
|
||||
/// The use of BigEndian has the benefit of preserving
|
||||
/// the natural order of the values.
|
||||
pub fn set_u64(&mut self, val: u64) {
|
||||
self.0.resize(INT_TERM_LEN, 0u8);
|
||||
BigEndian::write_u64(&mut self.0[4..], val);
|
||||
}
|
||||
|
||||
/// Builds a term given a field, and a u64-value
|
||||
///
|
||||
/// Assuming the term has a field id of 1, and an i64 value of 3234,
|
||||
/// the Term will have 12 bytes.
|
||||
///
|
||||
/// The first four bytes are dedicated to storing the field id as a u32.
|
||||
/// The 8 following bytes encode the i64 value, mapped to a u64 so that the byte order matches the natural order.
|
||||
pub fn from_field_i64(field: Field, val: i64) -> Term {
|
||||
let val_u64: u64 = common::i64_to_u64(val);
|
||||
Term::from_field_u64(field, val_u64)
|
||||
}
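`common::i64_to_u64` itself is not shown in this diff; the sketch below only illustrates the order-preserving property `from_field_i64` relies on. Flipping the sign bit is the usual way to achieve it, but treat the exact formula as an assumption:

```
fn i64_to_u64(val: i64) -> u64 {
    // flipping the sign bit maps i64::MIN..=i64::MAX onto 0..=u64::MAX monotonically
    (val as u64) ^ (1u64 << 63)
}

assert!(i64_to_u64(-2324) < i64_to_u64(0));
assert!(i64_to_u64(0) < i64_to_u64(2324));
```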
|
||||
|
||||
/// Builds a term given a field, and a string value
|
||||
@@ -60,18 +86,28 @@ impl Term {
|
||||
/// The first four bytes are the field id, and the following bytes are the utf-8
|
||||
/// representation of "abc".
|
||||
pub fn from_field_text(field: Field, text: &str) -> Term {
|
||||
let mut buffer = Vec::with_capacity(1 + text.len());
|
||||
buffer.clear();
|
||||
field.serialize(&mut buffer).unwrap();
|
||||
buffer.extend(text.as_bytes());
|
||||
Term(buffer)
|
||||
let buffer = Vec::with_capacity(4 + text.len());
|
||||
let mut term = Term(buffer);
|
||||
term.set_field(field);
|
||||
term.set_text(text);
|
||||
term
|
||||
}
|
||||
|
||||
/// Assume the term is a u32 field.
|
||||
/// Creates a new Term with an empty buffer,
|
||||
/// but with a given capacity.
|
||||
///
|
||||
/// Panics if the term is not a u32 field.
|
||||
pub fn get_u32(&self) -> u32 {
|
||||
BigEndian::read_u32(&self.0[1..])
|
||||
/// It is declared unsafe, as the term content
|
||||
/// is not initialized, and a call to `.field()`
|
||||
/// would panic.
|
||||
pub(crate) unsafe fn with_capacity(num_bytes: usize) -> Term {
|
||||
Term(Vec::with_capacity(num_bytes))
|
||||
}
|
||||
|
||||
/// Assume the term is a u64 field.
|
||||
///
|
||||
/// Panics if the term is not a u64 field.
|
||||
pub fn get_u64(&self) -> u64 {
|
||||
BigEndian::read_u64(&self.0[4..])
|
||||
}
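Putting the accessors together: the byte layout of an integer term is four big-endian bytes of field id followed by eight big-endian bytes of value. This is only a restatement of the unit test further down:

```
let term = Term::from_field_u64(Field(1), 3234u64);
// 4 bytes of field id + 8 bytes of value
assert_eq!(term.as_slice().len(), 4 + 8);
assert_eq!(&term.as_slice()[0..4], &[0u8, 0u8, 0u8, 1u8]);
assert_eq!(term.get_u64(), 3234u64);
```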
|
||||
|
||||
/// Builds a term from its byte representation.
|
||||
@@ -86,10 +122,10 @@ impl Term {
|
||||
/// (this does not include the field.)
|
||||
///
|
||||
/// If the term is a string, its value is utf-8 encoded.
|
||||
/// If the term is a u32, its value is encoded according
|
||||
/// If the term is a u64, its value is encoded according
|
||||
/// to `byteorder::BigEndian`.
|
||||
pub fn value(&self) -> &[u8] {
|
||||
&self.0[1..]
|
||||
&self.0[4..]
|
||||
}
|
||||
|
||||
/// Returns the text associated with the term.
|
||||
@@ -98,13 +134,13 @@ impl Term {
|
||||
/// If the value is not valid utf-8. This may happen
|
||||
/// if the index is corrupted or if you try to
|
||||
/// call this method on a non-string type.
|
||||
pub unsafe fn text(&self) -> &str {
|
||||
str::from_utf8_unchecked(self.value())
|
||||
pub fn text(&self) -> &str {
|
||||
str::from_utf8(self.value()).expect("Term does not contain valid utf-8.")
|
||||
}
|
||||
|
||||
/// Set the texts only, keeping the field untouched.
|
||||
pub fn set_text(&mut self, text: &str) {
|
||||
self.0.resize(1, 0u8);
|
||||
self.0.resize(4, 0u8);
|
||||
self.0.extend(text.as_bytes());
|
||||
}
|
||||
|
||||
@@ -141,18 +177,22 @@ mod tests {
|
||||
{
|
||||
let term = Term::from_field_text(title_field, "test");
|
||||
assert_eq!(term.field(), title_field);
|
||||
assert_eq!(term.as_slice()[0], 1u8);
|
||||
assert_eq!(&term.as_slice()[1..], "test".as_bytes());
|
||||
assert_eq!(&term.as_slice()[0..4], &[0u8,0u8,0u8,1u8]);
|
||||
assert_eq!(&term.as_slice()[4..], "test".as_bytes());
|
||||
}
|
||||
{
|
||||
let term = Term::from_field_u32(count_field, 983u32);
|
||||
let term = Term::from_field_u64(count_field, 983u64);
|
||||
assert_eq!(term.field(), count_field);
|
||||
assert_eq!(term.as_slice()[0], 2u8);
|
||||
assert_eq!(term.as_slice().len(), 5);
|
||||
assert_eq!(term.as_slice()[1], 0u8);
|
||||
assert_eq!(term.as_slice()[2], 0u8);
|
||||
assert_eq!(term.as_slice()[3], (933u32 / 256u32) as u8);
|
||||
assert_eq!(term.as_slice()[4], (983u32 % 256u32) as u8);
|
||||
assert_eq!(&term.as_slice()[0..4], &[0u8, 0u8, 0u8, 2u8]);
|
||||
assert_eq!(term.as_slice().len(), 4 + 8);
|
||||
assert_eq!(term.as_slice()[4], 0u8);
|
||||
assert_eq!(term.as_slice()[5], 0u8);
|
||||
assert_eq!(term.as_slice()[6], 0u8);
|
||||
assert_eq!(term.as_slice()[7], 0u8);
|
||||
assert_eq!(term.as_slice()[8], 0u8);
|
||||
assert_eq!(term.as_slice()[9], 0u8);
|
||||
assert_eq!(term.as_slice()[10], (983u64 / 256u64) as u8);
|
||||
assert_eq!(term.as_slice()[11], (983u64 % 256u64) as u8);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,12 +1,8 @@
|
||||
use std::ops::BitOr;
|
||||
use rustc_serialize::Decodable;
|
||||
use rustc_serialize::Decoder;
|
||||
use rustc_serialize::Encodable;
|
||||
use rustc_serialize::Encoder;
|
||||
|
||||
|
||||
/// Define how a text field should be handled by tantivy.
|
||||
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
|
||||
#[derive(Clone,Debug,PartialEq,Eq, Serialize, Deserialize)]
|
||||
pub struct TextOptions {
|
||||
indexing: TextIndexingOptions,
|
||||
stored: bool,
|
||||
@@ -51,9 +47,10 @@ impl Default for TextOptions {
|
||||
|
||||
|
||||
/// Describe how a field should be indexed
|
||||
#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash)]
|
||||
#[derive(Clone,Copy,Debug,PartialEq,PartialOrd,Eq,Hash, Serialize, Deserialize)]
|
||||
pub enum TextIndexingOptions {
|
||||
/// Unindexed fields will not generate any postings. They will not be searchable either.
|
||||
#[serde(rename="unindexed")]
|
||||
Unindexed,
|
||||
/// Untokenized means that the field text will not be split into tokens before being indexed.
|
||||
/// A field with the value "Hello world", will have the document suscribe to one single
|
||||
@@ -61,62 +58,26 @@ pub enum TextIndexingOptions {
|
||||
///
|
||||
/// It will **not** be searchable if the user enter "hello" for instance.
|
||||
/// This can be useful for tags, or ids for instance.
|
||||
#[serde(rename="untokenized")]
|
||||
Untokenized,
|
||||
/// TokenizedNoFreq will tokenize the field value, and append the document doc id
|
||||
/// to the posting lists associated to all of the tokens.
|
||||
/// The frequency of appearance of the term in the document, however, will be lost.
|
||||
/// The term frequency used in the TfIdf formula will always be 1.
|
||||
#[serde(rename="tokenize")]
|
||||
TokenizedNoFreq,
|
||||
/// TokenizedWithFreq will tokenize the field value, and encode
|
||||
/// both the docid and the term frequency in the posting lists associated to all
/// of the tokens.
#[serde(rename="freq")]
|
||||
TokenizedWithFreq,
|
||||
/// Like TokenizedWithFreq, but also encodes the positions of the
|
||||
/// terms in a separate file. This option is required for phrase queries.
|
||||
/// Don't use this if you are certain you won't need it, the term positions file can be very big.
|
||||
#[serde(rename="position")]
|
||||
TokenizedWithFreqAndPosition,
|
||||
}
|
||||
|
||||
impl Encodable for TextIndexingOptions {
|
||||
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
|
||||
let name = match *self {
|
||||
TextIndexingOptions::Unindexed => {
|
||||
"unindexed"
|
||||
}
|
||||
TextIndexingOptions::Untokenized => {
|
||||
"untokenized"
|
||||
}
|
||||
TextIndexingOptions::TokenizedNoFreq => {
|
||||
"tokenize"
|
||||
}
|
||||
TextIndexingOptions::TokenizedWithFreq => {
|
||||
"freq"
|
||||
}
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => {
|
||||
"position"
|
||||
}
|
||||
};
|
||||
s.emit_str(name)
|
||||
}
|
||||
}
|
||||
|
||||
impl Decodable for TextIndexingOptions {
|
||||
fn decode<D: Decoder>(d: &mut D) -> Result<Self, D::Error> {
|
||||
use self::TextIndexingOptions::*;
|
||||
let option_name: String = try!(d.read_str());
|
||||
Ok(match option_name.as_ref() {
|
||||
"unindexed" => Unindexed,
|
||||
"untokenized" => Untokenized,
|
||||
"tokenize" => TokenizedNoFreq,
|
||||
"freq" => TokenizedWithFreq,
|
||||
"position" => TokenizedWithFreqAndPosition,
|
||||
_ => {
|
||||
return Err(d.error(&format!("Encoding option {:?} unknown", option_name)));
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TextIndexingOptions {
|
||||
|
||||
/// Returns true iff the term frequency will be encoded.
|
||||
|
||||
@@ -1,17 +1,65 @@
|
||||
|
||||
use common::BinarySerializable;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
use std::fmt;
|
||||
use serde::{Serialize, Serializer, Deserialize, Deserializer};
|
||||
use serde::de::Visitor;
|
||||
|
||||
/// Value represents the value of any field.
|
||||
/// It is an enum over all of the possible field types.
|
||||
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd, RustcEncodable, RustcDecodable)]
|
||||
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
pub enum Value {
|
||||
/// The str type is used for any text information.
|
||||
Str(String),
|
||||
/// Unsigned 32-bits Integer `u32`
|
||||
U32(u32),
|
||||
/// Unsigned 64-bits Integer `u64`
|
||||
U64(u64),
|
||||
/// Signed 64-bits Integer `i64`
|
||||
I64(i64)
|
||||
}

impl Serialize for Value {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
        where S: Serializer
    {
        match *self {
            Value::Str(ref v) => serializer.serialize_str(v),
            Value::U64(u) => serializer.serialize_u64(u),
            Value::I64(i) => serializer.serialize_i64(i),
        }
    }
}

impl<'de> Deserialize<'de> for Value {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
        where D: Deserializer<'de>
    {
        struct ValueVisitor;

        impl<'de> Visitor<'de> for ValueVisitor {
            type Value = Value;

            fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
                formatter.write_str("a string, a u64 or an i64")
            }

            fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E> {
                Ok(Value::U64(v))
            }

            fn visit_i64<E>(self, v: i64) -> Result<Self::Value, E> {
                Ok(Value::I64(v))
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
                Ok(Value::Str(v.to_owned()))
            }

            fn visit_string<E>(self, v: String) -> Result<Self::Value, E> {
                Ok(Value::Str(v))
            }
        }

        deserializer.deserialize_any(ValueVisitor)
    }
}
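
A minimal sketch of how these two impls behave over JSON. It is not part of the commit; it assumes Value is reachable as tantivy::schema::Value (the exact path may differ) and that serde_json is available:

extern crate serde_json;
extern crate tantivy;

use tantivy::schema::Value;

fn main() {
    // Serialization is untagged: Str becomes a JSON string, U64/I64 become JSON numbers.
    assert_eq!(serde_json::to_string(&Value::U64(42)).unwrap(), "42");
    assert_eq!(serde_json::to_string(&Value::Str("hi".to_owned())).unwrap(), "\"hi\"");

    // Deserialization goes through the visitor: serde_json calls visit_u64 for
    // non-negative integers, visit_i64 for negative ones, and visit_str for strings.
    let v: Value = serde_json::from_str("-3").unwrap();
    assert_eq!(v, Value::I64(-3));
    let v: Value = serde_json::from_str("\"tag\"").unwrap();
    assert_eq!(v, Value::Str("tag".to_owned()));
}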

impl Value {
@@ -30,13 +78,28 @@ impl Value {
        }
    }

    /// Returns the u32-value, provided the value is of the `U32` type.
    /// Returns the u64-value, provided the value is of the `U64` type.
    ///
    /// # Panics
    /// If the value is not of type `U32`
    pub fn u32_value(&self) -> u32 {
    /// If the value is not of type `U64`
    pub fn u64_value(&self) -> u64 {
        match *self {
            Value::U32(ref value) => {
            Value::U64(ref value) => {
                *value
            }
            _ => {
                panic!("This is not a u64 field.")
            }
        }
    }

    /// Returns the i64-value, provided the value is of the `I64` type.
    ///
    /// # Panics
    /// If the value is not of type `I64`
    pub fn i64_value(&self) -> i64 {
        match *self {
            Value::I64(ref value) => {
                *value
            }
            _ => {
@@ -53,9 +116,15 @@ impl From<String> for Value {
    }
}


impl From<u32> for Value {
    fn from(v: u32) -> Value {
        Value::U32(v)
impl From<u64> for Value {
    fn from(v: u64) -> Value {
        Value::U64(v)
    }
}

impl From<i64> for Value {
    fn from(v: i64) -> Value {
        Value::I64(v)
    }
}

@@ -65,39 +134,53 @@ impl<'a> From<&'a str> for Value {
    }
}

const TEXT_CODE: u8 = 0;
const U32_CODE: u8 = 1;
mod binary_serialize {
    use common::BinarySerializable;
    use std::io::{self, Read, Write};
    use super::Value;

    const TEXT_CODE: u8 = 0;
    const U64_CODE: u8 = 1;
    const I64_CODE: u8 = 2;

impl BinarySerializable for Value {
    fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
        let mut written_size = 0;
        match *self {
            Value::Str(ref text) => {
                written_size += try!(TEXT_CODE.serialize(writer));
                written_size += try!(text.serialize(writer));
            },
            Value::U32(ref val) => {
                written_size += try!(U32_CODE.serialize(writer));
                written_size += try!(val.serialize(writer));
            },
    impl BinarySerializable for Value {
        fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
            let mut written_size = 0;
            match *self {
                Value::Str(ref text) => {
                    written_size += try!(TEXT_CODE.serialize(writer));
                    written_size += try!(text.serialize(writer));
                },
                Value::U64(ref val) => {
                    written_size += try!(U64_CODE.serialize(writer));
                    written_size += try!(val.serialize(writer));
                },
                Value::I64(ref val) => {
                    written_size += try!(I64_CODE.serialize(writer));
                    written_size += try!(val.serialize(writer));
                },
            }
            Ok(written_size)
        }
        fn deserialize(reader: &mut Read) -> io::Result<Self> {
            let type_code = try!(u8::deserialize(reader));
            match type_code {
                TEXT_CODE => {
                    let text = try!(String::deserialize(reader));
                    Ok(Value::Str(text))
                }
                U64_CODE => {
                    let value = try!(u64::deserialize(reader));
                    Ok(Value::U64(value))
                }
                I64_CODE => {
                    let value = try!(i64::deserialize(reader));
                    Ok(Value::I64(value))
                }
                _ => {
                    Err(io::Error::new(io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code)))
                }
            }
        }
        Ok(written_size)
    }
    fn deserialize(reader: &mut Read) -> io::Result<Self> {
        let type_code = try!(u8::deserialize(reader));
        match type_code {
            TEXT_CODE => {
                let text = try!(String::deserialize(reader));
                Ok(Value::Str(text))
            }
            U32_CODE => {
                let value = try!(u32::deserialize(reader));
                Ok(Value::U32(value))
            }
            _ => {
                Err(io::Error::new(io::ErrorKind::InvalidData, format!("No field type is associated with code {:?}", type_code)))
            }
        }
    }
}
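
A minimal round-trip sketch for the BinarySerializable impl above, written as a test module that could live in the same file. It is not part of the commit; the module name and placement are illustrative, and it relies only on the common::BinarySerializable import the file already uses, the From<i64> conversion, the i64_value accessor, and the type codes defined above (0 = Str, 1 = U64, 2 = I64):

#[cfg(test)]
mod value_round_trip_tests {
    use common::BinarySerializable;
    use super::Value;

    #[test]
    fn test_binary_round_trip() {
        // Construct via the From<i64> conversion, then read it back with the typed accessor.
        let original = Value::from(-12i64);
        assert_eq!(original.i64_value(), -12);

        // &mut Vec<u8> implements io::Write, so it can act as the writer.
        let mut buffer: Vec<u8> = Vec::new();
        original.serialize(&mut buffer).unwrap();
        // The first byte written is the type code (2 = I64), followed by the payload.
        assert_eq!(buffer[0], 2u8);

        // &[u8] implements io::Read, so a slice over the buffer can act as the reader.
        let restored = Value::deserialize(&mut &buffer[..]).unwrap();
        assert_eq!(restored, original);
    }
}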