mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-02 07:22:53 +00:00
Compare commits
370 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
97b7984200 | ||
|
|
8683718159 | ||
|
|
0cf274135b | ||
|
|
a3b44773bb | ||
|
|
ec7c582109 | ||
|
|
ee7ab72fb1 | ||
|
|
e82859f2e6 | ||
|
|
fdb9c3c516 | ||
|
|
6fb114224a | ||
|
|
2c3e33895a | ||
|
|
d512b53688 | ||
|
|
c8afd2b55d | ||
|
|
3fd6d7125b | ||
|
|
de6a3987a9 | ||
|
|
3dedc465fa | ||
|
|
f16cc6367e | ||
|
|
4026fc5fb1 | ||
|
|
43742a93ef | ||
|
|
2a843d86cb | ||
|
|
9a706c296a | ||
|
|
5ff8123b7a | ||
|
|
6061158506 | ||
|
|
4e8b0e89d9 | ||
|
|
0540ebb49e | ||
|
|
ef94582203 | ||
|
|
2f242d5f52 | ||
|
|
da3d372e6e | ||
|
|
42fd3fe5c7 | ||
|
|
5dae6e6bbc | ||
|
|
e608e0a1df | ||
|
|
6c8c90d348 | ||
|
|
eb50e92ec4 | ||
|
|
20bede9462 | ||
|
|
4640ab4e65 | ||
|
|
cd51ed0f9f | ||
|
|
6676fe5717 | ||
|
|
292bb17346 | ||
|
|
0300e7272b | ||
|
|
8760899fa2 | ||
|
|
c89d570a79 | ||
|
|
1da06d867b | ||
|
|
76e8db6ed3 | ||
|
|
31e5580bfa | ||
|
|
930d3db2f7 | ||
|
|
1593e1dc6f | ||
|
|
e0189fc9e6 | ||
|
|
ffdb4ef0a7 | ||
|
|
58845344c2 | ||
|
|
548ec9ecca | ||
|
|
86b700fa93 | ||
|
|
e95c49e749 | ||
|
|
f3033a8469 | ||
|
|
c4125bda59 | ||
|
|
a7ffc0e610 | ||
|
|
9370427ae2 | ||
|
|
1fc7afa90a | ||
|
|
6a104e4f69 | ||
|
|
920f086e1d | ||
|
|
13aaca7e11 | ||
|
|
df53dc4ceb | ||
|
|
dd028841e8 | ||
|
|
eb84b8a60d | ||
|
|
c05f46ad0e | ||
|
|
435ff9d524 | ||
|
|
fdd5dd8496 | ||
|
|
fb5476d5de | ||
|
|
dd8332c327 | ||
|
|
63d201150b | ||
|
|
b78efdc59f | ||
|
|
5cb08f7996 | ||
|
|
1947a19700 | ||
|
|
271b019420 | ||
|
|
340693184f | ||
|
|
97782a9511 | ||
|
|
930010aa88 | ||
|
|
7f5b07d4e7 | ||
|
|
3edb3dce6a | ||
|
|
1edaf7a312 | ||
|
|
137906ff29 | ||
|
|
143a143cde | ||
|
|
4f5ce12a77 | ||
|
|
813efa4ab3 | ||
|
|
c3b6c1dc0b | ||
|
|
6f5e0ef6f4 | ||
|
|
7224f58895 | ||
|
|
49519c3f61 | ||
|
|
cb11b92505 | ||
|
|
7b2dcfbd91 | ||
|
|
d2e30e6681 | ||
|
|
ef109927b3 | ||
|
|
44e5c4dfd3 | ||
|
|
6f223253ea | ||
|
|
f7b0392bd5 | ||
|
|
442bc9a1b8 | ||
|
|
db7d784573 | ||
|
|
79132e803a | ||
|
|
9e132b7dde | ||
|
|
1e55189db1 | ||
|
|
8b1b389a76 | ||
|
|
46f3ec87a5 | ||
|
|
f24e5f405e | ||
|
|
2589be3984 | ||
|
|
a02a9294e4 | ||
|
|
8023445b63 | ||
|
|
05ce093f97 | ||
|
|
6937e23a56 | ||
|
|
974c321153 | ||
|
|
f30ec9b36b | ||
|
|
acd7c1ea2d | ||
|
|
aaeeda2bc5 | ||
|
|
ac4d433fad | ||
|
|
a298c084e6 | ||
|
|
185a72b341 | ||
|
|
bb41ae76f9 | ||
|
|
74d32e522a | ||
|
|
927dd1ee6f | ||
|
|
2c9302290f | ||
|
|
426cc436da | ||
|
|
68d42c9cf2 | ||
|
|
ca49d6130f | ||
|
|
3588ca0561 | ||
|
|
7c6cdcd876 | ||
|
|
71366b9a56 | ||
|
|
a3247ebcfb | ||
|
|
3ec13a8719 | ||
|
|
f8593c76d5 | ||
|
|
f8710bd4b0 | ||
|
|
8d05b8f7b2 | ||
|
|
fc25516b7a | ||
|
|
5b1e71947f | ||
|
|
69351fb4a5 | ||
|
|
3d0082d020 | ||
|
|
8e450c770a | ||
|
|
a757902aed | ||
|
|
b3a8074826 | ||
|
|
4289625348 | ||
|
|
850f10c1fe | ||
|
|
d7f9bfdfc5 | ||
|
|
d0d5db4515 | ||
|
|
303fc7e820 | ||
|
|
744edb2c5c | ||
|
|
2d70efb7b0 | ||
|
|
eb5b2ffdcc | ||
|
|
38513014d5 | ||
|
|
9cb7a0f6e6 | ||
|
|
8d466b8a76 | ||
|
|
413d0e1719 | ||
|
|
0eb3c872fd | ||
|
|
f9203228be | ||
|
|
8f377b92d0 | ||
|
|
1e89f86267 | ||
|
|
d1f61a50c1 | ||
|
|
2bb85ed575 | ||
|
|
236fa74767 | ||
|
|
63b35dd87b | ||
|
|
efb910f4e8 | ||
|
|
aff7e64d4e | ||
|
|
92a3f3981f | ||
|
|
447a9361d8 | ||
|
|
5f59139484 | ||
|
|
27c373d26d | ||
|
|
80ae136646 | ||
|
|
52b1398702 | ||
|
|
7b9cd09a6e | ||
|
|
4c423ad2ca | ||
|
|
9f542d5252 | ||
|
|
77d8e81ae4 | ||
|
|
76e07b9705 | ||
|
|
ea4e9fdaf1 | ||
|
|
e418bee693 | ||
|
|
af4f1a86bc | ||
|
|
753b639454 | ||
|
|
5907a47547 | ||
|
|
586a6e62a2 | ||
|
|
fdae0eff5a | ||
|
|
6eea407f20 | ||
|
|
1ba51d4dc4 | ||
|
|
6e742d5145 | ||
|
|
1843259e91 | ||
|
|
4ebacb7297 | ||
|
|
fb75e60c6e | ||
|
|
04b15c6c11 | ||
|
|
b05b5f5487 | ||
|
|
4fe96483bc | ||
|
|
09e27740e2 | ||
|
|
e51feea574 | ||
|
|
93e7f28cc0 | ||
|
|
8875b9794a | ||
|
|
f26874557e | ||
|
|
a7d10b65ae | ||
|
|
e120e3b7aa | ||
|
|
90fcfb3f43 | ||
|
|
e547e8abad | ||
|
|
5aa4565424 | ||
|
|
3637620187 | ||
|
|
a94679d74d | ||
|
|
a35a8638cc | ||
|
|
97a051996f | ||
|
|
69525cb3c7 | ||
|
|
63867a7150 | ||
|
|
19c073385a | ||
|
|
0521844e56 | ||
|
|
8d4778f94d | ||
|
|
1d5464351d | ||
|
|
522ebdc674 | ||
|
|
4a805733db | ||
|
|
568d149db8 | ||
|
|
4cfc9806c0 | ||
|
|
37042e3ccb | ||
|
|
b316cd337a | ||
|
|
c04991e5ad | ||
|
|
c59b712eeb | ||
|
|
da61baed3b | ||
|
|
b6140d2962 | ||
|
|
6a9a71bb1b | ||
|
|
e8fc4c77e2 | ||
|
|
80837601ea | ||
|
|
2b2703cf51 | ||
|
|
d79018a7f8 | ||
|
|
d8a7c428f7 | ||
|
|
45595234cc | ||
|
|
1bcebdd29e | ||
|
|
ed0333a404 | ||
|
|
ac0b1a21eb | ||
|
|
6bbc789d84 | ||
|
|
87152daef3 | ||
|
|
e0fce4782a | ||
|
|
a633c2a49a | ||
|
|
51623d593e | ||
|
|
29bf740ddf | ||
|
|
511bd25a31 | ||
|
|
66e14ac1b1 | ||
|
|
09e94072ba | ||
|
|
6c68136d31 | ||
|
|
aaf1b2c6b6 | ||
|
|
8a6af2aefa | ||
|
|
7a6e62976b | ||
|
|
2712930bd6 | ||
|
|
cb05f8c098 | ||
|
|
c0c9d04ca9 | ||
|
|
7ea5e740e0 | ||
|
|
2afa6c372a | ||
|
|
c7db8866b5 | ||
|
|
02d992324a | ||
|
|
4ab511ffc6 | ||
|
|
f318172ea4 | ||
|
|
581449a824 | ||
|
|
272589a381 | ||
|
|
73d54c6379 | ||
|
|
3e4606de5d | ||
|
|
020779f61b | ||
|
|
835936585f | ||
|
|
bdd05e97d1 | ||
|
|
2be5f08cd6 | ||
|
|
3f49d65a87 | ||
|
|
f9baf4bcc8 | ||
|
|
7ee93fbed5 | ||
|
|
57a5547ae8 | ||
|
|
c57ab6a335 | ||
|
|
02bfa9be52 | ||
|
|
b3f62b8acc | ||
|
|
2a08c247af | ||
|
|
d2926b6ee0 | ||
|
|
0272167c2e | ||
|
|
a9cf0bde16 | ||
|
|
5a457df45d | ||
|
|
ca76fd5ba0 | ||
|
|
e79a316e41 | ||
|
|
733f54d80e | ||
|
|
7b2b181652 | ||
|
|
b3f39f2343 | ||
|
|
a13122d392 | ||
|
|
113917c521 | ||
|
|
1352b95b07 | ||
|
|
c0538dbe9a | ||
|
|
0d5ea98132 | ||
|
|
0404df3fd5 | ||
|
|
a67caee141 | ||
|
|
f5fb29422a | ||
|
|
4e48bbf0ea | ||
|
|
6fea510869 | ||
|
|
39958ec476 | ||
|
|
36f51e289e | ||
|
|
5c83153035 | ||
|
|
8e407bb314 | ||
|
|
103ba6ba35 | ||
|
|
3965b26cd2 | ||
|
|
1cd0b378fb | ||
|
|
92f383fa51 | ||
|
|
6ae34d2a77 | ||
|
|
1af1f7e0d1 | ||
|
|
feec2e2620 | ||
|
|
3e2ad7542d | ||
|
|
ac02c76b1e | ||
|
|
e5c7c0b8b9 | ||
|
|
49dbe4722f | ||
|
|
f64ff77424 | ||
|
|
2bf93e9e51 | ||
|
|
3dde748b25 | ||
|
|
1dabe26395 | ||
|
|
5590537739 | ||
|
|
ccf0f9cb2f | ||
|
|
e21913ecdc | ||
|
|
2cc826adc7 | ||
|
|
4d90d8fc1d | ||
|
|
0606a8ae73 | ||
|
|
03564214e7 | ||
|
|
4c8f9742f8 | ||
|
|
a23b7a1815 | ||
|
|
6f89a86b14 | ||
|
|
b2beac1203 | ||
|
|
8cd5a2d81d | ||
|
|
b26c22ada0 | ||
|
|
8a35259300 | ||
|
|
db56167a5d | ||
|
|
ab66ffed4e | ||
|
|
e04f2f0b08 | ||
|
|
7a5df33c85 | ||
|
|
ee0873dd07 | ||
|
|
695c8828b8 | ||
|
|
4ff7dc7a4f | ||
|
|
69832bfd03 | ||
|
|
ecbdd70c37 | ||
|
|
fb1b2be782 | ||
|
|
9cd7458978 | ||
|
|
4c4c28e2c4 | ||
|
|
9f9e588905 | ||
|
|
6fd17e0ead | ||
|
|
65dc5b0d83 | ||
|
|
15d15c01f8 | ||
|
|
106832a66a | ||
|
|
477b9136b9 | ||
|
|
7852d097b8 | ||
|
|
0bd56241bb | ||
|
|
54ab897755 | ||
|
|
1369d2d144 | ||
|
|
d3f829dc8a | ||
|
|
e82ccf9627 | ||
|
|
d3d29f7f54 | ||
|
|
3566717979 | ||
|
|
90bc3e3773 | ||
|
|
ffb62b6835 | ||
|
|
4f9ce91d6a | ||
|
|
3c3a2fbfe8 | ||
|
|
0508571d1a | ||
|
|
7b733dd34f | ||
|
|
2c798e3147 | ||
|
|
2c13f210bc | ||
|
|
0dad02791c | ||
|
|
2947364ae1 | ||
|
|
05111599b3 | ||
|
|
83263eabbb | ||
|
|
5cb5c9a8f2 | ||
|
|
9ab92b7739 | ||
|
|
962bddfbbf | ||
|
|
26cfe2909f | ||
|
|
afdfb1a69b | ||
|
|
b26ad1d57a | ||
|
|
1dbd54edbb | ||
|
|
deb04eb090 | ||
|
|
bed34bf502 | ||
|
|
80f1e26c3b | ||
|
|
3e68b61d8f | ||
|
|
95bfb71901 | ||
|
|
74e10843a7 | ||
|
|
1b922e6d23 | ||
|
|
a7c6c31538 | ||
|
|
9d071c8d46 | ||
|
|
04074f7bcb | ||
|
|
8a28d1643d |
6
.gitignore
vendored
6
.gitignore
vendored
@@ -1,3 +1,4 @@
|
||||
*.swp
|
||||
target
|
||||
target/debug
|
||||
.vscode
|
||||
@@ -5,4 +6,7 @@ target/release
|
||||
Cargo.lock
|
||||
benchmark
|
||||
.DS_Store
|
||||
cpp/simdcomp/bitpackingbenchmark
|
||||
cpp/simdcomp/bitpackingbenchmark
|
||||
*.bk
|
||||
.idea
|
||||
trace.dat
|
||||
|
||||
27
.travis.yml
27
.travis.yml
@@ -1,4 +1,6 @@
|
||||
language: rust
|
||||
sudo: required
|
||||
cache: cargo
|
||||
rust:
|
||||
- nightly
|
||||
env:
|
||||
@@ -11,6 +13,7 @@ addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
- kalakris-cmake
|
||||
packages:
|
||||
- gcc-4.8
|
||||
- g++-4.8
|
||||
@@ -18,18 +21,18 @@ addons:
|
||||
- libelf-dev
|
||||
- libdw-dev
|
||||
- binutils-dev
|
||||
- cmake
|
||||
before_script:
|
||||
- |
|
||||
pip install 'travis-cargo<0.2' --user &&
|
||||
export PATH=$HOME/.local/bin:$PATH
|
||||
- export PATH=$HOME/.cargo/bin:$PATH
|
||||
- cargo install cargo-update || echo "cargo-update already installed"
|
||||
- cargo install cargo-travis || echo "cargo-travis already installed"
|
||||
- cargo install-update -a # update outdated cached binaries
|
||||
script:
|
||||
- |
|
||||
travis-cargo build &&
|
||||
travis-cargo test &&
|
||||
travis-cargo bench &&
|
||||
travis-cargo doc
|
||||
- cargo build
|
||||
- cargo test
|
||||
- cargo test -- --ignored
|
||||
- cargo run --example simple_search
|
||||
- cargo doc
|
||||
after_success:
|
||||
- bash ./script/build-doc.sh
|
||||
- travis-cargo doc-upload
|
||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then travis-cargo coveralls --no-sudo --verify; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then ./kcov/build/src/kcov --verify --coveralls-id=$TRAVIS_JOB_ID --include-path=`pwd`/src --exclude-path=`pwd`/cpp --exclude-pattern=/.cargo target/kcov target/debug/tantivy-*; fi
|
||||
- cargo coveralls --exclude-pattern cpp/,src/functional_test.rs
|
||||
- cargo doc-upload
|
||||
|
||||
13
.vimrc
Normal file
13
.vimrc
Normal file
@@ -0,0 +1,13 @@
|
||||
set wildignore+=*/examples/*
|
||||
|
||||
set tabstop=2
|
||||
set shiftwidth=2
|
||||
set softtabstop=2
|
||||
set expandtab
|
||||
set nosmarttab
|
||||
|
||||
set textwidth=100
|
||||
|
||||
autocmd BufRead *.rs :setlocal tags=./rusty-tags.vi;/
|
||||
autocmd BufWritePost *.rs :silent! exec "!rusty-tags vi -o --quiet --start-dir=" . expand('%:p:h') . "&" | redraw!
|
||||
|
||||
70
CHANGELOG.md
70
CHANGELOG.md
@@ -1,3 +1,72 @@
|
||||
Tantivy 0.5.1
|
||||
==========================
|
||||
- bugfix #254 : tantivy failed if no documents in a segment contained a specific field.
|
||||
|
||||
|
||||
Tantivy 0.5
|
||||
==========================
|
||||
- Faceting
|
||||
- RangeQuery
|
||||
- Configurable tokenization pipeline
|
||||
- Bugfix in PhraseQuery
|
||||
- Various query optimisation
|
||||
- Allowing very large indexes
|
||||
- 64 bits file address
|
||||
- Smarter encoding of the `TermInfo` objects
|
||||
|
||||
|
||||
|
||||
Tantivy 0.4.3
|
||||
==========================
|
||||
|
||||
- Bugfix race condition when deleting files. (#198)
|
||||
|
||||
|
||||
Tantivy 0.4.2
|
||||
==========================
|
||||
|
||||
- Prevent usage of AVX2 instructions (#201)
|
||||
|
||||
|
||||
Tantivy 0.4.1
|
||||
==========================
|
||||
|
||||
- Bugfix for non-indexed fields. (#199)
|
||||
|
||||
|
||||
Tantivy 0.4.0
|
||||
==========================
|
||||
|
||||
- Raise the limit of number of fields (previously 256 fields) (@fulmicoton)
|
||||
- Removed u32 fields. They are replaced by u64 and i64 fields (#65) (@fulmicoton)
|
||||
- Optimized skip in SegmentPostings (#130) (@lnicola)
|
||||
- Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
|
||||
- Using error-chain (@KodrAus)
|
||||
- QueryParser: (@fulmicoton)
|
||||
- Explicit error returned when searched for a term that is not indexed
|
||||
- Searching for a int term via the query parser was broken `(age:1)`
|
||||
- Searching for a non-indexed field returns an explicit Error
|
||||
- Phrase query for non-tokenized field are not tokenized by the query parser.
|
||||
- Faster/Better indexing (@fulmicoton)
|
||||
- using murmurhash2
|
||||
- faster merging
|
||||
- more memory efficient fast field writer (@lnicola )
|
||||
- better handling of collisions
|
||||
- lesser memory usage
|
||||
- Added API, most notably to iterate over ranges of terms (@fulmicoton)
|
||||
- Bugfix that was preventing to unmap segment files, on index drop (@fulmicoton)
|
||||
- Made the doc! macro public (@fulmicoton)
|
||||
- Added an alternative implementation of the streaming dictionary (@fulmicoton)
|
||||
|
||||
|
||||
|
||||
Tantivy 0.3.1
|
||||
==========================
|
||||
|
||||
- Expose a method to trigger files garbage collection
|
||||
|
||||
|
||||
|
||||
Tantivy 0.3
|
||||
==========================
|
||||
|
||||
@@ -19,6 +88,7 @@ You should not expect backward compatibility before
|
||||
tantivy 1.0.
|
||||
|
||||
|
||||
|
||||
New Features
|
||||
------------
|
||||
|
||||
|
||||
33
Cargo.toml
33
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.3.0"
|
||||
version = "0.5.1"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
build = "build.rs"
|
||||
license = "MIT"
|
||||
@@ -14,29 +14,35 @@ keywords = ["search", "information", "retrieval"]
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.0"
|
||||
memmap = "0.4"
|
||||
lazy_static = "0.2.1"
|
||||
tinysegmenter = "0.1.0"
|
||||
regex = "0.2"
|
||||
fst = "0.1.37"
|
||||
fst = "0.2"
|
||||
atomicwrites = "0.1.3"
|
||||
tempfile = "2.1"
|
||||
rustc-serialize = "0.3"
|
||||
log = "0.3.6"
|
||||
combine = "2.2"
|
||||
tempdir = "0.3"
|
||||
bincode = "0.5"
|
||||
libc = {version = "0.2.20", optional=true}
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
libc = { version = "0.2.20", optional=true }
|
||||
num_cpus = "1.2"
|
||||
itertools = "0.5.9"
|
||||
lz4 = "1.20"
|
||||
bit-set = "0.4.0"
|
||||
time = "0.1"
|
||||
uuid = { version = "0.4", features = ["v4", "rustc-serialize"] }
|
||||
uuid = { version = "0.6", features = ["v4", "serde"] }
|
||||
chan = "0.1"
|
||||
version = "2"
|
||||
crossbeam = "0.2"
|
||||
futures = "0.1.9"
|
||||
futures-cpupool = "0.1.2"
|
||||
crossbeam = "0.3"
|
||||
futures = "0.1"
|
||||
futures-cpupool = "0.1"
|
||||
error-chain = "0.8"
|
||||
owning_ref = "0.3"
|
||||
stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "0.1.0"
|
||||
downcast = { version="0.9", features = ["nightly"]}
|
||||
matches = "0.1"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
@@ -46,7 +52,7 @@ rand = "0.3"
|
||||
env_logger = "0.4"
|
||||
|
||||
[build-dependencies]
|
||||
gcc = {version = "0.3", optional=true}
|
||||
cc = { version="1.0.0", optional=true }
|
||||
|
||||
[profile.release]
|
||||
opt-level = 3
|
||||
@@ -57,7 +63,8 @@ debug-assertions = false
|
||||
|
||||
[features]
|
||||
default = ["simdcompression"]
|
||||
simdcompression = ["libc", "gcc"]
|
||||
simdcompression = ["libc", "cc"]
|
||||
streamdict = []
|
||||
|
||||
|
||||
[badges]
|
||||
|
||||
14
README.md
14
README.md
@@ -19,10 +19,10 @@ It is strongly inspired by Lucene's design.
|
||||
- Basic query language
|
||||
- Phrase queries
|
||||
- Incremental indexing
|
||||
- Multithreaded indexing (indexing English Wikipedia takes 4 minutes on my desktop)
|
||||
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
|
||||
- mmap based
|
||||
- optional SIMD integer compression
|
||||
- u32 fast fields (equivalent of doc values in Lucene)
|
||||
- u64 and i64 fast fields (equivalent of doc values in Lucene)
|
||||
- LZ4 compressed document store
|
||||
- Cheesy logo with a horse
|
||||
|
||||
@@ -38,12 +38,10 @@ It will walk you through getting a wikipedia search engine up and running in a f
|
||||
- [For the last released version](https://docs.rs/tantivy/)
|
||||
- [For the last master branch](https://tantivy-search.github.io/tantivy/tantivy/index.html)
|
||||
|
||||
# Compiling
|
||||
# Compiling
|
||||
|
||||
Tantivy requires Rust Nightly because it uses requires the features [`box_syntax`](https://doc.rust-lang.org/stable/book/box-syntax-and-patterns.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), and [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md).
|
||||
By default, `tantivy` uses a git submodule called `simdcomp`.
|
||||
After cloning the repository, you will need to initialize and update
|
||||
the submodules. The project can then be built using `cargo`.
|
||||
The project can then be built using `cargo`.
|
||||
|
||||
git clone git@github.com:tantivy-search/tantivy.git
|
||||
cd tantivy
|
||||
@@ -54,9 +52,9 @@ Alternatively, if you are trying to compile `tantivy` without simd compression,
|
||||
you can disable this functionality. In this case, this submodule is not required
|
||||
and you can compile tantivy by using the `--no-default-features` flag.
|
||||
|
||||
cargo build --no-default-features
|
||||
cargo build --no-default-features
|
||||
|
||||
|
||||
# Contribute
|
||||
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
|
||||
@@ -21,4 +21,5 @@ install:
|
||||
build: false
|
||||
|
||||
test_script:
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
|
||||
- REM SET RUST_BACKTRACE=1 & cargo run --example simple_search
|
||||
|
||||
25
build.rs
25
build.rs
@@ -1,10 +1,11 @@
|
||||
#[cfg(feature = "simdcompression")]
|
||||
mod build {
|
||||
extern crate gcc;
|
||||
extern crate cc;
|
||||
|
||||
pub fn build() {
|
||||
let mut config = gcc::Config::new();
|
||||
config.include("./cpp/simdcomp/include")
|
||||
let mut config = cc::Build::new();
|
||||
config
|
||||
.include("./cpp/simdcomp/include")
|
||||
.file("cpp/simdcomp/src/avxbitpacking.c")
|
||||
.file("cpp/simdcomp/src/simdintegratedbitpacking.c")
|
||||
.file("cpp/simdcomp/src/simdbitpacking.c")
|
||||
@@ -18,18 +19,26 @@ mod build {
|
||||
config.opt_level(3);
|
||||
|
||||
if cfg!(target_env = "msvc") {
|
||||
config.define("NDEBUG", None)
|
||||
config
|
||||
.define("NDEBUG", None)
|
||||
.flag("/Gm-")
|
||||
.flag("/GS-")
|
||||
.flag("/Gy")
|
||||
.flag("/Oi")
|
||||
.flag("/GL");
|
||||
} else {
|
||||
config.flag("-msse4.1")
|
||||
.flag("-march=native");
|
||||
}
|
||||
}
|
||||
|
||||
if !cfg!(target_env = "msvc") {
|
||||
config
|
||||
.include("./cpp/streamvbyte/include")
|
||||
.file("cpp/streamvbyte/src/streamvbyte.c")
|
||||
.file("cpp/streamvbyte/src/streamvbytedelta.c")
|
||||
.flag("-msse4.1")
|
||||
.flag("-march=native")
|
||||
.flag("-std=c99");
|
||||
}
|
||||
|
||||
config.compile("libsimdcomp.a");
|
||||
|
||||
// Workaround for linking static libraries built with /GL
|
||||
@@ -37,6 +46,8 @@ mod build {
|
||||
if !cfg!(debug_assertions) && cfg!(target_env = "msvc") {
|
||||
println!("cargo:rustc-link-lib=dylib=simdcomp");
|
||||
}
|
||||
|
||||
println!("cargo:rerun-if-changed=cpp");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
32
cpp/streamvbyte/.gitignore
vendored
Normal file
32
cpp/streamvbyte/.gitignore
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
# Object files
|
||||
*.o
|
||||
*.ko
|
||||
*.obj
|
||||
*.elf
|
||||
|
||||
# Precompiled Headers
|
||||
*.gch
|
||||
*.pch
|
||||
|
||||
# Libraries
|
||||
*.lib
|
||||
*.a
|
||||
*.la
|
||||
*.lo
|
||||
|
||||
# Shared objects (inc. Windows DLLs)
|
||||
*.dll
|
||||
*.so
|
||||
*.so.*
|
||||
*.dylib
|
||||
|
||||
# Executables
|
||||
*.exe
|
||||
*.out
|
||||
*.app
|
||||
*.i*86
|
||||
*.x86_64
|
||||
*.hex
|
||||
|
||||
# Debug files
|
||||
*.dSYM/
|
||||
7
cpp/streamvbyte/.travis.yml
Normal file
7
cpp/streamvbyte/.travis.yml
Normal file
@@ -0,0 +1,7 @@
|
||||
language: c
|
||||
sudo: false
|
||||
compiler:
|
||||
- gcc
|
||||
- clang
|
||||
|
||||
script: make && ./unit
|
||||
202
cpp/streamvbyte/LICENSE
Normal file
202
cpp/streamvbyte/LICENSE
Normal file
@@ -0,0 +1,202 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "{}"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright {yyyy} {name of copyright owner}
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
60
cpp/streamvbyte/README.md
Normal file
60
cpp/streamvbyte/README.md
Normal file
@@ -0,0 +1,60 @@
|
||||
streamvbyte
|
||||
===========
|
||||
[](https://travis-ci.org/lemire/streamvbyte)
|
||||
|
||||
StreamVByte is a new integer compression technique that applies SIMD instructions (vectorization) to
|
||||
Google's Group Varint approach. The net result is faster than other byte-oriented compression
|
||||
techniques.
|
||||
|
||||
The approach is patent-free, the code is available under the Apache License.
|
||||
|
||||
|
||||
It includes fast differential coding.
|
||||
|
||||
It assumes a recent Intel processor (e.g., haswell or better) .
|
||||
|
||||
The code should build using most standard-compliant C99 compilers. The provided makefile
|
||||
expects a Linux-like system.
|
||||
|
||||
|
||||
Usage:
|
||||
|
||||
make
|
||||
./unit
|
||||
|
||||
See example.c for an example.
|
||||
|
||||
Short code sample:
|
||||
```C
|
||||
// suppose that datain is an array of uint32_t integers
|
||||
size_t compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding
|
||||
// here the result is stored in compressedbuffer using compsize bytes
|
||||
streamvbyte_decode(compressedbuffer, recovdata, N); // decoding (fast)
|
||||
```
|
||||
|
||||
If the values are sorted, then it might be preferable to use differential coding:
|
||||
```C
|
||||
// suppose that datain is an array of uint32_t integers
|
||||
size_t compsize = streamvbyte_delta_encode(datain, N, compressedbuffer,0); // encoding
|
||||
// here the result is stored in compressedbuffer using compsize bytes
|
||||
streamvbyte_delta_decode(compressedbuffer, recovdata, N,0); // decoding (fast)
|
||||
```
|
||||
You have to know how many integers were coded when you decompress. You can store this
|
||||
information along with the compressed stream.
|
||||
|
||||
See also
|
||||
--------
|
||||
* SIMDCompressionAndIntersection: A C++ library to compress and intersect sorted lists of integers using SIMD instructions https://github.com/lemire/SIMDCompressionAndIntersect
|
||||
* The FastPFOR C++ library : Fast integer compression https://github.com/lemire/FastPFor
|
||||
* High-performance dictionary coding https://github.com/lemire/dictionary
|
||||
* LittleIntPacker: C library to pack and unpack short arrays of integers as fast as possible https://github.com/lemire/LittleIntPacker
|
||||
* The SIMDComp library: A simple C library for compressing lists of integers using binary packing https://github.com/lemire/simdcomp
|
||||
* MaskedVByte: Fast decoder for VByte-compressed integers https://github.com/lemire/MaskedVByte
|
||||
* CSharpFastPFOR: A C# integer compression library https://github.com/Genbox/CSharpFastPFOR
|
||||
* JavaFastPFOR: A java integer compression library https://github.com/lemire/JavaFastPFOR
|
||||
* Encoding: Integer Compression Libraries for Go https://github.com/zhenjl/encoding
|
||||
* FrameOfReference is a C++ library dedicated to frame-of-reference (FOR) compression: https://github.com/lemire/FrameOfReference
|
||||
* libvbyte: A fast implementation for varbyte 32bit/64bit integer compression https://github.com/cruppstahl/libvbyte
|
||||
* TurboPFor is a C library that offers lots of interesting optimizations. Well worth checking! (GPL license) https://github.com/powturbo/TurboPFor
|
||||
* Oroch is a C++ library that offers a usable API (MIT license) https://github.com/ademakov/Oroch
|
||||
|
||||
24
cpp/streamvbyte/example.c
Normal file
24
cpp/streamvbyte/example.c
Normal file
@@ -0,0 +1,24 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "streamvbyte.h"
|
||||
|
||||
int main() {
|
||||
int N = 5000;
|
||||
uint32_t * datain = malloc(N * sizeof(uint32_t));
|
||||
uint8_t * compressedbuffer = malloc(N * sizeof(uint32_t));
|
||||
uint32_t * recovdata = malloc(N * sizeof(uint32_t));
|
||||
for (int k = 0; k < N; ++k)
|
||||
datain[k] = 120;
|
||||
size_t compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding
|
||||
// here the result is stored in compressedbuffer using compsize bytes
|
||||
size_t compsize2 = streamvbyte_decode(compressedbuffer, recovdata,
|
||||
N); // decoding (fast)
|
||||
assert(compsize == compsize2);
|
||||
free(datain);
|
||||
free(compressedbuffer);
|
||||
free(recovdata);
|
||||
printf("Compressed %d integers down to %d bytes.\n",N,(int) compsize);
|
||||
return 0;
|
||||
}
|
||||
19
cpp/streamvbyte/include/streamvbyte.h
Normal file
19
cpp/streamvbyte/include/streamvbyte.h
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
#ifndef VARINTDECODE_H_
|
||||
#define VARINTDECODE_H_
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
#include <stdint.h>// please use a C99-compatible compiler
|
||||
#include <stddef.h>
|
||||
|
||||
|
||||
// Encode an array of a given length read from in to bout in varint format.
|
||||
// Returns the number of bytes written.
|
||||
size_t streamvbyte_encode(const uint32_t *in, uint32_t length, uint8_t *out);
|
||||
|
||||
// Read "length" 32-bit integers in varint format from in, storing the result in out.
|
||||
// Returns the number of bytes read.
|
||||
size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t length);
|
||||
|
||||
|
||||
#endif /* VARINTDECODE_H_ */
|
||||
24
cpp/streamvbyte/include/streamvbytedelta.h
Normal file
24
cpp/streamvbyte/include/streamvbytedelta.h
Normal file
@@ -0,0 +1,24 @@
|
||||
/*
|
||||
* streamvbytedelta.h
|
||||
*
|
||||
* Created on: Apr 14, 2016
|
||||
* Author: lemire
|
||||
*/
|
||||
|
||||
#ifndef INCLUDE_STREAMVBYTEDELTA_H_
|
||||
#define INCLUDE_STREAMVBYTEDELTA_H_
|
||||
|
||||
|
||||
// Encode an array of a given length read from in to bout in StreamVByte format.
|
||||
// Returns the number of bytes written.
|
||||
// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
|
||||
size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t length, uint8_t *out, uint32_t prev);
|
||||
|
||||
// Read "length" 32-bit integers in StreamVByte format from in, storing the result in out.
|
||||
// Returns the number of bytes read.
|
||||
// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
|
||||
size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out, uint32_t length, uint32_t prev);
|
||||
|
||||
|
||||
|
||||
#endif /* INCLUDE_STREAMVBYTEDELTA_H_ */
|
||||
58
cpp/streamvbyte/makefile
Normal file
58
cpp/streamvbyte/makefile
Normal file
@@ -0,0 +1,58 @@
|
||||
# minimalist makefile
|
||||
.SUFFIXES:
|
||||
#
|
||||
.SUFFIXES: .cpp .o .c .h
|
||||
|
||||
CFLAGS = -fPIC -march=native -std=c99 -O3 -Wall -Wextra -pedantic -Wshadow
|
||||
LDFLAGS = -shared
|
||||
LIBNAME=libstreamvbyte.so.0.0.1
|
||||
all: unit $(LIBNAME)
|
||||
test:
|
||||
./unit
|
||||
install: $(OBJECTS)
|
||||
cp $(LIBNAME) /usr/local/lib
|
||||
ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libstreamvbyte.so
|
||||
ldconfig
|
||||
cp $(HEADERS) /usr/local/include
|
||||
|
||||
|
||||
|
||||
HEADERS=./include/streamvbyte.h ./include/streamvbytedelta.h
|
||||
|
||||
uninstall:
|
||||
for h in $(HEADERS) ; do rm /usr/local/$$h; done
|
||||
rm /usr/local/lib/$(LIBNAME)
|
||||
rm /usr/local/lib/libstreamvbyte.so
|
||||
ldconfig
|
||||
|
||||
|
||||
OBJECTS= streamvbyte.o streamvbytedelta.o
|
||||
|
||||
|
||||
|
||||
streamvbytedelta.o: ./src/streamvbytedelta.c $(HEADERS)
|
||||
$(CC) $(CFLAGS) -c ./src/streamvbytedelta.c -Iinclude
|
||||
|
||||
|
||||
streamvbyte.o: ./src/streamvbyte.c $(HEADERS)
|
||||
$(CC) $(CFLAGS) -c ./src/streamvbyte.c -Iinclude
|
||||
|
||||
|
||||
|
||||
$(LIBNAME): $(OBJECTS)
|
||||
$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS)
|
||||
|
||||
|
||||
|
||||
|
||||
example: ./example.c $(HEADERS) $(OBJECTS)
|
||||
$(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS)
|
||||
|
||||
unit: ./tests/unit.c $(HEADERS) $(OBJECTS)
|
||||
$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude $(OBJECTS)
|
||||
|
||||
dynunit: ./tests/unit.c $(HEADERS) $(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude -lstreamvbyte
|
||||
|
||||
clean:
|
||||
rm -f unit *.o $(LIBNAME) example
|
||||
495
cpp/streamvbyte/src/streamvbyte.c
Normal file
495
cpp/streamvbyte/src/streamvbyte.c
Normal file
@@ -0,0 +1,495 @@
|
||||
#include "streamvbyte.h"
|
||||
#if defined(_MSC_VER)
|
||||
/* Microsoft C/C++-compatible compiler */
|
||||
#include <intrin.h>
|
||||
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
|
||||
/* GCC-compatible compiler, targeting x86/x86-64 */
|
||||
#include <x86intrin.h>
|
||||
#elif defined(__GNUC__) && defined(__ARM_NEON__)
|
||||
/* GCC-compatible compiler, targeting ARM with NEON */
|
||||
#include <arm_neon.h>
|
||||
#elif defined(__GNUC__) && defined(__IWMMXT__)
|
||||
/* GCC-compatible compiler, targeting ARM with WMMX */
|
||||
#include <mmintrin.h>
|
||||
#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
|
||||
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
|
||||
#include <altivec.h>
|
||||
#elif defined(__GNUC__) && defined(__SPE__)
|
||||
/* GCC-compatible compiler, targeting PowerPC with SPE */
|
||||
#include <spe.h>
|
||||
#endif
|
||||
|
||||
static uint8_t lengthTable[256] = { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,
|
||||
10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
|
||||
9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
|
||||
11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
|
||||
11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10,
|
||||
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
|
||||
12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
|
||||
11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12,
|
||||
13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
|
||||
11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8,
|
||||
9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12,
|
||||
10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
|
||||
13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15,
|
||||
13, 14, 15, 16 };
|
||||
|
||||
static uint8_t shuffleTable[256][16] = { { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1,
|
||||
-1, -1, 3, -1, -1, -1 }, // 1111
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 2111
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 3111
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 4111
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 1211
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 2211
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 3211
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 4211
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 1311
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 2311
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 3311
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1 }, // 4311
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 }, // 1411
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 }, // 2411
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1 }, // 3411
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 }, // 4411
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 }, // 1121
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 2121
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 3121
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 4121
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 1221
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 2221
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 3221
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 4221
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 1321
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 2321
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 3321
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1 }, // 4321
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 }, // 1421
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 }, // 2421
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1 }, // 3421
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 }, // 4421
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1 }, // 1131
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 2131
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 3131
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 4131
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 1231
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 2231
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 3231
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 4231
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 1331
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 2331
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 3331
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1 }, // 4331
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1 }, // 1431
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1 }, // 2431
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1 }, // 3431
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1 }, // 4431
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 }, // 1141
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 2141
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 3141
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 4141
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 1241
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 2241
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 3241
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 4241
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 1341
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 2341
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 3341
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 }, // 4341
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 }, // 1441
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 }, // 2441
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 }, // 3441
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 }, // 4441
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 }, // 1112
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 2112
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 3112
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 4112
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 1212
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 2212
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 3212
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 4212
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 1312
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 2312
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 3312
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 }, // 4312
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 }, // 1412
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 }, // 2412
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 }, // 3412
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 }, // 4412
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 }, // 1122
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 2122
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 3122
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 4122
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 1222
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 2222
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 3222
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 4222
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 1322
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 2322
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 3322
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 }, // 4322
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 }, // 1422
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 }, // 2422
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 }, // 3422
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 }, // 4422
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1 }, // 1132
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 2132
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 3132
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 4132
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 1232
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 2232
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 3232
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 4232
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 1332
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 2332
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 3332
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 }, // 4332
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 }, // 1432
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 }, // 2432
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 }, // 3432
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 }, // 4432
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 }, // 1142
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 2142
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 3142
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 4142
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 1242
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 2242
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 3242
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 4242
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 1342
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 2342
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 3342
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 }, // 4342
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 }, // 1442
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 }, // 2442
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 }, // 3442
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 }, // 4442
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 }, // 1113
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 2113
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 3113
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 4113
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 1213
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 2213
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 3213
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 4213
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 1313
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 2313
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 3313
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 }, // 4313
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 }, // 1413
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 }, // 2413
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 }, // 3413
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 }, // 4413
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 }, // 1123
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 2123
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 3123
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 4123
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 1223
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 2223
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 3223
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 4223
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 1323
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 2323
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 3323
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 }, // 4323
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 }, // 1423
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 }, // 2423
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 }, // 3423
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 }, // 4423
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 }, // 1133
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 2133
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 3133
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 4133
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 1233
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 2233
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 3233
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 4233
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 1333
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 2333
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 3333
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 }, // 4333
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 }, // 1433
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 }, // 2433
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 }, // 3433
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 }, // 4433
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 }, // 1143
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 2143
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 3143
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 4143
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 1243
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 2243
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 3243
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 4243
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 1343
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 2343
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 3343
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 }, // 4343
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 }, // 1443
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 }, // 2443
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 }, // 3443
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 }, // 4443
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 }, // 1114
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 2114
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 3114
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 4114
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 1214
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 2214
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 3214
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 4214
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 1314
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 2314
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 3314
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 }, // 4314
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 }, // 1414
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 }, // 2414
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 }, // 3414
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 }, // 4414
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 }, // 1124
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 2124
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 3124
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 4124
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 1224
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 2224
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 3224
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 4224
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 1324
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 2324
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 3324
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 }, // 4324
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 }, // 1424
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 }, // 2424
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 }, // 3424
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 }, // 4424
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8 }, // 1134
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 2134
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 3134
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 4134
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 1234
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 2234
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 3234
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 4234
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 1334
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 2334
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 3334
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 }, // 4334
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 }, // 1434
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 }, // 2434
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 }, // 3434
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 }, // 4434
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 }, // 1144
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 2144
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 3144
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 4144
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 1244
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 2244
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 3244
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 4244
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 1344
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 2344
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 3344
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 }, // 4344
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, // 1444
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, // 2444
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, // 3444
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } // 4444
|
||||
};
|
||||
|
||||
static uint8_t _encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
|
||||
uint8_t *dataPtr = *dataPtrPtr;
|
||||
uint8_t code;
|
||||
|
||||
if (val < (1 << 8)) { // 1 byte
|
||||
*dataPtr = (uint8_t)(val);
|
||||
*dataPtrPtr += 1;
|
||||
code = 0;
|
||||
} else if (val < (1 << 16)) { // 2 bytes
|
||||
*(uint16_t *) dataPtr = (uint16_t)(val);
|
||||
*dataPtrPtr += 2;
|
||||
code = 1;
|
||||
} else if (val < (1 << 24)) { // 3 bytes
|
||||
*(uint16_t *) dataPtr = (uint16_t)(val);
|
||||
*(dataPtr + 2) = (uint8_t)(val >> 16);
|
||||
*dataPtrPtr += 3;
|
||||
code = 2;
|
||||
} else { // 4 bytes
|
||||
*(uint32_t *) dataPtr = val;
|
||||
*dataPtrPtr += 4;
|
||||
code = 3;
|
||||
}
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
static uint8_t *svb_encode_scalar(const uint32_t *in,
|
||||
uint8_t *__restrict__ keyPtr, uint8_t *__restrict__ dataPtr,
|
||||
uint32_t count) {
|
||||
if (count == 0)
|
||||
return dataPtr; // exit immediately if no data
|
||||
|
||||
uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ...
|
||||
uint8_t key = 0;
|
||||
for (uint32_t c = 0; c < count; c++) {
|
||||
if (shift == 8) {
|
||||
shift = 0;
|
||||
*keyPtr++ = key;
|
||||
key = 0;
|
||||
}
|
||||
uint32_t val = in[c];
|
||||
uint8_t code = _encode_data(val, &dataPtr);
|
||||
key |= code << shift;
|
||||
shift += 2;
|
||||
}
|
||||
|
||||
*keyPtr = key; // write last key (no increment needed)
|
||||
return dataPtr; // pointer to first unused data byte
|
||||
}
|
||||
|
||||
// Encode an array of a given length read from in to bout in streamvbyte format.
|
||||
// Returns the number of bytes written.
|
||||
size_t streamvbyte_encode(const uint32_t *in, uint32_t count, uint8_t *out) {
|
||||
uint8_t *keyPtr = out;
|
||||
uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte
|
||||
uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
|
||||
return svb_encode_scalar(in, keyPtr, dataPtr, count) - out;
|
||||
}
|
||||
|
||||
static inline __m128i _decode_avx(uint32_t key,
|
||||
const uint8_t *__restrict__ *dataPtrPtr) {
|
||||
uint8_t len = lengthTable[key];
|
||||
__m128i Data = _mm_loadu_si128((__m128i *) *dataPtrPtr);
|
||||
__m128i Shuf = *(__m128i *) &shuffleTable[key];
|
||||
|
||||
Data = _mm_shuffle_epi8(Data, Shuf);
|
||||
*dataPtrPtr += len;
|
||||
return Data;
|
||||
}
|
||||
|
||||
static inline void _write_avx(uint32_t *out, __m128i Vec) {
|
||||
_mm_storeu_si128((__m128i *) out, Vec);
|
||||
}
|
||||
|
||||
static inline uint32_t _decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
|
||||
const uint8_t *dataPtr = *dataPtrPtr;
|
||||
uint32_t val;
|
||||
|
||||
if (code == 0) { // 1 byte
|
||||
val = (uint32_t) * dataPtr;
|
||||
dataPtr += 1;
|
||||
} else if (code == 1) { // 2 bytes
|
||||
val = (uint32_t) * (uint16_t *) dataPtr;
|
||||
dataPtr += 2;
|
||||
} else if (code == 2) { // 3 bytes
|
||||
val = (uint32_t) * (uint16_t *) dataPtr;
|
||||
val |= *(dataPtr + 2) << 16;
|
||||
dataPtr += 3;
|
||||
} else { // code == 3
|
||||
val = *(uint32_t *) dataPtr; // 4 bytes
|
||||
dataPtr += 4;
|
||||
}
|
||||
|
||||
*dataPtrPtr = dataPtr;
|
||||
return val;
|
||||
}
|
||||
static const uint8_t *svb_decode_scalar(uint32_t *outPtr, const uint8_t *keyPtr,
|
||||
const uint8_t *dataPtr, uint32_t count) {
|
||||
if (count == 0)
|
||||
return dataPtr; // no reads or writes if no data
|
||||
|
||||
uint8_t shift = 0;
|
||||
uint32_t key = *keyPtr++;
|
||||
for (uint32_t c = 0; c < count; c++) {
|
||||
if (shift == 8) {
|
||||
shift = 0;
|
||||
key = *keyPtr++;
|
||||
}
|
||||
uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3);
|
||||
*outPtr++ = val;
|
||||
shift += 2;
|
||||
}
|
||||
|
||||
return dataPtr; // pointer to first unused byte after end
|
||||
}
|
||||
|
||||
const uint8_t *svb_decode_avx_simple(uint32_t *out,
|
||||
const uint8_t *__restrict__ keyPtr, const uint8_t *__restrict__ dataPtr,
|
||||
uint64_t count) {
|
||||
|
||||
uint64_t keybytes = count / 4; // number of key bytes
|
||||
__m128i Data;
|
||||
if (keybytes >= 8) {
|
||||
|
||||
int64_t Offset = -(int64_t) keybytes / 8 + 1;
|
||||
|
||||
const uint64_t *keyPtr64 = (const uint64_t *) keyPtr - Offset;
|
||||
uint64_t nextkeys = keyPtr64[Offset];
|
||||
for (; Offset != 0; ++Offset) {
|
||||
uint64_t keys = nextkeys;
|
||||
nextkeys = keyPtr64[Offset + 1];
|
||||
|
||||
Data = _decode_avx((keys & 0xFF), &dataPtr);
|
||||
_write_avx(out, Data);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
_write_avx(out + 4, Data);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0xFF), &dataPtr);
|
||||
_write_avx(out + 8, Data);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
_write_avx(out + 12, Data);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0xFF), &dataPtr);
|
||||
_write_avx(out + 16, Data);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
_write_avx(out + 20, Data);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0xFF), &dataPtr);
|
||||
_write_avx(out + 24, Data);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
_write_avx(out + 28, Data);
|
||||
|
||||
out += 32;
|
||||
}
|
||||
{
|
||||
uint64_t keys = nextkeys;
|
||||
|
||||
Data = _decode_avx((keys & 0xFF), &dataPtr);
|
||||
_write_avx(out, Data);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
_write_avx(out + 4, Data);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0xFF), &dataPtr);
|
||||
_write_avx(out + 8, Data);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
_write_avx(out + 12, Data);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0xFF), &dataPtr);
|
||||
_write_avx(out + 16, Data);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
_write_avx(out + 20, Data);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0xFF), &dataPtr);
|
||||
_write_avx(out + 24, Data);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
_write_avx(out + 28, Data);
|
||||
|
||||
out += 32;
|
||||
}
|
||||
}
|
||||
uint64_t consumedkeys = keybytes - (keybytes & 7);
|
||||
return svb_decode_scalar(out, keyPtr + consumedkeys, dataPtr, count & 31);
|
||||
}
|
||||
|
||||
// Read count 32-bit integers in maskedvbyte format from in, storing the result in out. Returns the number of bytes read.
|
||||
size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t count) {
|
||||
if (count == 0)
|
||||
return 0;
|
||||
const uint8_t *keyPtr = in; // full list of keys is next
|
||||
uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up)
|
||||
const uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys
|
||||
return svb_decode_avx_simple(out, keyPtr, dataPtr, count) - in;
|
||||
|
||||
}
|
||||
575
cpp/streamvbyte/src/streamvbytedelta.c
Normal file
575
cpp/streamvbyte/src/streamvbytedelta.c
Normal file
@@ -0,0 +1,575 @@
|
||||
#include "streamvbyte.h"
|
||||
#if defined(_MSC_VER)
|
||||
/* Microsoft C/C++-compatible compiler */
|
||||
#include <intrin.h>
|
||||
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
|
||||
/* GCC-compatible compiler, targeting x86/x86-64 */
|
||||
#include <x86intrin.h>
|
||||
#elif defined(__GNUC__) && defined(__ARM_NEON__)
|
||||
/* GCC-compatible compiler, targeting ARM with NEON */
|
||||
#include <arm_neon.h>
|
||||
#elif defined(__GNUC__) && defined(__IWMMXT__)
|
||||
/* GCC-compatible compiler, targeting ARM with WMMX */
|
||||
#include <mmintrin.h>
|
||||
#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__))
|
||||
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
|
||||
#include <altivec.h>
|
||||
#elif defined(__GNUC__) && defined(__SPE__)
|
||||
/* GCC-compatible compiler, targeting PowerPC with SPE */
|
||||
#include <spe.h>
|
||||
#endif
|
||||
|
||||
static uint8_t lengthTable[256] = { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9,
|
||||
10, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8,
|
||||
9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
|
||||
11, 12, 10, 11, 12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
|
||||
11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10,
|
||||
8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
|
||||
12, 10, 11, 12, 13, 11, 12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10,
|
||||
11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12,
|
||||
13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
|
||||
11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8,
|
||||
9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12,
|
||||
10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
|
||||
13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15,
|
||||
13, 14, 15, 16 };
|
||||
|
||||
static uint8_t shuffleTable[256][16] = { { 0, -1, -1, -1, 1, -1, -1, -1, 2, -1,
|
||||
-1, -1, 3, -1, -1, -1 }, // 1111
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 2111
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 3111
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 4111
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, -1, -1, -1 }, // 1211
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 2211
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 3211
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 4211
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, -1, -1, -1 }, // 1311
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, -1, -1, -1 }, // 2311
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, -1, -1 }, // 3311
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, -1, -1, -1 }, // 4311
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, -1, -1, -1 }, // 1411
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, -1, -1, -1 }, // 2411
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, -1, -1, -1 }, // 3411
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, -1, -1, -1 }, // 4411
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1 }, // 1121
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 2121
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 3121
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 4121
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, -1, -1, -1 }, // 1221
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 2221
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 3221
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 4221
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, -1, -1 }, // 1321
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, -1, -1 }, // 2321
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, -1, -1 }, // 3321
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, -1, -1, -1 }, // 4321
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, -1, -1, -1 }, // 1421
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, -1, -1, -1 }, // 2421
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, -1, -1, -1 }, // 3421
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, -1, -1, -1 }, // 4421
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1 }, // 1131
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 2131
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 3131
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 4131
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, -1, -1 }, // 1231
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 2231
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 3231
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 4231
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, -1, -1 }, // 1331
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, -1, -1 }, // 2331
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, -1, -1 }, // 3331
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, -1, -1, -1 }, // 4331
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, -1, -1 }, // 1431
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, -1, -1, -1 }, // 2431
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, -1, -1, -1 }, // 3431
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, -1, -1, -1 }, // 4431
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1 }, // 1141
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 2141
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 3141
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 4141
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, -1, -1, -1 }, // 1241
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 2241
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 3241
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 4241
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, -1, -1, -1 }, // 1341
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, -1, -1, -1 }, // 2341
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, -1, -1, -1 }, // 3341
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, -1, -1, -1 }, // 4341
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1 }, // 1441
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1 }, // 2441
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1, -1 }, // 3441
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1 }, // 4441
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1 }, // 1112
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 2112
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 3112
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 4112
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, -1, -1 }, // 1212
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 2212
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 3212
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 4212
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, -1, -1 }, // 1312
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, -1, -1 }, // 2312
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, -1, -1 }, // 3312
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, -1, -1 }, // 4312
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, -1, -1 }, // 1412
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, -1, -1 }, // 2412
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, -1, -1 }, // 3412
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, -1, -1 }, // 4412
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1 }, // 1122
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 2122
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 3122
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 4122
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, -1, -1 }, // 1222
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 2222
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 3222
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 4222
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, -1, -1 }, // 1322
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, -1, -1 }, // 2322
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, -1, -1 }, // 3322
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, -1, -1 }, // 4322
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, -1, -1 }, // 1422
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, -1, -1 }, // 2422
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, -1, -1 }, // 3422
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, -1, -1 }, // 4422
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1 }, // 1132
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 2132
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 3132
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 4132
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, -1, -1 }, // 1232
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 2232
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 3232
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 4232
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, -1, -1 }, // 1332
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, -1, -1 }, // 2332
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, -1, -1 }, // 3332
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, -1, -1 }, // 4332
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, -1, -1 }, // 1432
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, -1, -1 }, // 2432
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, -1, -1 }, // 3432
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, -1, -1 }, // 4432
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1 }, // 1142
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 2142
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 3142
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 4142
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, -1, -1 }, // 1242
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 2242
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 3242
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 4242
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, -1, -1 }, // 1342
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1, -1 }, // 2342
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, -1, -1 }, // 3342
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, -1, -1 }, // 4342
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1 }, // 1442
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1, -1 }, // 2442
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1 }, // 3442
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1 }, // 4442
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1 }, // 1113
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 2113
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 3113
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 4113
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, -1 }, // 1213
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 2213
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 3213
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 4213
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, -1 }, // 1313
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, -1 }, // 2313
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1 }, // 3313
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, -1 }, // 4313
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, -1 }, // 1413
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, -1 }, // 2413
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, -1 }, // 3413
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, -1 }, // 4413
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1 }, // 1123
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 2123
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 3123
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 4123
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, -1 }, // 1223
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 2223
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 3223
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 4223
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, -1 }, // 1323
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, -1 }, // 2323
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, -1 }, // 3323
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, -1 }, // 4323
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, -1 }, // 1423
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, -1 }, // 2423
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, -1 }, // 3423
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, -1 }, // 4423
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1 }, // 1133
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 2133
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 3133
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 4133
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, -1 }, // 1233
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 2233
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 3233
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 4233
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, -1 }, // 1333
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, -1 }, // 2333
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1 }, // 3333
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, -1 }, // 4333
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, -1 }, // 1433
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, -1 }, // 2433
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, -1 }, // 3433
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, -1 }, // 4433
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1 }, // 1143
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 2143
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 3143
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 4143
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, -1 }, // 1243
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 2243
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 3243
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 4243
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, -1 }, // 1343
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, -1 }, // 2343
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, -1 }, // 3343
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, -1 }, // 4343
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1 }, // 1443
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1 }, // 2443
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1 }, // 3443
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1 }, // 4443
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6 }, // 1114
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 2114
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 3114
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 4114
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, 5, 6, 7 }, // 1214
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 2214
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 3214
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 4214
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, 7, 8 }, // 1314
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, 8, 9 }, // 2314
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, 9, 10 }, // 3314
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, -1, -1, -1, 8, 9, 10, 11 }, // 4314
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, -1, -1, -1, 6, 7, 8, 9 }, // 1414
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, -1, -1, -1, 7, 8, 9, 10 }, // 2414
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, -1, -1, -1, 8, 9, 10, 11 }, // 3414
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, -1, -1, -1, 9, 10, 11, 12 }, // 4414
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7 }, // 1124
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 2124
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 3124
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 4124
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, 7, 8 }, // 1224
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 2224
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 3224
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 4224
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, 8, 9 }, // 1324
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, 9, 10 }, // 2324
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, 10, 11 }, // 3324
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, -1, -1, 9, 10, 11, 12 }, // 4324
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9, 10 }, // 1424
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, -1, -1, 8, 9, 10, 11 }, // 2424
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, -1, -1, 9, 10, 11, 12 }, // 3424
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, 10, 11, 12, 13 }, // 4424
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8 }, // 1134
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 2134
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 3134
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 4134
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, 8, 9 }, // 1234
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 2234
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 3234
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 4234
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, 9, 10 }, // 1334
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, 10, 11 }, // 2334
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, 12 }, // 3334
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, -1, 10, 11, 12, 13 }, // 4334
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, -1, 8, 9, 10, 11 }, // 1434
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, -1, 9, 10, 11, 12 }, // 2434
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, -1, 10, 11, 12, 13 }, // 3434
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, 11, 12, 13, 14 }, // 4434
|
||||
{ 0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9 }, // 1144
|
||||
{ 0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 2144
|
||||
{ 0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 3144
|
||||
{ 0, 1, 2, 3, 4, -1, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 4144
|
||||
{ 0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, 6, 7, 8, 9, 10 }, // 1244
|
||||
{ 0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 2244
|
||||
{ 0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 3244
|
||||
{ 0, 1, 2, 3, 4, 5, -1, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 4244
|
||||
{ 0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, 7, 8, 9, 10, 11 }, // 1344
|
||||
{ 0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, 11, 12 }, // 2344
|
||||
{ 0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, 9, 10, 11, 12, 13 }, // 3344
|
||||
{ 0, 1, 2, 3, 4, 5, 6, -1, 7, 8, 9, 10, 11, 12, 13, 14 }, // 4344
|
||||
{ 0, -1, -1, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, // 1444
|
||||
{ 0, 1, -1, -1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, // 2444
|
||||
{ 0, 1, 2, -1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, // 3444
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } // 4444
|
||||
};
|
||||
|
||||
static uint8_t _encode_data(uint32_t val, uint8_t *__restrict__ *dataPtrPtr) {
|
||||
uint8_t *dataPtr = *dataPtrPtr;
|
||||
uint8_t code;
|
||||
|
||||
if (val < (1 << 8)) { // 1 byte
|
||||
*dataPtr = (uint8_t)(val);
|
||||
*dataPtrPtr += 1;
|
||||
code = 0;
|
||||
} else if (val < (1 << 16)) { // 2 bytes
|
||||
*(uint16_t *) dataPtr = (uint16_t)(val);
|
||||
*dataPtrPtr += 2;
|
||||
code = 1;
|
||||
} else if (val < (1 << 24)) { // 3 bytes
|
||||
*(uint16_t *) dataPtr = (uint16_t)(val);
|
||||
*(dataPtr + 2) = (uint8_t)(val >> 16);
|
||||
*dataPtrPtr += 3;
|
||||
code = 2;
|
||||
} else { // 4 bytes
|
||||
*(uint32_t *) dataPtr = val;
|
||||
*dataPtrPtr += 4;
|
||||
code = 3;
|
||||
}
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
static uint8_t *svb_encode_scalar_d1_init(const uint32_t *in,
|
||||
uint8_t *__restrict__ keyPtr, uint8_t *__restrict__ dataPtr,
|
||||
uint32_t count, uint32_t prev) {
|
||||
if (count == 0)
|
||||
return dataPtr; // exit immediately if no data
|
||||
|
||||
uint8_t shift = 0; // cycles 0, 2, 4, 6, 0, 2, 4, 6, ...
|
||||
uint8_t key = 0;
|
||||
for (uint32_t c = 0; c < count; c++) {
|
||||
if (shift == 8) {
|
||||
shift = 0;
|
||||
*keyPtr++ = key;
|
||||
key = 0;
|
||||
}
|
||||
uint32_t val = in[c] - prev;
|
||||
prev = in[c];
|
||||
uint8_t code = _encode_data(val, &dataPtr);
|
||||
key |= code << shift;
|
||||
shift += 2;
|
||||
}
|
||||
|
||||
*keyPtr = key; // write last key (no increment needed)
|
||||
return dataPtr; // pointer to first unused data byte
|
||||
}
|
||||
|
||||
size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t count, uint8_t *out,
|
||||
uint32_t prev) {
|
||||
uint8_t *keyPtr = out; // keys come immediately after 32-bit count
|
||||
uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte
|
||||
uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
|
||||
|
||||
return svb_encode_scalar_d1_init(in, keyPtr, dataPtr, count, prev) - out;
|
||||
|
||||
}
|
||||
|
||||
static inline __m128i _decode_avx(uint32_t key, const uint8_t *__restrict__ *dataPtrPtr) {
|
||||
uint8_t len = lengthTable[key];
|
||||
__m128i Data = _mm_loadu_si128((__m128i *) *dataPtrPtr);
|
||||
__m128i Shuf = *(__m128i *) &shuffleTable[key];
|
||||
|
||||
Data = _mm_shuffle_epi8(Data, Shuf);
|
||||
*dataPtrPtr += len;
|
||||
|
||||
return Data;
|
||||
}
|
||||
#define BroadcastLastXMM 0xFF // bits 0-7 all set to choose highest element
|
||||
|
||||
|
||||
|
||||
static inline void _write_avx(uint32_t *out, __m128i Vec) {
|
||||
_mm_storeu_si128((__m128i *) out, Vec);
|
||||
}
|
||||
|
||||
static __m128i _write_avx_d1(uint32_t *out, __m128i Vec, __m128i Prev) {
|
||||
__m128i Add = _mm_slli_si128(Vec, 4); // Cycle 1: [- A B C] (already done)
|
||||
Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // Cycle 2: [P P P P]
|
||||
Vec = _mm_add_epi32(Vec, Add); // Cycle 2: [A AB BC CD]
|
||||
Add = _mm_slli_si128(Vec, 8); // Cycle 3: [- - A AB]
|
||||
Vec = _mm_add_epi32(Vec, Prev); // Cycle 3: [PA PAB PBC PCD]
|
||||
Vec = _mm_add_epi32(Vec, Add); // Cycle 4: [PA PAB PABC PABCD]
|
||||
|
||||
_write_avx(out, Vec);
|
||||
return Vec;
|
||||
}
|
||||
|
||||
#ifndef _MSC_VER
|
||||
static __m128i High16To32 = {0xFFFF0B0AFFFF0908, 0xFFFF0F0EFFFF0D0C};
|
||||
#else
|
||||
static __m128i High16To32 = {8, 9, -1, -1, 10, 11, -1, -1,
|
||||
12, 13, -1, -1, 14, 15, -1, -1};
|
||||
#endif
|
||||
|
||||
static inline __m128i _write_16bit_avx_d1(uint32_t *out, __m128i Vec, __m128i Prev) {
|
||||
// vec == [A B C D E F G H] (16 bit values)
|
||||
__m128i Add = _mm_slli_si128(Vec, 2); // [- A B C D E F G]
|
||||
Prev = _mm_shuffle_epi32(Prev, BroadcastLastXMM); // [P P P P] (32-bit)
|
||||
Vec = _mm_add_epi32(Vec, Add); // [A AB BC CD DE FG GH]
|
||||
Add = _mm_slli_si128(Vec, 4); // [- - A AB BC CD DE EF]
|
||||
Vec = _mm_add_epi32(Vec, Add); // [A AB ABC ABCD BCDE CDEF DEFG EFGH]
|
||||
__m128i V1 = _mm_cvtepu16_epi32(Vec); // [A AB ABC ABCD] (32-bit)
|
||||
V1 = _mm_add_epi32(V1, Prev); // [PA PAB PABC PABCD] (32-bit)
|
||||
__m128i V2 =
|
||||
_mm_shuffle_epi8(Vec, High16To32); // [BCDE CDEF DEFG EFGH] (32-bit)
|
||||
V2 = _mm_add_epi32(V1, V2); // [PABCDE PABCDEF PABCDEFG PABCDEFGH] (32-bit)
|
||||
_write_avx(out, V1);
|
||||
_write_avx(out + 4, V2);
|
||||
return V2;
|
||||
}
|
||||
|
||||
static inline uint32_t _decode_data(const uint8_t **dataPtrPtr, uint8_t code) {
|
||||
const uint8_t *dataPtr = *dataPtrPtr;
|
||||
uint32_t val;
|
||||
|
||||
if (code == 0) { // 1 byte
|
||||
val = (uint32_t) * dataPtr;
|
||||
dataPtr += 1;
|
||||
} else if (code == 1) { // 2 bytes
|
||||
val = (uint32_t) * (uint16_t *) dataPtr;
|
||||
dataPtr += 2;
|
||||
} else if (code == 2) { // 3 bytes
|
||||
val = (uint32_t) * (uint16_t *) dataPtr;
|
||||
val |= *(dataPtr + 2) << 16;
|
||||
dataPtr += 3;
|
||||
} else { // code == 3
|
||||
val = *(uint32_t *) dataPtr; // 4 bytes
|
||||
dataPtr += 4;
|
||||
}
|
||||
|
||||
*dataPtrPtr = dataPtr;
|
||||
return val;
|
||||
}
|
||||
|
||||
const uint8_t *svb_decode_scalar_d1_init(uint32_t *outPtr, const uint8_t *keyPtr,
|
||||
const uint8_t *dataPtr, uint32_t count,
|
||||
uint32_t prev) {
|
||||
if (count == 0)
|
||||
return dataPtr; // no reads or writes if no data
|
||||
|
||||
uint8_t shift = 0;
|
||||
uint32_t key = *keyPtr++;
|
||||
|
||||
for (uint32_t c = 0; c < count; c++) {
|
||||
if (shift == 8) {
|
||||
shift = 0;
|
||||
key = *keyPtr++;
|
||||
}
|
||||
uint32_t val = _decode_data(&dataPtr, (key >> shift) & 0x3);
|
||||
val += prev;
|
||||
*outPtr++ = val;
|
||||
prev = val;
|
||||
shift += 2;
|
||||
}
|
||||
|
||||
return dataPtr; // pointer to first unused byte after end
|
||||
}
|
||||
|
||||
const uint8_t *svb_decode_avx_d1_init(uint32_t *out, const uint8_t *__restrict__ keyPtr,
|
||||
const uint8_t *__restrict__ dataPtr, uint64_t count, uint32_t prev) {
|
||||
uint64_t keybytes = count / 4; // number of key bytes
|
||||
if (keybytes >= 8) {
|
||||
__m128i Prev = _mm_set1_epi32(prev);
|
||||
__m128i Data;
|
||||
|
||||
int64_t Offset = -(int64_t) keybytes / 8 + 1;
|
||||
|
||||
const uint64_t *keyPtr64 = (const uint64_t *) keyPtr - Offset;
|
||||
uint64_t nextkeys = keyPtr64[Offset];
|
||||
for (; Offset != 0; ++Offset) {
|
||||
uint64_t keys = nextkeys;
|
||||
nextkeys = keyPtr64[Offset + 1];
|
||||
// faster 16-bit delta since we only have 8-bit values
|
||||
if (!keys) { // 32 1-byte ints in a row
|
||||
|
||||
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((__m128i *) (dataPtr)));
|
||||
Prev = _write_16bit_avx_d1(out, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_lddqu_si128((__m128i *) (dataPtr + 8)));
|
||||
Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_lddqu_si128((__m128i *) (dataPtr + 16)));
|
||||
Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_lddqu_si128((__m128i *) (dataPtr + 24)));
|
||||
Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
|
||||
out += 32;
|
||||
dataPtr += 32;
|
||||
continue;
|
||||
}
|
||||
|
||||
Data = _decode_avx(keys & 0x00FF, &dataPtr);
|
||||
Prev = _write_avx_d1(out, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 4, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 8, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 12, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 16, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 20, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 24, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 28, Data, Prev);
|
||||
|
||||
out += 32;
|
||||
}
|
||||
{
|
||||
uint64_t keys = nextkeys;
|
||||
// faster 16-bit delta since we only have 8-bit values
|
||||
if (!keys) { // 32 1-byte ints in a row
|
||||
Data = _mm_cvtepu8_epi16(_mm_lddqu_si128((__m128i *) (dataPtr)));
|
||||
Prev = _write_16bit_avx_d1(out, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_lddqu_si128((__m128i *) (dataPtr + 8)));
|
||||
Prev = _write_16bit_avx_d1(out + 8, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_lddqu_si128((__m128i *) (dataPtr + 16)));
|
||||
Prev = _write_16bit_avx_d1(out + 16, Data, Prev);
|
||||
Data = _mm_cvtepu8_epi16(
|
||||
_mm_loadl_epi64((__m128i *) (dataPtr + 24)));
|
||||
Prev = _write_16bit_avx_d1(out + 24, Data, Prev);
|
||||
out += 32;
|
||||
dataPtr += 32;
|
||||
|
||||
} else {
|
||||
|
||||
Data = _decode_avx(keys & 0x00FF, &dataPtr);
|
||||
Prev = _write_avx_d1(out, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 4, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 8, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 12, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 16, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 20, Data, Prev);
|
||||
|
||||
keys >>= 16;
|
||||
Data = _decode_avx((keys & 0x00FF), &dataPtr);
|
||||
Prev = _write_avx_d1(out + 24, Data, Prev);
|
||||
Data = _decode_avx((keys & 0xFF00) >> 8, &dataPtr);
|
||||
Prev = _write_avx_d1(out + 28, Data, Prev);
|
||||
|
||||
out += 32;
|
||||
}
|
||||
}
|
||||
prev = out[-1];
|
||||
}
|
||||
uint64_t consumedkeys = keybytes - (keybytes & 7);
|
||||
return svb_decode_scalar_d1_init(out, keyPtr + consumedkeys, dataPtr,
|
||||
count & 31, prev);
|
||||
}
|
||||
|
||||
size_t streamvbyte_delta_decode(const uint8_t* in, uint32_t* out,
|
||||
uint32_t count, uint32_t prev) {
|
||||
uint32_t keyLen = ((count + 3) / 4); // 2-bits per key (rounded up)
|
||||
const uint8_t *keyPtr = in;
|
||||
const uint8_t *dataPtr = keyPtr + keyLen; // data starts at end of keys
|
||||
return svb_decode_avx_d1_init(out, keyPtr, dataPtr, count, prev) - in;
|
||||
}
|
||||
73
cpp/streamvbyte/tests/unit.c
Normal file
73
cpp/streamvbyte/tests/unit.c
Normal file
@@ -0,0 +1,73 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "streamvbyte.h"
|
||||
#include "streamvbytedelta.h"
|
||||
|
||||
int main() {
|
||||
int N = 4096;
|
||||
uint32_t * datain = malloc(N * sizeof(uint32_t));
|
||||
uint8_t * compressedbuffer = malloc(2 * N * sizeof(uint32_t));
|
||||
uint32_t * recovdata = malloc(N * sizeof(uint32_t));
|
||||
|
||||
for (int length = 0; length <= N;) {
|
||||
printf("length = %d \n", length);
|
||||
for (uint32_t gap = 1; gap <= 387420489; gap *= 3) {
|
||||
for (int k = 0; k < length; ++k)
|
||||
datain[k] = gap;
|
||||
size_t compsize = streamvbyte_encode(datain, length,
|
||||
compressedbuffer);
|
||||
size_t usedbytes = streamvbyte_decode(compressedbuffer, recovdata,
|
||||
length);
|
||||
if (compsize != usedbytes) {
|
||||
printf(
|
||||
"[streamvbyte_decode] code is buggy gap = %d, size mismatch %d %d \n",
|
||||
(int) gap, (int) compsize, (int) usedbytes);
|
||||
return -1;
|
||||
}
|
||||
for (int k = 0; k < length; ++k) {
|
||||
if (recovdata[k] != datain[k]) {
|
||||
printf("[streamvbyte_decode] code is buggy gap = %d\n",
|
||||
(int) gap);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("Delta \n");
|
||||
for (size_t gap = 1; gap <= 531441; gap *= 3) {
|
||||
for (int k = 0; k < length; ++k)
|
||||
datain[k] = gap * k;
|
||||
size_t compsize = streamvbyte_delta_encode(datain, length,
|
||||
compressedbuffer, 0);
|
||||
size_t usedbytes = streamvbyte_delta_decode(compressedbuffer,
|
||||
recovdata, length, 0);
|
||||
if (compsize != usedbytes) {
|
||||
printf(
|
||||
"[streamvbyte_delta_decode] code is buggy gap = %d, size mismatch %d %d \n",
|
||||
(int) gap, (int) compsize, (int) usedbytes);
|
||||
return -1;
|
||||
}
|
||||
for (int k = 0; k < length; ++k) {
|
||||
if (recovdata[k] != datain[k]) {
|
||||
printf(
|
||||
"[streamvbyte_delta_decode] code is buggy gap = %d\n",
|
||||
(int) gap);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (length < 128)
|
||||
++length;
|
||||
else {
|
||||
length *= 2;
|
||||
}
|
||||
}
|
||||
free(datain);
|
||||
free(compressedbuffer);
|
||||
free(recovdata);
|
||||
printf("Code looks good.\n");
|
||||
return 0;
|
||||
}
|
||||
@@ -30,10 +30,12 @@
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre><span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> rustc_serialize;
|
||||
<span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> tantivy;
|
||||
<div class="content"><div class='highlight'><pre><span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> tantivy;
|
||||
<span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> tempdir;
|
||||
|
||||
<span class="hljs-meta">#[macro_use]</span>
|
||||
<span class="hljs-keyword">extern</span> <span class="hljs-keyword">crate</span> serde_json;
|
||||
|
||||
<span class="hljs-keyword">use</span> std::path::Path;
|
||||
<span class="hljs-keyword">use</span> tempdir::TempDir;
|
||||
<span class="hljs-keyword">use</span> tantivy::Index;
|
||||
@@ -108,8 +110,8 @@ be indexed”.</p>
|
||||
<a class="pilcrow" href="#section-5">¶</a>
|
||||
</div>
|
||||
<p>Our first field is title.
|
||||
We want full-text search for it, and we want to be able
|
||||
to retrieve the document after the search.</p>
|
||||
We want full-text search for it, and we also want
|
||||
to be able to retrieve the document after the search.</p>
|
||||
<p>TEXT | STORED is some syntactic sugar to describe
|
||||
that.</p>
|
||||
<p><code>TEXT</code> means the field should be tokenized and indexed,
|
||||
@@ -132,9 +134,12 @@ documents that were selected during the search phase.</p>
|
||||
<div class="pilwrap ">
|
||||
<a class="pilcrow" href="#section-6">¶</a>
|
||||
</div>
|
||||
<p>Our first field is body.
|
||||
We want full-text search for it, and we want to be able
|
||||
to retrieve the body after the search.</p>
|
||||
<p>Our second field is body.
|
||||
We want full-text search for it, but we do not
|
||||
need to be able to be able to retrieve it
|
||||
for our application. </p>
|
||||
<p>We can make our index lighter and
|
||||
by omitting <code>STORED</code> flag.</p>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -158,7 +163,7 @@ with our schema in the directory.</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> index = <span class="hljs-built_in">try!</span>(Index::create(index_path, schema.clone()));</pre></div></div>
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> index = Index::create(index_path, schema.clone())?;</pre></div></div>
|
||||
|
||||
</li>
|
||||
|
||||
@@ -178,7 +183,7 @@ heap for the indexer can increase its throughput.</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> index_writer = <span class="hljs-built_in">try!</span>(index.writer(<span class="hljs-number">50_000_000</span>));</pre></div></div>
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> index_writer = index.writer(<span class="hljs-number">50_000_000</span>)?;</pre></div></div>
|
||||
|
||||
</li>
|
||||
|
||||
@@ -214,9 +219,11 @@ one by one in a Document object.</p>
|
||||
|
||||
<span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> old_man_doc = Document::<span class="hljs-keyword">default</span>();
|
||||
old_man_doc.add_text(title, <span class="hljs-string">"The Old Man and the Sea"</span>);
|
||||
old_man_doc.add_text(body,
|
||||
<span class="hljs-string">"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish."</span>);</pre></div></div>
|
||||
old_man_doc.add_text(
|
||||
body,
|
||||
<span class="hljs-string">"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish."</span>,
|
||||
);</pre></div></div>
|
||||
|
||||
</li>
|
||||
|
||||
@@ -243,16 +250,25 @@ one by one in a Document object.</p>
|
||||
<a class="pilcrow" href="#section-12">¶</a>
|
||||
</div>
|
||||
<h3 id="create-a-document-directly-from-json-">Create a document directly from json.</h3>
|
||||
<p>Alternatively, we can use our schema to parse
|
||||
a document object directly from json.</p>
|
||||
<p>Alternatively, we can use our schema to parse a
|
||||
document object directly from json.
|
||||
The document is a string, but we use the <code>json</code> macro
|
||||
from <code>serde_json</code> for the convenience of multi-line support.</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre>
|
||||
<span class="hljs-keyword">let</span> mice_and_men_doc = <span class="hljs-built_in">try!</span>(schema.parse_document(r#<span class="hljs-string">"{
|
||||
"</span>title<span class="hljs-string">": "</span>Of Mice and Men<span class="hljs-string">",
|
||||
"</span>body<span class="hljs-string">": "</span>few miles south of Soledad, the Salinas River drops <span class="hljs-keyword">in</span> close to the hillside bank and runs deep and green. The water is warm too, <span class="hljs-keyword">for</span> it has slipped twinkling over the yellow sands <span class="hljs-keyword">in</span> the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying <span class="hljs-keyword">in</span> their lower leaf junctures the debris of the winter’s flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool<span class="hljs-string">"
|
||||
}"</span>#));
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> json = json!({
|
||||
<span class="hljs-string">"title"</span>: <span class="hljs-string">"Of Mice and Men"</span>,
|
||||
<span class="hljs-string">"body"</span>: <span class="hljs-string">"A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"</span>
|
||||
});
|
||||
<span class="hljs-keyword">let</span> mice_and_men_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(mice_and_men_doc);</pre></div></div>
|
||||
|
||||
@@ -271,10 +287,15 @@ The following document has two titles.</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> frankenstein_doc = <span class="hljs-built_in">try!</span>(schema.parse_document(r#<span class="hljs-string">"{
|
||||
"</span>title<span class="hljs-string">": ["</span>Frankenstein<span class="hljs-string">", "</span>The Modern Promotheus<span class="hljs-string">"],
|
||||
"</span>body<span class="hljs-string">": "</span>You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence <span class="hljs-keyword">in</span> the success of my undertaking.<span class="hljs-string">"
|
||||
}"</span>#));
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> json = json!({
|
||||
<span class="hljs-string">"title"</span>: [<span class="hljs-string">"Frankenstein"</span>, <span class="hljs-string">"The Modern Prometheus"</span>],
|
||||
<span class="hljs-string">"body"</span>: <span class="hljs-string">"You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."</span>
|
||||
});
|
||||
<span class="hljs-keyword">let</span> frankenstein_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(frankenstein_doc);</pre></div></div>
|
||||
|
||||
</li>
|
||||
@@ -313,7 +334,7 @@ the existence of new documents.</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-built_in">try!</span>(index_writer.commit());</pre></div></div>
|
||||
<div class="content"><div class='highlight'><pre> index_writer.commit()?;</pre></div></div>
|
||||
|
||||
</li>
|
||||
|
||||
@@ -349,7 +370,7 @@ after every commit().</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-built_in">try!</span>(index.load_searchers());</pre></div></div>
|
||||
<div class="content"><div class='highlight'><pre> index.load_searchers()?;</pre></div></div>
|
||||
|
||||
</li>
|
||||
|
||||
@@ -384,7 +405,7 @@ in both title and body.</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> query_parser = QueryParser::new(index.schema(), <span class="hljs-built_in">vec!</span>[title, body]);</pre></div></div>
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut</span> query_parser = QueryParser::for_index(index, <span class="hljs-built_in">vec!</span>[title, body]);</pre></div></div>
|
||||
|
||||
</li>
|
||||
|
||||
@@ -401,7 +422,7 @@ A ticket has been opened regarding this problem.</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> query = <span class="hljs-built_in">try!</span>(query_parser.parse_query(<span class="hljs-string">"sea whale"</span>));</pre></div></div>
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-keyword">let</span> query = query_parser.parse_query(<span class="hljs-string">"sea whale"</span>)?;</pre></div></div>
|
||||
|
||||
</li>
|
||||
|
||||
@@ -451,7 +472,7 @@ is the role of the TopCollector.</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre> <span class="hljs-built_in">try!</span>(searcher.search(&*query, &<span class="hljs-keyword">mut</span> top_collector));</pre></div></div>
|
||||
<div class="content"><div class='highlight'><pre> searcher.search(&*query, &<span class="hljs-keyword">mut</span> top_collector)?;</pre></div></div>
|
||||
|
||||
</li>
|
||||
|
||||
@@ -488,9 +509,27 @@ a title.</p>
|
||||
|
||||
<div class="content"><div class='highlight'><pre>
|
||||
<span class="hljs-keyword">for</span> doc_address <span class="hljs-keyword">in</span> doc_addresses {
|
||||
<span class="hljs-keyword">let</span> retrieved_doc = <span class="hljs-built_in">try!</span>(searcher.doc(&doc_address));
|
||||
<span class="hljs-keyword">let</span> retrieved_doc = searcher.doc(&doc_address)?;
|
||||
<span class="hljs-built_in">println!</span>(<span class="hljs-string">"{}"</span>, schema.to_json(&retrieved_doc));
|
||||
}
|
||||
}</pre></div></div>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
<li id="section-26">
|
||||
<div class="annotation">
|
||||
|
||||
<div class="pilwrap ">
|
||||
<a class="pilcrow" href="#section-26">¶</a>
|
||||
</div>
|
||||
<p>Wait for indexing and merging threads to shut down.
|
||||
Usually this isn’t needed, but in <code>main</code> we try to
|
||||
delete the temporary directory and that fails on
|
||||
Windows if the files are still open.</p>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="content"><div class='highlight'><pre> index_writer.wait_merging_threads()?;
|
||||
|
||||
<span class="hljs-literal">Ok</span>(())
|
||||
}</pre></div></div>
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
extern crate rustc_serialize;
|
||||
extern crate tantivy;
|
||||
extern crate tempdir;
|
||||
|
||||
#[macro_use]
|
||||
extern crate serde_json;
|
||||
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
use tantivy::Index;
|
||||
@@ -18,10 +20,7 @@ fn main() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
|
||||
|
||||
// # Defining the schema
|
||||
//
|
||||
// The Tantivy index requires a very strict schema.
|
||||
@@ -29,13 +28,12 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// and for each field, its type and "the way it should
|
||||
// be indexed".
|
||||
|
||||
|
||||
// first we need to define a schema ...
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
// Our first field is title.
|
||||
// We want full-text search for it, and we want to be able
|
||||
// to retrieve the document after the search.
|
||||
// We want full-text search for it, and we also want
|
||||
// to be able to retrieve the document after the search.
|
||||
//
|
||||
// TEXT | STORED is some syntactic sugar to describe
|
||||
// that.
|
||||
@@ -49,24 +47,24 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// documents that were selected during the search phase.
|
||||
schema_builder.add_text_field("title", TEXT | STORED);
|
||||
|
||||
// Our first field is body.
|
||||
// We want full-text search for it, and we want to be able
|
||||
// to retrieve the body after the search.
|
||||
// Our second field is body.
|
||||
// We want full-text search for it, but we do not
|
||||
// need to be able to be able to retrieve it
|
||||
// for our application.
|
||||
//
|
||||
// We can make our index lighter and
|
||||
// by omitting `STORED` flag.
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
|
||||
|
||||
// # Indexing documents
|
||||
//
|
||||
// Let's create a brand new index.
|
||||
//
|
||||
// This will actually just save a meta.json
|
||||
// with our schema in the directory.
|
||||
let index = try!(Index::create(index_path, schema.clone()));
|
||||
|
||||
|
||||
let index = Index::create(index_path, schema.clone())?;
|
||||
|
||||
// To insert document we need an index writer.
|
||||
// There must be only one writer at a time.
|
||||
@@ -75,12 +73,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
//
|
||||
// Here we use a buffer of 50MB per thread. Using a bigger
|
||||
// heap for the indexer can increase its throughput.
|
||||
let mut index_writer = try!(index.writer(50_000_000));
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
// Let's index our documents!
|
||||
// We first need a handle on the title and the body field.
|
||||
|
||||
|
||||
// ### Create a document "manually".
|
||||
//
|
||||
// We can create a document manually, by setting the fields
|
||||
@@ -90,32 +87,48 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
|
||||
let mut old_man_doc = Document::default();
|
||||
old_man_doc.add_text(title, "The Old Man and the Sea");
|
||||
old_man_doc.add_text(body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.");
|
||||
old_man_doc.add_text(
|
||||
body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.",
|
||||
);
|
||||
|
||||
// ... and add it to the `IndexWriter`.
|
||||
index_writer.add_document(old_man_doc);
|
||||
|
||||
// ### Create a document directly from json.
|
||||
//
|
||||
// Alternatively, we can use our schema to parse
|
||||
// a document object directly from json.
|
||||
|
||||
let mice_and_men_doc = try!(schema.parse_document(r#"{
|
||||
// Alternatively, we can use our schema to parse a
|
||||
// document object directly from json.
|
||||
// The document is a string, but we use the `json` macro
|
||||
// from `serde_json` for the convenience of multi-line support.
|
||||
let json = json!({
|
||||
"title": "Of Mice and Men",
|
||||
"body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winter’s flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool"
|
||||
}"#));
|
||||
"body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
});
|
||||
let mice_and_men_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(mice_and_men_doc);
|
||||
|
||||
// Multi-valued field are allowed, they are
|
||||
// expressed in JSON by an array.
|
||||
// The following document has two titles.
|
||||
let frankenstein_doc = try!(schema.parse_document(r#"{
|
||||
"title": ["Frankenstein", "The Modern Promotheus"],
|
||||
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."
|
||||
}"#));
|
||||
let json = json!({
|
||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
});
|
||||
let frankenstein_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(frankenstein_doc);
|
||||
|
||||
// This is an example, so we will only index 3 documents
|
||||
@@ -124,7 +137,6 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// Indexing 5 million articles of the English wikipedia takes
|
||||
// around 4 minutes on my computer!
|
||||
|
||||
|
||||
// ### Committing
|
||||
//
|
||||
// At this point our documents are not searchable.
|
||||
@@ -136,7 +148,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// the existence of new documents.
|
||||
//
|
||||
// This call is blocking.
|
||||
try!(index_writer.commit());
|
||||
index_writer.commit()?;
|
||||
|
||||
// If `.commit()` returns correctly, then all of the
|
||||
// documents that have been added are guaranteed to be
|
||||
@@ -146,13 +158,12 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// tantivy behaves as if has rolled back to its last
|
||||
// commit.
|
||||
|
||||
|
||||
// # Searching
|
||||
//
|
||||
// Let's search our index. Start by reloading
|
||||
// searchers in the index. This should be done
|
||||
// after every commit().
|
||||
try!(index.load_searchers());
|
||||
index.load_searchers()?;
|
||||
|
||||
// Afterwards create one (or more) searchers.
|
||||
//
|
||||
@@ -164,13 +175,12 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// Here, if the user does not specify which
|
||||
// field they want to search, tantivy will search
|
||||
// in both title and body.
|
||||
let query_parser = QueryParser::new(index.schema(), vec![title, body]);
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
// QueryParser may fail if the query is not in the right
|
||||
// format. For user facing applications, this can be a problem.
|
||||
// A ticket has been opened regarding this problem.
|
||||
let query = try!(query_parser.parse_query("sea whale"));
|
||||
|
||||
let query = query_parser.parse_query("sea whale")?;
|
||||
|
||||
// A query defines a set of documents, as
|
||||
// well as the way they should be scored.
|
||||
@@ -187,7 +197,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
// We can now perform our query.
|
||||
try!(searcher.search(&*query, &mut top_collector));
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
// Our top collector now contains the 10
|
||||
// most relevant doc ids...
|
||||
@@ -201,9 +211,15 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// a title.
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = try!(searcher.doc(&doc_address));
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
// Wait for indexing and merging threads to shut down.
|
||||
// Usually this isn't needed, but in `main` we try to
|
||||
// delete the temporary directory and that fails on
|
||||
// Windows if the files are still open.
|
||||
index_writer.wait_merging_threads()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
1
rustfmt.toml
Normal file
1
rustfmt.toml
Normal file
@@ -0,0 +1 @@
|
||||
use_try_shorthand = true
|
||||
@@ -1,86 +0,0 @@
|
||||
extern crate regex;
|
||||
|
||||
use std::str::Chars;
|
||||
use std::ascii::AsciiExt;
|
||||
|
||||
pub struct TokenIter<'a> {
|
||||
chars: Chars<'a>,
|
||||
term_buffer: String,
|
||||
}
|
||||
|
||||
fn append_char_lowercase(c: char, term_buffer: &mut String) {
|
||||
term_buffer.push(c.to_ascii_lowercase());
|
||||
}
|
||||
|
||||
pub trait StreamingIterator<'a, T> {
|
||||
fn next(&'a mut self) -> Option<T>;
|
||||
}
|
||||
|
||||
impl<'a, 'b> TokenIter<'b> {
|
||||
fn consume_token(&'a mut self) -> Option<&'a str> {
|
||||
for c in &mut self.chars {
|
||||
if c.is_alphanumeric() {
|
||||
append_char_lowercase(c, &mut self.term_buffer);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Some(&self.term_buffer)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
|
||||
|
||||
#[inline]
|
||||
fn next(&'a mut self,) -> Option<&'a str> {
|
||||
self.term_buffer.clear();
|
||||
// skipping non-letter characters.
|
||||
loop {
|
||||
match self.chars.next() {
|
||||
Some(c) => {
|
||||
if c.is_alphanumeric() {
|
||||
append_char_lowercase(c, &mut self.term_buffer);
|
||||
return self.consume_token();
|
||||
}
|
||||
}
|
||||
None => { return None; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub struct SimpleTokenizer;
|
||||
|
||||
|
||||
impl SimpleTokenizer {
|
||||
|
||||
pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
|
||||
TokenIter {
|
||||
term_buffer: String::new(),
|
||||
chars: text.chars(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer() {
|
||||
let simple_tokenizer = SimpleTokenizer;
|
||||
let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!");
|
||||
assert_eq!(term_reader.next().unwrap(), "hello");
|
||||
assert_eq!(term_reader.next().unwrap(), "happy");
|
||||
assert_eq!(term_reader.next().unwrap(), "tax");
|
||||
assert_eq!(term_reader.next().unwrap(), "payer");
|
||||
assert_eq!(term_reader.next(), None);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let simple_tokenizer = SimpleTokenizer;
|
||||
let mut term_reader = simple_tokenizer.tokenize("");
|
||||
assert_eq!(term_reader.next(), None);
|
||||
}
|
||||
@@ -5,10 +5,9 @@ use SegmentReader;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
|
||||
/// Collector that does nothing.
|
||||
/// This is used in the chain Collector and will hopefully
|
||||
/// be optimized away by the compiler.
|
||||
/// This is used in the chain Collector and will hopefully
|
||||
/// be optimized away by the compiler.
|
||||
pub struct DoNothingCollector;
|
||||
impl Collector for DoNothingCollector {
|
||||
#[inline]
|
||||
@@ -17,6 +16,10 @@ impl Collector for DoNothingCollector {
|
||||
}
|
||||
#[inline]
|
||||
fn collect(&mut self, _doc: DocId, _score: Score) {}
|
||||
#[inline]
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero-cost abstraction used to collect on multiple collectors.
|
||||
@@ -24,10 +27,10 @@ impl Collector for DoNothingCollector {
|
||||
/// are known at compile time.
|
||||
pub struct ChainedCollector<Left: Collector, Right: Collector> {
|
||||
left: Left,
|
||||
right: Right
|
||||
right: Right,
|
||||
}
|
||||
|
||||
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
|
||||
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
|
||||
/// Adds a collector
|
||||
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
|
||||
ChainedCollector {
|
||||
@@ -38,9 +41,13 @@ impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
|
||||
}
|
||||
|
||||
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
|
||||
try!(self.left.set_segment(segment_local_id, segment));
|
||||
try!(self.right.set_segment(segment_local_id, segment));
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
self.left.set_segment(segment_local_id, segment)?;
|
||||
self.right.set_segment(segment_local_id, segment)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -48,6 +55,10 @@ impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Rig
|
||||
self.left.collect(doc, score);
|
||||
self.right.collect(doc, score);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.left.requires_scoring() || self.right.requires_scoring()
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a `ChainedCollector`
|
||||
@@ -58,7 +69,6 @@ pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -70,9 +80,7 @@ mod tests {
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors = chain()
|
||||
.push(&mut top_collector)
|
||||
.push(&mut count_collector);
|
||||
let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
@@ -80,4 +88,4 @@ mod tests {
|
||||
assert_eq!(count_collector.count(), 3);
|
||||
assert!(top_collector.at_capacity());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,8 @@ use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
|
||||
/// `CountCollector` collector only counts how many
|
||||
/// documents match the query.
|
||||
/// documents match the query.
|
||||
#[derive(Default)]
|
||||
pub struct CountCollector {
|
||||
count: usize,
|
||||
}
|
||||
@@ -14,20 +15,12 @@ pub struct CountCollector {
|
||||
impl CountCollector {
|
||||
/// Returns the count of documents that were
|
||||
/// collected.
|
||||
pub fn count(&self,) -> usize {
|
||||
pub fn count(&self) -> usize {
|
||||
self.count
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CountCollector {
|
||||
fn default() -> CountCollector {
|
||||
CountCollector {count: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for CountCollector {
|
||||
|
||||
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
@@ -35,23 +28,27 @@ impl Collector for CountCollector {
|
||||
fn collect(&mut self, _: DocId, _: Score) {
|
||||
self.count += 1;
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
use collector::Collector;
|
||||
use collector::{Collector, CountCollector};
|
||||
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
for doc in 0..1_000_000 {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
#[test]
|
||||
fn test_count_collector() {
|
||||
let mut count_collector = CountCollector::default();
|
||||
assert_eq!(count_collector.count(), 0);
|
||||
count_collector.collect(0u32, 1f32);
|
||||
assert_eq!(count_collector.count(), 1);
|
||||
assert_eq!(count_collector.count(), 1);
|
||||
count_collector.collect(1u32, 1f32);
|
||||
assert_eq!(count_collector.count(), 2);
|
||||
assert!(!count_collector.requires_scoring());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
637
src/collector/facet_collector.rs
Normal file
637
src/collector/facet_collector.rs
Normal file
@@ -0,0 +1,637 @@
|
||||
use std::mem;
|
||||
use collector::Collector;
|
||||
use fastfield::FacetReader;
|
||||
use schema::Field;
|
||||
use std::cell::UnsafeCell;
|
||||
use schema::Facet;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::Bound;
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermStreamer;
|
||||
use termdict::TermStreamerBuilder;
|
||||
use std::collections::BTreeSet;
|
||||
use termdict::TermMerger;
|
||||
use docset::SkipResult;
|
||||
use std::{usize, u64};
|
||||
use std::iter::Peekable;
|
||||
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
facet: &'a Facet,
|
||||
}
|
||||
|
||||
impl<'a> Eq for Hit<'a> {}
|
||||
|
||||
impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
|
||||
fn eq(&self, other: &Hit) -> bool {
|
||||
self.count == other.count
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
|
||||
fn partial_cmp(&self, other: &Hit) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Ord for Hit<'a> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.count.cmp(&self.count)
|
||||
}
|
||||
}
|
||||
|
||||
struct SegmentFacetCounter {
|
||||
pub facet_reader: FacetReader,
|
||||
pub facet_ords: Vec<u64>,
|
||||
pub facet_counts: Vec<u64>,
|
||||
}
|
||||
|
||||
fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
if facet_bytes.is_empty() {
|
||||
0
|
||||
} else {
|
||||
facet_bytes.iter().cloned().filter(|b| *b == 0u8).count() + 1
|
||||
}
|
||||
}
|
||||
|
||||
/// Collector for faceting
|
||||
///
|
||||
/// The collector collects all facets. You need to configure it
|
||||
/// beforehand with the facet you want to extract.
|
||||
///
|
||||
/// This is done by calling `.add_facet(...)` with the root of the
|
||||
/// facet you want to extract as argument.
|
||||
///
|
||||
/// Facet counts will only be computed for the facet that are direct children
|
||||
/// of such a root facet.
|
||||
///
|
||||
/// For instance, if your index represents books, your hierarchy of facets
|
||||
/// may contain `category`, `language`.
|
||||
///
|
||||
/// The category facet may include `subcategories`. For instance, a book
|
||||
/// could belong to `/category/fiction/fantasy`.
|
||||
///
|
||||
/// If you request the facet counts for `/category`, the result will be
|
||||
/// the breakdown of counts for the direct children of `/category`
|
||||
/// (e.g. `/category/fiction`, `/category/biography`, `/category/personal_development`).
|
||||
///
|
||||
/// Once collection is finished, you can harvest its results in the form
|
||||
/// of a `FacetCounts` object, and extract your face t counts from it.
|
||||
///
|
||||
/// This implementation assumes you are working with a number of facets that
|
||||
/// is much hundreds of time lower than your number of documents.
|
||||
///
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{Facet, SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::FacetCollector;
|
||||
/// use tantivy::query::AllQuery;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
///
|
||||
/// // Facet have their own specific type.
|
||||
/// // It is not a bad practise to put all of your
|
||||
/// // facet information in the same field.
|
||||
/// let facet = schema_builder.add_facet_field("facet");
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// // a document can be associated to any number of facets
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/fiction/fantasy")
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "Dune",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/fiction/sci-fi")
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "La Vénus d'Ille",
|
||||
/// facet => Facet::from("/lang/fr"),
|
||||
/// facet => Facet::from("/category/fiction/fantasy"),
|
||||
/// facet => Facet::from("/category/fiction/horror")
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/biography")
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/lang");
|
||||
/// facet_collector.add_facet("/category");
|
||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
///
|
||||
/// // this object contains count aggregate for all of the facets.
|
||||
/// let counts = facet_collector.harvest();
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = counts
|
||||
/// .get("/category")
|
||||
/// .collect();
|
||||
/// assert_eq!(facets, vec![
|
||||
/// (&Facet::from("/category/biography"), 1),
|
||||
/// (&Facet::from("/category/fiction"), 3)
|
||||
/// ]);
|
||||
/// }
|
||||
///
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/category/fiction");
|
||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
///
|
||||
/// // this object contains count aggregate for all of the facets.
|
||||
/// let counts = facet_collector.harvest();
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = counts
|
||||
/// .get("/category/fiction")
|
||||
/// .collect();
|
||||
/// assert_eq!(facets, vec![
|
||||
/// (&Facet::from("/category/fiction/fantasy"), 2),
|
||||
/// (&Facet::from("/category/fiction/horror"), 1),
|
||||
/// (&Facet::from("/category/fiction/sci-fi"), 1)
|
||||
/// ]);
|
||||
/// }
|
||||
///
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/category/fiction");
|
||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
///
|
||||
/// // this object contains count aggregate for all of the facets.
|
||||
/// let counts = facet_collector.harvest();
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = counts.top_k("/category/fiction", 1);
|
||||
/// assert_eq!(facets, vec![
|
||||
/// (&Facet::from("/category/fiction/fantasy"), 2)
|
||||
/// ]);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct FacetCollector {
|
||||
facet_ords: Vec<u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<UnsafeCell<FacetReader>>,
|
||||
segment_counters: Vec<SegmentFacetCounter>,
|
||||
|
||||
// facet_ord -> collapse facet_id
|
||||
current_segment_collapse_mapping: Vec<usize>,
|
||||
// collapse facet_id -> count
|
||||
current_segment_counts: Vec<u64>,
|
||||
// collapse facet_id -> facet_ord
|
||||
current_collapse_facet_ords: Vec<u64>,
|
||||
|
||||
facets: BTreeSet<Facet>,
|
||||
}
|
||||
|
||||
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||
target: &[u8],
|
||||
collapse_it: &mut Peekable<I>,
|
||||
) -> SkipResult {
|
||||
loop {
|
||||
match collapse_it.peek() {
|
||||
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
|
||||
Ordering::Less => {}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
}
|
||||
Ordering::Equal => {
|
||||
return SkipResult::Reached;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
return SkipResult::End;
|
||||
}
|
||||
}
|
||||
collapse_it.next();
|
||||
}
|
||||
}
|
||||
|
||||
impl FacetCollector {
|
||||
/// Create a facet collector to collect the facets
|
||||
/// from a specific facet `Field`.
|
||||
///
|
||||
/// This function does not check whether the field
|
||||
/// is of the proper type.
|
||||
pub fn for_field(field: Field) -> FacetCollector {
|
||||
FacetCollector {
|
||||
facet_ords: Vec::with_capacity(255),
|
||||
segment_counters: Vec::new(),
|
||||
field,
|
||||
ff_reader: None,
|
||||
facets: BTreeSet::new(),
|
||||
|
||||
current_segment_collapse_mapping: Vec::new(),
|
||||
current_collapse_facet_ords: Vec::new(),
|
||||
current_segment_counts: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Adds a facet that we want to record counts
|
||||
///
|
||||
/// Adding facet `Facet::from("/country")` for instance,
|
||||
/// will record the counts of all of the direct children of the facet country
|
||||
/// (e.g. `/country/FR`, `/country/UK`).
|
||||
///
|
||||
/// Adding two facets within which one is the prefix of the other is forbidden.
|
||||
/// If you need the correct number of unique documents for two such facets,
|
||||
/// just add them in separate `FacetCollector`.
|
||||
pub fn add_facet<T>(&mut self, facet_from: T)
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
let facet = Facet::from(facet_from);
|
||||
for old_facet in &self.facets {
|
||||
assert!(
|
||||
!old_facet.is_prefix_of(&facet),
|
||||
"Tried to add a facet which is a descendant of an already added facet."
|
||||
);
|
||||
assert!(
|
||||
!facet.is_prefix_of(old_facet),
|
||||
"Tried to add a facet which is an ancestor of an already added facet."
|
||||
);
|
||||
}
|
||||
self.facets.insert(facet);
|
||||
}
|
||||
|
||||
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
|
||||
self.current_segment_collapse_mapping.clear();
|
||||
self.current_collapse_facet_ords.clear();
|
||||
self.current_segment_counts.clear();
|
||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||
self.current_collapse_facet_ords.push(0);
|
||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||
if !facet_streamer.advance() {
|
||||
return;
|
||||
}
|
||||
'outer: loop {
|
||||
// at the begining of this loop, facet_streamer
|
||||
// is positionned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Reached => {
|
||||
// we reach a facet we decided to collapse.
|
||||
let collapse_depth = facet_depth(facet_streamer.key());
|
||||
let mut collapsed_id = 0;
|
||||
self.current_segment_collapse_mapping.push(0);
|
||||
while facet_streamer.advance() {
|
||||
let depth = facet_depth(facet_streamer.key());
|
||||
if depth <= collapse_depth {
|
||||
continue 'outer;
|
||||
}
|
||||
if depth == collapse_depth + 1 {
|
||||
collapsed_id = self.current_collapse_facet_ords.len();
|
||||
self.current_collapse_facet_ords
|
||||
.push(facet_streamer.term_ord());
|
||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||
} else {
|
||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
SkipResult::End | SkipResult::OverStep => {
|
||||
self.current_segment_collapse_mapping.push(0);
|
||||
if !facet_streamer.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn finalize_segment(&mut self) {
|
||||
if self.ff_reader.is_some() {
|
||||
self.segment_counters.push(SegmentFacetCounter {
|
||||
facet_reader: self.ff_reader.take().unwrap().into_inner(),
|
||||
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
|
||||
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the results of the collection.
|
||||
///
|
||||
/// This method does not just return the counters,
|
||||
/// it also translates the facet ordinals of the last segment.
|
||||
pub fn harvest(mut self) -> FacetCounts {
|
||||
self.finalize_segment();
|
||||
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_ords[..])
|
||||
.collect();
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_counts[..])
|
||||
.collect();
|
||||
|
||||
let facet_streams = self.segment_counters
|
||||
.iter()
|
||||
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut facet_merger = TermMerger::new(facet_streams);
|
||||
let mut facet_counts = BTreeMap::new();
|
||||
|
||||
while facet_merger.advance() {
|
||||
let count = facet_merger
|
||||
.current_kvs()
|
||||
.iter()
|
||||
.map(|it| {
|
||||
let seg_ord = it.segment_ord;
|
||||
let term_ord = it.streamer.term_ord();
|
||||
collapsed_facet_ords[seg_ord]
|
||||
.binary_search(&term_ord)
|
||||
.map(|collapsed_term_id| {
|
||||
if collapsed_term_id == 0 {
|
||||
0
|
||||
} else {
|
||||
collapsed_facet_counts[seg_ord][collapsed_term_id]
|
||||
}
|
||||
})
|
||||
.unwrap_or(0)
|
||||
})
|
||||
.sum();
|
||||
if count > 0u64 {
|
||||
let bytes = facet_merger.key().to_owned();
|
||||
facet_counts.insert(Facet::from_encoded(bytes), count);
|
||||
}
|
||||
}
|
||||
FacetCounts { facet_counts }
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FacetCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.finalize_segment();
|
||||
let facet_reader = reader.facet_reader(self.field)?;
|
||||
self.set_collapse_mapping(&facet_reader);
|
||||
self.current_segment_counts
|
||||
.resize(self.current_collapse_facet_ords.len(), 0);
|
||||
self.ff_reader = Some(UnsafeCell::new(facet_reader));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let facet_reader: &mut FacetReader = unsafe {
|
||||
&mut *self.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.get()
|
||||
};
|
||||
facet_reader.facet_ords(doc, &mut self.facet_ords);
|
||||
let mut previous_collapsed_ord: usize = usize::MAX;
|
||||
for &facet_ord in &self.facet_ords {
|
||||
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
|
||||
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
|
||||
{
|
||||
0
|
||||
} else {
|
||||
1
|
||||
};
|
||||
previous_collapsed_ord = collapsed_ord;
|
||||
}
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Intermediary result of the `FacetCollector` that stores
|
||||
/// the facet counts for all the segments.
|
||||
pub struct FacetCounts {
|
||||
facet_counts: BTreeMap<Facet, u64>,
|
||||
}
|
||||
|
||||
impl FacetCounts {
|
||||
#[allow(needless_lifetimes)] //< compiler fails if we remove the lifetime
|
||||
pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator<Item = (&'a Facet, u64)>
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
let facet = Facet::from(facet_from);
|
||||
let left_bound = Bound::Excluded(facet.clone());
|
||||
let right_bound = if facet.is_root() {
|
||||
Bound::Unbounded
|
||||
} else {
|
||||
let mut facet_after_bytes = facet.encoded_bytes().to_owned();
|
||||
facet_after_bytes.push(1u8);
|
||||
let facet_after = Facet::from_encoded(facet_after_bytes);
|
||||
Bound::Excluded(facet_after)
|
||||
};
|
||||
|
||||
self.facet_counts
|
||||
.range((left_bound, right_bound))
|
||||
.map(|(facet, count)| (facet, *count))
|
||||
}
|
||||
|
||||
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
let mut heap = BinaryHeap::with_capacity(k);
|
||||
let mut it = self.get(facet);
|
||||
|
||||
for (facet, count) in (&mut it).take(k) {
|
||||
heap.push(Hit { count, facet });
|
||||
}
|
||||
|
||||
let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN);
|
||||
for (facet, count) in it {
|
||||
if count > lowest_count {
|
||||
lowest_count = count;
|
||||
if let Some(mut head) = heap.peek_mut() {
|
||||
*head = Hit { count, facet };
|
||||
}
|
||||
}
|
||||
}
|
||||
heap.into_sorted_vec()
|
||||
.into_iter()
|
||||
.map(|hit| (hit.facet, hit.count))
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use test::Bencher;
|
||||
use core::Index;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
use query::AllQuery;
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use std::iter;
|
||||
use schema::Field;
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_drilldown() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let num_facets: usize = 3 * 4 * 5;
|
||||
let facets: Vec<Facet> = (0..num_facets)
|
||||
.map(|mut n| {
|
||||
let top = n % 3;
|
||||
n /= 3;
|
||||
let mid = n % 4;
|
||||
n /= 4;
|
||||
let leaf = n % 5;
|
||||
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
|
||||
})
|
||||
.collect();
|
||||
for i in 0..num_facets * 10 {
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, facets[i % num_facets].clone());
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet(Facet::from("/top1"));
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
|
||||
let counts: FacetCounts = facet_collector.harvest();
|
||||
{
|
||||
let facets: Vec<(String, u64)> = counts
|
||||
.get("/top1")
|
||||
.map(|(facet, count)| (facet.to_string(), count))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
facets,
|
||||
[
|
||||
("/top1/mid0", 50),
|
||||
("/top1/mid1", 50),
|
||||
("/top1/mid2", 50),
|
||||
("/top1/mid3", 50),
|
||||
].iter()
|
||||
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet.")]
|
||||
fn test_misused_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
facet_collector.add_facet(Facet::from("/country/europe"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_non_used_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
facet_collector.add_facet(Facet::from("/countryeurope"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_topk() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
|
||||
.into_iter()
|
||||
.flat_map(|(c, count)| {
|
||||
let facet = Facet::from(&format!("/facet_{}", c));
|
||||
let doc = doc!(facet_field => facet);
|
||||
iter::repeat(doc).take(count)
|
||||
})
|
||||
.collect();
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let searcher = index.searcher();
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet("/");
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
|
||||
let counts: FacetCounts = facet_collector.harvest();
|
||||
{
|
||||
let facets: Vec<(&Facet, u64)> = counts.top_k("/", 3);
|
||||
assert_eq!(
|
||||
facets,
|
||||
vec![
|
||||
(&Facet::from("/facet_b"), 100),
|
||||
(&Facet::from("/facet_e"), 21),
|
||||
(&Facet::from("/facet_d"), 12),
|
||||
]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_facet_collector(b: &mut Bencher) {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut docs = vec![];
|
||||
for val in 0..50 {
|
||||
let facet = Facet::from(&format!("/facet_{}", val));
|
||||
for _ in 0..val * val {
|
||||
docs.push(doc!(facet_field=>facet.clone()));
|
||||
}
|
||||
}
|
||||
// 40425 docs
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let searcher = index.searcher();
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
});
|
||||
}
|
||||
}
|
||||
123
src/collector/int_facet_collector.rs
Normal file
123
src/collector/int_facet_collector.rs
Normal file
@@ -0,0 +1,123 @@
|
||||
use std::cmp::Eq;
|
||||
use std::collections::HashMap;
|
||||
use std::hash::Hash;
|
||||
|
||||
use collector::Collector;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
|
||||
|
||||
/// Facet collector for i64/u64 fast field
|
||||
pub struct IntFacetCollector<T>
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
counters: HashMap<T::ValueType, u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<T>,
|
||||
}
|
||||
|
||||
|
||||
impl<T> IntFacetCollector<T>
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
/// Creates a new facet collector for aggregating a given field.
|
||||
pub fn new(field: Field) -> IntFacetCollector<T> {
|
||||
IntFacetCollector {
|
||||
counters: HashMap::new(),
|
||||
field: field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<T> Collector for IntFacetCollector<T>
|
||||
where
|
||||
T: FastFieldReader,
|
||||
T::ValueType: Eq + Hash,
|
||||
{
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(reader.get_fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let val = self.ff_reader
|
||||
.as_ref()
|
||||
.expect(
|
||||
"collect() was called before set_segment. \
|
||||
This should never happen.",
|
||||
)
|
||||
.get(doc);
|
||||
*(self.counters.entry(val).or_insert(0)) += 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use collector::{chain, IntFacetCollector};
|
||||
use query::QueryParser;
|
||||
use fastfield::{I64FastFieldReader, U64FastFieldReader};
|
||||
use schema::{self, FAST, STRING};
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
// create 10 documents, set num field value to 0 or 1 for even/odd ones
|
||||
// make sure we have facet counters correctly filled
|
||||
fn test_facet_collector_results() {
|
||||
|
||||
let mut schema_builder = schema::SchemaBuilder::new();
|
||||
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
|
||||
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", STRING);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
for i in 0u64..10u64 {
|
||||
index_writer.add_document(doc!(
|
||||
num_field_i64 => ((i as i64) % 3i64) as i64,
|
||||
num_field_u64 => (i % 2u64) as u64,
|
||||
text_field => "text"
|
||||
));
|
||||
}
|
||||
}
|
||||
assert_eq!(index_writer.commit().unwrap(), 10u64);
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
|
||||
let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);
|
||||
|
||||
{
|
||||
// perform the query
|
||||
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
|
||||
let mut query_parser = QueryParser::for_index(index, vec![text_field]);
|
||||
let query = query_parser.parse_query("text:text").unwrap();
|
||||
query.search(&searcher, &mut facet_collectors).unwrap();
|
||||
}
|
||||
|
||||
assert_eq!(ffvf_u64.counters[&0], 5);
|
||||
assert_eq!(ffvf_u64.counters[&1], 5);
|
||||
assert_eq!(ffvf_i64.counters[&0], 4);
|
||||
assert_eq!(ffvf_i64.counters[&1], 3);
|
||||
|
||||
}
|
||||
}
|
||||
@@ -1,3 +1,7 @@
|
||||
/*!
|
||||
Defines how the documents matching a search query should be processed.
|
||||
*/
|
||||
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use DocId;
|
||||
@@ -13,14 +17,17 @@ pub use self::multi_collector::MultiCollector;
|
||||
mod top_collector;
|
||||
pub use self::top_collector::TopCollector;
|
||||
|
||||
mod facet_collector;
|
||||
pub use self::facet_collector::FacetCollector;
|
||||
|
||||
mod chained_collector;
|
||||
pub use self::chained_collector::chain;
|
||||
|
||||
/// Collectors are in charge of collecting and retaining relevant
|
||||
/// Collectors are in charge of collecting and retaining relevant
|
||||
/// information from the document found and scored by the query.
|
||||
///
|
||||
///
|
||||
/// For instance,
|
||||
/// For instance,
|
||||
///
|
||||
/// - keeping track of the top 10 best documents
|
||||
/// - computing a breakdown over a fast field
|
||||
@@ -29,7 +36,7 @@ pub use self::chained_collector::chain;
|
||||
/// Queries are in charge of pushing the `DocSet` to the collector.
|
||||
///
|
||||
/// As they work on multiple segments, they first inform
|
||||
/// the collector of a change in a segment and then
|
||||
/// the collector of a change in a segment and then
|
||||
/// call the `collect` method to push the document to the collector.
|
||||
///
|
||||
/// Temporally, our collector will receive calls
|
||||
@@ -46,25 +53,38 @@ pub use self::chained_collector::chain;
|
||||
///
|
||||
/// Segments are not guaranteed to be visited in any specific order.
|
||||
pub trait Collector {
|
||||
/// `set_segment` is called before beginning to enumerate
|
||||
/// `set_segment` is called before beginning to enumerate
|
||||
/// on this segment.
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()>;
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()>;
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
|
||||
/// Returns true iff the collector requires to compute scores for documents.
|
||||
fn requires_scoring(&self) -> bool;
|
||||
}
|
||||
|
||||
|
||||
impl<'a, C: Collector> Collector for &'a mut C {
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
(*self).set_segment(segment_local_id, segment)
|
||||
}
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
(*self).collect(doc, score);
|
||||
C::collect(self, doc, score)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
C::requires_scoring(self)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
@@ -74,9 +94,9 @@ pub mod tests {
|
||||
use Score;
|
||||
use core::SegmentReader;
|
||||
use SegmentLocalId;
|
||||
use fastfield::U32FastFieldReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
/// It is unusable in practise, as it does not store
|
||||
@@ -89,7 +109,7 @@ pub mod tests {
|
||||
|
||||
impl TestCollector {
|
||||
/// Return the exhalist of documents.
|
||||
pub fn docs(self,) -> Vec<DocId> {
|
||||
pub fn docs(self) -> Vec<DocId> {
|
||||
self.docs
|
||||
}
|
||||
}
|
||||
@@ -105,7 +125,6 @@ pub mod tests {
|
||||
}
|
||||
|
||||
impl Collector for TestCollector {
|
||||
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.offset += self.segment_max_doc;
|
||||
self.segment_max_doc = reader.max_doc();
|
||||
@@ -115,38 +134,39 @@ pub mod tests {
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
self.docs.push(doc + self.offset);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/// Collects in order all of the fast fields for all of the
|
||||
/// doc in the `DocSet`
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct FastFieldTestCollector {
|
||||
vals: Vec<u32>,
|
||||
vals: Vec<u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<U32FastFieldReader>,
|
||||
ff_reader: Option<FastFieldReader<u64>>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> FastFieldTestCollector {
|
||||
FastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
field: field,
|
||||
field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vals(self,) -> Vec<u32> {
|
||||
pub fn vals(self) -> Vec<u64> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = reader.get_fast_field_reader(self.field);
|
||||
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -154,9 +174,11 @@ pub mod tests {
|
||||
let val = self.ff_reader.as_ref().unwrap().get(doc);
|
||||
self.vals.push(val);
|
||||
}
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
|
||||
@@ -5,9 +5,8 @@ use Result;
|
||||
use SegmentReader;
|
||||
use SegmentLocalId;
|
||||
|
||||
|
||||
/// Multicollector makes it possible to collect on more than one collector.
|
||||
/// It should only be used for use cases where the Collector types is unknown
|
||||
/// It should only be used for use cases where the Collector types is unknown
|
||||
/// at compile time.
|
||||
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
|
||||
pub struct MultiCollector<'a> {
|
||||
@@ -17,17 +16,18 @@ pub struct MultiCollector<'a> {
|
||||
impl<'a> MultiCollector<'a> {
|
||||
/// Constructor
|
||||
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
|
||||
MultiCollector {
|
||||
collectors: collectors,
|
||||
}
|
||||
MultiCollector { collectors }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a> Collector for MultiCollector<'a> {
|
||||
fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
for collector in &mut self.collectors {
|
||||
try!(collector.set_segment(segment_local_id, segment));
|
||||
collector.set_segment(segment_local_id, segment)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -37,10 +37,13 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
collector.collect(doc, score);
|
||||
}
|
||||
}
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collectors
|
||||
.iter()
|
||||
.any(|collector| collector.requires_scoring())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -52,7 +55,8 @@ mod tests {
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors = MultiCollector::from(vec!(&mut top_collector, &mut count_collector));
|
||||
let mut collectors =
|
||||
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
|
||||
@@ -12,8 +12,7 @@ use Score;
|
||||
#[derive(Clone, Copy)]
|
||||
struct GlobalScoredDoc {
|
||||
score: Score,
|
||||
doc_address: DocAddress
|
||||
|
||||
doc_address: DocAddress,
|
||||
}
|
||||
|
||||
impl PartialOrd for GlobalScoredDoc {
|
||||
@@ -25,10 +24,10 @@ impl PartialOrd for GlobalScoredDoc {
|
||||
impl Ord for GlobalScoredDoc {
|
||||
#[inline]
|
||||
fn cmp(&self, other: &GlobalScoredDoc) -> Ordering {
|
||||
other.score.partial_cmp(&self.score)
|
||||
.unwrap_or(
|
||||
other.doc_address.cmp(&self.doc_address)
|
||||
)
|
||||
other
|
||||
.score
|
||||
.partial_cmp(&self.score)
|
||||
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,7 +39,6 @@ impl PartialEq for GlobalScoredDoc {
|
||||
|
||||
impl Eq for GlobalScoredDoc {}
|
||||
|
||||
|
||||
/// The Top Collector keeps track of the K documents
|
||||
/// with the best scores.
|
||||
///
|
||||
@@ -53,7 +51,6 @@ pub struct TopCollector {
|
||||
}
|
||||
|
||||
impl TopCollector {
|
||||
|
||||
/// Creates a top collector, with a number of documents equal to "limit".
|
||||
///
|
||||
/// # Panics
|
||||
@@ -63,14 +60,14 @@ impl TopCollector {
|
||||
panic!("Limit must be strictly greater than 0.");
|
||||
}
|
||||
TopCollector {
|
||||
limit: limit,
|
||||
limit,
|
||||
heap: BinaryHeap::with_capacity(limit),
|
||||
segment_id: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns K best documents sorted in decreasing order.
|
||||
///
|
||||
///
|
||||
/// Calling this method triggers the sort.
|
||||
/// The result of the sort is not cached.
|
||||
pub fn docs(&self) -> Vec<DocAddress> {
|
||||
@@ -81,30 +78,27 @@ impl TopCollector {
|
||||
}
|
||||
|
||||
/// Returns K best ScoredDocument sorted in decreasing order.
|
||||
///
|
||||
///
|
||||
/// Calling this method triggers the sort.
|
||||
/// The result of the sort is not cached.
|
||||
pub fn score_docs(&self) -> Vec<(Score, DocAddress)> {
|
||||
let mut scored_docs: Vec<GlobalScoredDoc> = self.heap
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect();
|
||||
let mut scored_docs: Vec<GlobalScoredDoc> = self.heap.iter().cloned().collect();
|
||||
scored_docs.sort();
|
||||
scored_docs.into_iter()
|
||||
.map(|GlobalScoredDoc {score, doc_address}| (score, doc_address))
|
||||
scored_docs
|
||||
.into_iter()
|
||||
.map(|GlobalScoredDoc { score, doc_address }| (score, doc_address))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Return true iff at least K documents have gone through
|
||||
/// the collector.
|
||||
#[inline]
|
||||
pub fn at_capacity(&self, ) -> bool {
|
||||
pub fn at_capacity(&self) -> bool {
|
||||
self.heap.len() >= self.limit
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for TopCollector {
|
||||
|
||||
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
self.segment_id = segment_id;
|
||||
Ok(())
|
||||
@@ -113,25 +107,30 @@ impl Collector for TopCollector {
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
let limit_doc: GlobalScoredDoc = *self.heap.peek().expect("Top collector with size 0 is forbidden");
|
||||
let limit_doc: GlobalScoredDoc = *self.heap
|
||||
.peek()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
if limit_doc.score < score {
|
||||
let mut mut_head = self.heap.peek_mut().expect("Top collector with size 0 is forbidden");
|
||||
let mut mut_head = self.heap
|
||||
.peek_mut()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
mut_head.score = score;
|
||||
mut_head.doc_address = DocAddress(self.segment_id, doc);
|
||||
mut_head.doc_address = DocAddress(self.segment_id, doc);
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
let wrapped_doc = GlobalScoredDoc {
|
||||
score: score,
|
||||
doc_address: DocAddress(self.segment_id, doc)
|
||||
score,
|
||||
doc_address: DocAddress(self.segment_id, doc),
|
||||
};
|
||||
self.heap.push(wrapped_doc);
|
||||
}
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -147,13 +146,12 @@ mod tests {
|
||||
top_collector.collect(3, 0.2);
|
||||
top_collector.collect(5, 0.3);
|
||||
assert!(!top_collector.at_capacity());
|
||||
let score_docs: Vec<(Score, DocId)> = top_collector.score_docs()
|
||||
let score_docs: Vec<(Score, DocId)> = top_collector
|
||||
.score_docs()
|
||||
.into_iter()
|
||||
.map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
.collect();
|
||||
assert_eq!(score_docs, vec!(
|
||||
(0.8, 1), (0.3, 5), (0.2, 3),
|
||||
));
|
||||
assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -171,9 +169,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
.collect();
|
||||
assert_eq!(score_docs, vec!(
|
||||
(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)
|
||||
));
|
||||
assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]);
|
||||
}
|
||||
{
|
||||
let docs: Vec<DocId> = top_collector
|
||||
@@ -181,10 +177,8 @@ mod tests {
|
||||
.into_iter()
|
||||
.map(|doc_address| doc_address.doc())
|
||||
.collect();
|
||||
assert_eq!(docs, vec!(7, 1, 5, 3));
|
||||
assert_eq!(docs, vec![7, 1, 5, 3]);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -2,153 +2,180 @@ use std::io::Write;
|
||||
use std::io;
|
||||
use common::serialize::BinarySerializable;
|
||||
use std::mem;
|
||||
use std::ops::Deref;
|
||||
|
||||
|
||||
pub fn compute_num_bits(amplitude: u32) -> u8 {
|
||||
(32u32 - amplitude.leading_zeros()) as u8
|
||||
}
|
||||
|
||||
pub struct BitPacker {
|
||||
pub(crate) struct BitPacker {
|
||||
mini_buffer: u64,
|
||||
mini_buffer_written: usize,
|
||||
num_bits: usize,
|
||||
written_size: usize,
|
||||
}
|
||||
|
||||
impl BitPacker {
|
||||
|
||||
pub fn new(num_bits: usize) -> BitPacker {
|
||||
impl BitPacker {
|
||||
pub fn new() -> BitPacker {
|
||||
BitPacker {
|
||||
mini_buffer: 0u64,
|
||||
mini_buffer_written: 0,
|
||||
num_bits: num_bits,
|
||||
written_size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write<TWrite: Write>(&mut self, val: u32, output: &mut TWrite) -> io::Result<()> {
|
||||
|
||||
pub fn write<TWrite: Write>(
|
||||
&mut self,
|
||||
val: u64,
|
||||
num_bits: u8,
|
||||
output: &mut TWrite,
|
||||
) -> io::Result<()> {
|
||||
let val_u64 = val as u64;
|
||||
if self.mini_buffer_written + self.num_bits > 64 {
|
||||
let num_bits = num_bits as usize;
|
||||
if self.mini_buffer_written + num_bits > 64 {
|
||||
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
|
||||
self.written_size += self.mini_buffer.serialize(output)?;
|
||||
self.mini_buffer.serialize(output)?;
|
||||
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
|
||||
self.mini_buffer_written = self.mini_buffer_written + (self.num_bits as usize) - 64;
|
||||
}
|
||||
else {
|
||||
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
|
||||
} else {
|
||||
self.mini_buffer |= val_u64 << self.mini_buffer_written;
|
||||
self.mini_buffer_written += self.num_bits;
|
||||
self.mini_buffer_written += num_bits;
|
||||
if self.mini_buffer_written == 64 {
|
||||
self.written_size += self.mini_buffer.serialize(output)?;
|
||||
self.mini_buffer.serialize(output)?;
|
||||
self.mini_buffer_written = 0;
|
||||
self.mini_buffer = 0u64;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()>{
|
||||
|
||||
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer) };
|
||||
output.write_all(&arr[..num_bytes])?;
|
||||
self.written_size += num_bytes;
|
||||
self.mini_buffer_written = 0;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<usize> {
|
||||
|
||||
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
self.flush(output)?;
|
||||
Ok(self.written_size)
|
||||
// Padding the write file to simplify reads.
|
||||
output.write_all(&[0u8; 7])?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
pub struct BitUnpacker {
|
||||
#[derive(Clone)]
|
||||
pub struct BitUnpacker<Data>
|
||||
where
|
||||
Data: Deref<Target = [u8]>,
|
||||
{
|
||||
num_bits: usize,
|
||||
mask: u32,
|
||||
data_ptr: *const u8,
|
||||
data_len: usize,
|
||||
mask: u64,
|
||||
data: Data,
|
||||
}
|
||||
|
||||
impl BitUnpacker {
|
||||
pub fn new(data: &[u8], num_bits: usize) -> BitUnpacker {
|
||||
impl<Data> BitUnpacker<Data>
|
||||
where
|
||||
Data: Deref<Target = [u8]>,
|
||||
{
|
||||
pub fn new(data: Data, num_bits: u8) -> BitUnpacker<Data> {
|
||||
let mask: u64 = if num_bits == 64 {
|
||||
!0u64
|
||||
} else {
|
||||
(1u64 << num_bits) - 1u64
|
||||
};
|
||||
BitUnpacker {
|
||||
num_bits: num_bits,
|
||||
mask: (1u32 << num_bits) - 1u32,
|
||||
data_ptr: data.as_ptr(),
|
||||
data_len: data.len()
|
||||
num_bits: num_bits as usize,
|
||||
mask,
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, idx: usize) -> u32 {
|
||||
|
||||
pub fn get(&self, idx: usize) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0;
|
||||
return 0u64;
|
||||
}
|
||||
let addr = (idx * self.num_bits) / 8;
|
||||
let bit_shift = idx * self.num_bits - addr * 8;
|
||||
let val_unshifted_unmasked: u64;
|
||||
if addr + 8 <= self.data_len {
|
||||
val_unshifted_unmasked = unsafe { * (self.data_ptr.offset(addr as isize) as *const u64) };
|
||||
}
|
||||
else {
|
||||
let mut arr = [0u8; 8];
|
||||
if addr < self.data_len {
|
||||
for i in 0..self.data_len - addr {
|
||||
arr[i] = unsafe { *self.data_ptr.offset( (addr + i) as isize) };
|
||||
let data: &[u8] = &*self.data;
|
||||
let num_bits = self.num_bits;
|
||||
let mask = self.mask;
|
||||
let addr_in_bits = idx * num_bits;
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
if cfg!(feature = "simdcompression") {
|
||||
// for simdcompression,
|
||||
// the bitpacker is only used for fastfields,
|
||||
// and we expect them to be always padded.
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
} else {
|
||||
let val_unshifted_unmasked: u64 = if addr + 8 <= data.len() {
|
||||
unsafe { *(data[addr..].as_ptr() as *const u64) }
|
||||
} else {
|
||||
let mut buffer = [0u8; 8];
|
||||
for i in addr..data.len() {
|
||||
buffer[i - addr] += data[i];
|
||||
}
|
||||
}
|
||||
val_unshifted_unmasked = unsafe { mem::transmute::<[u8; 8], u64>(arr) };
|
||||
unsafe { *(buffer[..].as_ptr() as *const u64) }
|
||||
};
|
||||
let val_shifted = val_unshifted_unmasked >> (bit_shift as u64);
|
||||
val_shifted & mask
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads a range of values from the fast field.
|
||||
///
|
||||
/// The range of values read is from
|
||||
/// `[start..start + output.len()[`
|
||||
pub fn get_range(&self, start: u32, output: &mut [u64]) {
|
||||
if self.num_bits == 0 {
|
||||
for val in output.iter_mut() {
|
||||
*val = 0u64;
|
||||
}
|
||||
} else {
|
||||
let data: &[u8] = &*self.data;
|
||||
let num_bits = self.num_bits;
|
||||
let mask = self.mask;
|
||||
let mut addr_in_bits = (start as usize) * num_bits;
|
||||
for output_val in output.iter_mut() {
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
let val_unshifted_unmasked: u64 = unsafe { *(data[addr..].as_ptr() as *const u64) };
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
*output_val = val_shifted & mask;
|
||||
addr_in_bits += num_bits;
|
||||
}
|
||||
}
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u32;
|
||||
(val_shifted & self.mask)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::{BitPacker, BitUnpacker, compute_num_bits};
|
||||
|
||||
#[test]
|
||||
fn test_compute_num_bits() {
|
||||
assert_eq!(compute_num_bits(1), 1u8);
|
||||
assert_eq!(compute_num_bits(0), 0u8);
|
||||
assert_eq!(compute_num_bits(2), 2u8);
|
||||
assert_eq!(compute_num_bits(3), 2u8);
|
||||
assert_eq!(compute_num_bits(4), 3u8);
|
||||
assert_eq!(compute_num_bits(255), 8u8);
|
||||
assert_eq!(compute_num_bits(256), 9u8);
|
||||
}
|
||||
|
||||
fn test_bitpacker_util(len: usize, num_bits: usize) {
|
||||
use super::{BitPacker, BitUnpacker};
|
||||
|
||||
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker<Vec<u8>>, Vec<u64>) {
|
||||
let mut data = Vec::new();
|
||||
let mut bitpacker = BitPacker::new(num_bits);
|
||||
let max_val: u32 = (1 << num_bits) - 1;
|
||||
let vals: Vec<u32> = (0u32..len as u32).map(|i| {
|
||||
if max_val == 0 {
|
||||
0
|
||||
}
|
||||
else {
|
||||
i % max_val
|
||||
}
|
||||
}).collect();
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
|
||||
let vals: Vec<u64> = (0u64..len as u64)
|
||||
.map(|i| if max_val == 0 { 0 } else { i % max_val })
|
||||
.collect();
|
||||
for &val in &vals {
|
||||
bitpacker.write(val, &mut data).unwrap();
|
||||
bitpacker.write(val, num_bits, &mut data).unwrap();
|
||||
}
|
||||
let num_bytes = bitpacker.close(&mut data).unwrap();
|
||||
assert_eq!(num_bytes, (num_bits * len + 7) / 8);
|
||||
assert_eq!(data.len(), num_bytes);
|
||||
let bitunpacker = BitUnpacker::new(&data, num_bits);
|
||||
bitpacker.close(&mut data).unwrap();
|
||||
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
|
||||
let bitunpacker = BitUnpacker::new(data, num_bits);
|
||||
(bitunpacker, vals)
|
||||
}
|
||||
|
||||
fn test_bitpacker_util(len: usize, num_bits: u8) {
|
||||
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
|
||||
for (i, val) in vals.iter().enumerate() {
|
||||
assert_eq!(bitunpacker.get(i), *val);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_bitpacker() {
|
||||
test_bitpacker_util(10, 3);
|
||||
@@ -157,4 +184,17 @@ mod test {
|
||||
test_bitpacker_util(6, 14);
|
||||
test_bitpacker_util(1000, 14);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitpacker_range() {
|
||||
let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
|
||||
let buffer_len = 100;
|
||||
let mut buffer = vec![0u64; buffer_len];
|
||||
for start in vec![0, 10, 20, 100, 1_000] {
|
||||
bitunpacker.get_range(start as u32, &mut buffer[..]);
|
||||
for i in 0..buffer_len {
|
||||
assert_eq!(buffer[i], vals[start + i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
390
src/common/bitset.rs
Normal file
390
src/common/bitset.rs
Normal file
@@ -0,0 +1,390 @@
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
pub(crate) struct TinySet(u64);
|
||||
|
||||
impl fmt::Debug for TinySet {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
self.into_iter().collect::<Vec<u32>>().fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TinySetIterator(TinySet);
|
||||
impl Iterator for TinySetIterator {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.0.pop_lowest()
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoIterator for TinySet {
|
||||
type Item = u32;
|
||||
type IntoIter = TinySetIterator;
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
TinySetIterator(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl TinySet {
|
||||
/// Returns an empty `TinySet`.
|
||||
pub fn empty() -> TinySet {
|
||||
TinySet(0u64)
|
||||
}
|
||||
|
||||
/// Returns the complement of the set in `[0, 64[`.
|
||||
fn complement(&self) -> TinySet {
|
||||
TinySet(!self.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` contains the element `el`.
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
!self.intersect(TinySet::singleton(el)).is_empty()
|
||||
}
|
||||
|
||||
/// Returns the intersection of `self` and `other`
|
||||
pub fn intersect(&self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 & other.0)
|
||||
}
|
||||
|
||||
/// Creates a new `TinySet` containing only one element
|
||||
/// within `[0; 64[`
|
||||
#[inline(always)]
|
||||
pub fn singleton(el: u32) -> TinySet {
|
||||
TinySet(1u64 << u64::from(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline(always)]
|
||||
pub fn insert(self, el: u32) -> TinySet {
|
||||
self.union(TinySet::singleton(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline(always)]
|
||||
pub fn insert_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
*self = old.insert(el);
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Returns the union of two tinysets
|
||||
#[inline(always)]
|
||||
pub fn union(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 | other.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` is empty.
|
||||
#[inline(always)]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0 == 0u64
|
||||
}
|
||||
|
||||
/// Returns the lowest element in the `TinySet`
|
||||
/// and removes it.
|
||||
#[inline(always)]
|
||||
pub fn pop_lowest(&mut self) -> Option<u32> {
|
||||
if let Some(lowest) = self.lowest() {
|
||||
self.0 ^= TinySet::singleton(lowest).0;
|
||||
Some(lowest)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the lowest element in the `TinySet`
|
||||
/// (or None if the set is empty).
|
||||
#[inline(always)]
|
||||
pub fn lowest(&mut self) -> Option<u32> {
|
||||
if self.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let least_significant_bit = self.0.trailing_zeros() as u32;
|
||||
Some(least_significant_bit)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a `TinySet` than contains all values up
|
||||
/// to limit excluded.
|
||||
///
|
||||
/// The limit is assumed to be strictly lower than 64.
|
||||
pub fn range_lower(upper_bound: u32) -> TinySet {
|
||||
TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
|
||||
}
|
||||
|
||||
/// Returns a `TinySet` that contains all values greater
|
||||
/// or equal to the given limit, included. (and up to 63)
|
||||
///
|
||||
/// The limit is assumed to be strictly lower than 64.
|
||||
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
|
||||
TinySet::range_lower(from_included).complement()
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
pub fn len(&self) -> u32 {
|
||||
self.0.count_ones()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BitSet {
|
||||
tinysets: Box<[TinySet]>,
|
||||
len: usize, //< Technically it should be u32, but we
|
||||
// count multiple inserts.
|
||||
// `usize` guards us from overflow.
|
||||
max_value: u32,
|
||||
}
|
||||
|
||||
fn num_buckets(max_val: u32) -> u32 {
|
||||
(max_val + 63u32) / 64u32
|
||||
}
|
||||
|
||||
impl BitSet {
|
||||
/// Create a new `BitSet` that may contain elements
|
||||
/// within `[0, max_val[`.
|
||||
pub fn with_max_value(max_value: u32) -> BitSet {
|
||||
let num_buckets = num_buckets(max_value);
|
||||
let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
|
||||
BitSet {
|
||||
tinysets: tinybisets,
|
||||
len: 0,
|
||||
max_value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes all elements from the `BitSet`.
|
||||
pub fn clear(&mut self) {
|
||||
for tinyset in self.tinysets.iter_mut() {
|
||||
*tinyset = TinySet::empty();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of elements in the `BitSet`.
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
/// Inserts an element in the `BitSet`
|
||||
pub fn insert(&mut self, el: u32) {
|
||||
// we do not check saturated els.
|
||||
let higher = el / 64u32;
|
||||
let lower = el % 64u32;
|
||||
self.len += if self.tinysets[higher as usize].insert_mut(lower) {
|
||||
1
|
||||
} else {
|
||||
0
|
||||
};
|
||||
}
|
||||
|
||||
/// Returns true iff the elements is in the `BitSet`.
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
self.tinyset(el / 64u32).contains(el % 64)
|
||||
}
|
||||
|
||||
/// Returns the first non-empty `TinySet` associated to a bucket lower
|
||||
/// or greater than bucket.
|
||||
///
|
||||
/// Reminder: the tiny set with the bucket `bucket`, represents the
|
||||
/// elements from `bucket * 64` to `(bucket+1) * 64`.
|
||||
pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
|
||||
self.tinysets[bucket as usize..]
|
||||
.iter()
|
||||
.cloned()
|
||||
.position(|tinyset| !tinyset.is_empty())
|
||||
.map(|delta_bucket| bucket + delta_bucket as u32)
|
||||
}
|
||||
|
||||
pub fn max_value(&self) -> u32 {
|
||||
self.max_value
|
||||
}
|
||||
|
||||
/// Returns the tiny bitset representing the
|
||||
/// the set restricted to the number range from
|
||||
/// `bucket * 64` to `(bucket + 1) * 64`.
|
||||
pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
|
||||
self.tinysets[bucket as usize]
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
extern crate test;
|
||||
use tests;
|
||||
use std::collections::HashSet;
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use tests::generate_nonunique_unsorted;
|
||||
use std::collections::BTreeSet;
|
||||
use query::BitSetDocSet;
|
||||
use docset::DocSet;
|
||||
|
||||
#[test]
|
||||
fn test_tiny_set() {
|
||||
assert!(TinySet::empty().is_empty());
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none())
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1u32).insert(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none())
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(2u32);
|
||||
assert_eq!(u.pop_lowest(), Some(2u32));
|
||||
u.insert_mut(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(63u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset() {
|
||||
let test_against_hashset = |els: &[u32], max_value: u32| {
|
||||
let mut hashset: HashSet<u32> = HashSet::new();
|
||||
let mut bitset = BitSet::with_max_value(max_value);
|
||||
for &el in els {
|
||||
assert!(el < max_value);
|
||||
hashset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for el in 0..max_value {
|
||||
assert_eq!(hashset.contains(&el), bitset.contains(el));
|
||||
}
|
||||
assert_eq!(bitset.max_value(), max_value);
|
||||
};
|
||||
|
||||
test_against_hashset(&[], 0);
|
||||
test_against_hashset(&[], 1);
|
||||
test_against_hashset(&[0u32], 1);
|
||||
test_against_hashset(&[0u32], 100);
|
||||
test_against_hashset(&[1u32, 2u32], 4);
|
||||
test_against_hashset(&[99u32], 100);
|
||||
test_against_hashset(&[63u32], 64);
|
||||
test_against_hashset(&[62u32, 63u32], 64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_large() {
|
||||
let arr = generate_nonunique_unsorted(1_000_000, 50_000);
|
||||
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
|
||||
let mut bitset = BitSet::with_max_value(1_000_000);
|
||||
for el in arr {
|
||||
btreeset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for i in 0..1_000_000 {
|
||||
assert_eq!(btreeset.contains(&i), bitset.contains(i));
|
||||
}
|
||||
assert_eq!(btreeset.len(), bitset.len());
|
||||
let mut bitset_docset = BitSetDocSet::from(bitset);
|
||||
for el in btreeset.into_iter() {
|
||||
bitset_docset.advance();
|
||||
assert_eq!(bitset_docset.doc(), el);
|
||||
}
|
||||
assert!(!bitset_docset.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_num_buckets() {
|
||||
use super::num_buckets;
|
||||
assert_eq!(num_buckets(0u32), 0);
|
||||
assert_eq!(num_buckets(1u32), 1);
|
||||
assert_eq!(num_buckets(64u32), 1);
|
||||
assert_eq!(num_buckets(65u32), 2);
|
||||
assert_eq!(num_buckets(128u32), 2);
|
||||
assert_eq!(num_buckets(129u32), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tinyset_range() {
|
||||
assert_eq!(
|
||||
TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
|
||||
[0, 1, 2]
|
||||
);
|
||||
assert!(TinySet::range_lower(0).is_empty());
|
||||
assert_eq!(
|
||||
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
|
||||
(0u32..63u32).collect::<Vec<_>>()
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
|
||||
[0]
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
|
||||
[0, 1]
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_greater_or_equal(3)
|
||||
.into_iter()
|
||||
.collect::<Vec<u32>>(),
|
||||
(3u32..64u32).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_len() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(104u32);
|
||||
assert_eq!(bitset.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_clear() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
let els = tests::sample(1_000, 0.01f32);
|
||||
for &el in &els {
|
||||
bitset.insert(el);
|
||||
}
|
||||
assert!(els.iter().all(|el| bitset.contains(*el)));
|
||||
bitset.clear();
|
||||
for el in 0u32..1000u32 {
|
||||
assert!(!bitset.contains(el));
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
b.iter(|| test::black_box(TinySet::singleton(31u32)).pop_lowest());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_sum(b: &mut test::Bencher) {
|
||||
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
||||
b.iter(|| {
|
||||
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyarr_sum(b: &mut test::Bencher) {
|
||||
let v = [10u32, 14u32, 21u32];
|
||||
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
||||
}
|
||||
}
|
||||
225
src/common/composite_file.rs
Normal file
225
src/common/composite_file.rs
Normal file
@@ -0,0 +1,225 @@
|
||||
use std::io::Write;
|
||||
use common::CountingWriter;
|
||||
use std::collections::HashMap;
|
||||
use schema::Field;
|
||||
use common::VInt;
|
||||
use directory::WritePtr;
|
||||
use std::io::{self, Read};
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
|
||||
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
|
||||
pub struct FileAddr {
|
||||
field: Field,
|
||||
idx: usize,
|
||||
}
|
||||
|
||||
impl FileAddr {
|
||||
fn new(field: Field, idx: usize) -> FileAddr {
|
||||
FileAddr { field, idx }
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for FileAddr {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
self.field.serialize(writer)?;
|
||||
VInt(self.idx as u64).serialize(writer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let field = Field::deserialize(reader)?;
|
||||
let idx = VInt::deserialize(reader)?.0 as usize;
|
||||
Ok(FileAddr {
|
||||
field,
|
||||
idx,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A `CompositeWrite` is used to write a `CompositeFile`.
|
||||
pub struct CompositeWrite<W = WritePtr> {
|
||||
write: CountingWriter<W>,
|
||||
offsets: HashMap<FileAddr, usize>,
|
||||
}
|
||||
|
||||
impl<W: Write> CompositeWrite<W> {
|
||||
/// Crate a new API writer that writes a composite file
|
||||
/// in a given write.
|
||||
pub fn wrap(w: W) -> CompositeWrite<W> {
|
||||
CompositeWrite {
|
||||
write: CountingWriter::wrap(w),
|
||||
offsets: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Start writing a new field.
|
||||
pub fn for_field(&mut self, field: Field) -> &mut CountingWriter<W> {
|
||||
self.for_field_with_idx(field, 0)
|
||||
}
|
||||
|
||||
/// Start writing a new field.
|
||||
pub fn for_field_with_idx(&mut self, field: Field, idx: usize) -> &mut CountingWriter<W> {
|
||||
let offset = self.write.written_bytes();
|
||||
let file_addr = FileAddr::new(field, idx);
|
||||
assert!(!self.offsets.contains_key(&file_addr));
|
||||
self.offsets.insert(file_addr, offset);
|
||||
&mut self.write
|
||||
}
|
||||
|
||||
/// Close the composite file.
|
||||
///
|
||||
/// An index of the different field offsets
|
||||
/// will be written as a footer.
|
||||
pub fn close(mut self) -> io::Result<()> {
|
||||
let footer_offset = self.write.written_bytes();
|
||||
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
|
||||
|
||||
let mut offset_fields: Vec<_> = self.offsets
|
||||
.iter()
|
||||
.map(|(file_addr, offset)| (*offset, *file_addr))
|
||||
.collect();
|
||||
|
||||
offset_fields.sort();
|
||||
|
||||
let mut prev_offset = 0;
|
||||
for (offset, file_addr) in offset_fields {
|
||||
VInt((offset - prev_offset) as u64).serialize(&mut self.write)?;
|
||||
file_addr.serialize(&mut self.write)?;
|
||||
prev_offset = offset;
|
||||
}
|
||||
|
||||
let footer_len = (self.write.written_bytes() - footer_offset) as u32;
|
||||
footer_len.serialize(&mut self.write)?;
|
||||
self.write.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A composite file is an abstraction to store a
|
||||
/// file partitioned by field.
|
||||
///
|
||||
/// The file needs to be written field by field.
|
||||
/// A footer describes the start and stop offsets
|
||||
/// for each field.
|
||||
#[derive(Clone)]
|
||||
pub struct CompositeFile {
|
||||
data: ReadOnlySource,
|
||||
offsets_index: HashMap<FileAddr, (usize, usize)>,
|
||||
}
|
||||
|
||||
impl CompositeFile {
|
||||
/// Opens a composite file stored in a given
|
||||
/// `ReadOnlySource`.
|
||||
pub fn open(data: &ReadOnlySource) -> io::Result<CompositeFile> {
|
||||
let end = data.len();
|
||||
let footer_len_data = data.slice_from(end - 4);
|
||||
let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
|
||||
|
||||
let footer_start = end - 4 - footer_len;
|
||||
let footer_data = data.slice(footer_start, footer_start + footer_len);
|
||||
let mut footer_buffer = footer_data.as_slice();
|
||||
let num_fields = VInt::deserialize(&mut footer_buffer)?.0 as usize;
|
||||
|
||||
let mut file_addrs = vec![];
|
||||
let mut offsets = vec![];
|
||||
|
||||
let mut field_index = HashMap::new();
|
||||
|
||||
let mut offset = 0;
|
||||
for _ in 0..num_fields {
|
||||
offset += VInt::deserialize(&mut footer_buffer)?.0 as usize;
|
||||
let file_addr = FileAddr::deserialize(&mut footer_buffer)?;
|
||||
offsets.push(offset);
|
||||
file_addrs.push(file_addr);
|
||||
}
|
||||
offsets.push(footer_start);
|
||||
for i in 0..num_fields {
|
||||
let file_addr = file_addrs[i];
|
||||
let start_offset = offsets[i];
|
||||
let end_offset = offsets[i + 1];
|
||||
field_index.insert(file_addr, (start_offset, end_offset));
|
||||
}
|
||||
|
||||
Ok(CompositeFile {
|
||||
data: data.slice_to(footer_start),
|
||||
offsets_index: field_index,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns a composite file that stores
|
||||
/// no fields.
|
||||
pub fn empty() -> CompositeFile {
|
||||
CompositeFile {
|
||||
offsets_index: HashMap::new(),
|
||||
data: ReadOnlySource::empty(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the `ReadOnlySource` associated
|
||||
/// to a given `Field` and stored in a `CompositeFile`.
|
||||
pub fn open_read(&self, field: Field) -> Option<ReadOnlySource> {
|
||||
self.open_read_with_idx(field, 0)
|
||||
}
|
||||
|
||||
/// Returns the `ReadOnlySource` associated
|
||||
/// to a given `Field` and stored in a `CompositeFile`.
|
||||
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
|
||||
self.offsets_index
|
||||
.get(&FileAddr { field, idx, })
|
||||
.map(|&(from, to)| self.data.slice(from, to))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use std::io::Write;
|
||||
use super::{CompositeFile, CompositeWrite};
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use schema::Field;
|
||||
use common::VInt;
|
||||
use common::BinarySerializable;
|
||||
use std::path::Path;
|
||||
|
||||
#[test]
|
||||
fn test_composite_file() {
|
||||
let path = Path::new("test_path");
|
||||
let mut directory = RAMDirectory::create();
|
||||
{
|
||||
let w = directory.open_write(path).unwrap();
|
||||
let mut composite_write = CompositeWrite::wrap(w);
|
||||
{
|
||||
let mut write_0 = composite_write.for_field(Field(0u32));
|
||||
VInt(32431123u64).serialize(&mut write_0).unwrap();
|
||||
write_0.flush().unwrap();
|
||||
}
|
||||
|
||||
{
|
||||
let mut write_4 = composite_write.for_field(Field(4u32));
|
||||
VInt(2).serialize(&mut write_4).unwrap();
|
||||
write_4.flush().unwrap();
|
||||
}
|
||||
composite_write.close().unwrap();
|
||||
}
|
||||
{
|
||||
let r = directory.open_read(path).unwrap();
|
||||
let composite_file = CompositeFile::open(&r).unwrap();
|
||||
{
|
||||
let file0 = composite_file.open_read(Field(0u32)).unwrap();
|
||||
let mut file0_buf = file0.as_slice();
|
||||
let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0;
|
||||
assert_eq!(file0_buf.len(), 0);
|
||||
assert_eq!(payload_0, 32431123u64);
|
||||
}
|
||||
{
|
||||
let file4 = composite_file.open_read(Field(4u32)).unwrap();
|
||||
let mut file4_buf = file4.as_slice();
|
||||
let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0;
|
||||
assert_eq!(file4_buf.len(), 0);
|
||||
assert_eq!(payload_4, 2u64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
55
src/common/counting_writer.rs
Normal file
55
src/common/counting_writer.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
use std::io::Write;
|
||||
use std::io;
|
||||
|
||||
pub struct CountingWriter<W> {
|
||||
underlying: W,
|
||||
written_bytes: usize,
|
||||
}
|
||||
|
||||
impl<W: Write> CountingWriter<W> {
|
||||
pub fn wrap(underlying: W) -> CountingWriter<W> {
|
||||
CountingWriter {
|
||||
underlying,
|
||||
written_bytes: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn written_bytes(&self) -> usize {
|
||||
self.written_bytes
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> io::Result<(W, usize)> {
|
||||
self.flush()?;
|
||||
Ok((self.underlying, self.written_bytes))
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write> Write for CountingWriter<W> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
let written_size = self.underlying.write(buf)?;
|
||||
self.written_bytes += written_size;
|
||||
Ok(written_size)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.underlying.flush()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::CountingWriter;
|
||||
use std::io::Write;
|
||||
|
||||
#[test]
|
||||
fn test_counting_writer() {
|
||||
let buffer: Vec<u8> = vec![];
|
||||
let mut counting_writer = CountingWriter::wrap(buffer);
|
||||
let bytes = (0u8..10u8).collect::<Vec<u8>>();
|
||||
counting_writer.write_all(&bytes).unwrap();
|
||||
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
|
||||
assert_eq!(len, 10);
|
||||
assert_eq!(w.len(), 10);
|
||||
}
|
||||
}
|
||||
@@ -1,42 +1,141 @@
|
||||
mod serialize;
|
||||
mod timer;
|
||||
mod vint;
|
||||
mod counting_writer;
|
||||
mod composite_file;
|
||||
pub mod bitpacker;
|
||||
mod bitset;
|
||||
|
||||
pub use self::serialize::BinarySerializable;
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::timer::Timing;
|
||||
pub use self::timer::TimerTree;
|
||||
pub use self::timer::OpenTimer;
|
||||
pub use self::vint::VInt;
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
pub use self::bitset::BitSet;
|
||||
pub(crate) use self::bitset::TinySet;
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
use std::io;
|
||||
|
||||
/// Create a default io error given a string.
|
||||
pub fn make_io_err(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
/// Computes the number of bits that will be used for bitpacking.
|
||||
///
|
||||
/// In general the target is the minimum number of bits
|
||||
/// required to express the amplitude given in argument.
|
||||
///
|
||||
/// e.g. If the amplitude is 10, we can store all ints on simply 4bits.
|
||||
///
|
||||
/// The logic is slightly more convoluted here as for optimization
|
||||
/// reasons, we want to ensure that a value spawns over at most 8 bytes
|
||||
/// of aligns bytes.
|
||||
///
|
||||
/// Spanning over 9 bytes is possible for instance, if we do
|
||||
/// bitpacking with an amplitude of 63 bits.
|
||||
/// In this case, the second int will start on bit
|
||||
/// 63 (which belongs to byte 7) and ends at byte 15;
|
||||
/// Hence 9 bytes (from byte 7 to byte 15 included).
|
||||
///
|
||||
/// To avoid this, we force the number of bits to 64bits
|
||||
/// when the result is greater than `64-8 = 56 bits`.
|
||||
///
|
||||
/// Note that this only affects rare use cases spawning over
|
||||
/// a very large range of values. Even in this case, it results
|
||||
/// in an extra cost of at most 12% compared to the optimal
|
||||
/// number of bits.
|
||||
pub(crate) fn compute_num_bits(n: u64) -> u8 {
|
||||
let amplitude = (64u32 - n.leading_zeros()) as u8;
|
||||
if amplitude <= 64 - 8 {
|
||||
amplitude
|
||||
} else {
|
||||
64
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_power_of_2(n: usize) -> bool {
|
||||
(n > 0) && (n & (n - 1) == 0)
|
||||
}
|
||||
|
||||
/// Create a default io error given a string.
|
||||
pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
}
|
||||
|
||||
/// Has length trait
|
||||
pub trait HasLen {
|
||||
/// Return length
|
||||
fn len(&self,) -> usize;
|
||||
|
||||
fn len(&self) -> usize;
|
||||
|
||||
/// Returns true iff empty.
|
||||
fn is_empty(&self,) -> bool {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
}
|
||||
|
||||
const HIGHEST_BIT: u64 = 1 << 63;
|
||||
|
||||
/// Creates an uninitialized Vec of a given usize
|
||||
/// Maps a `i64` to `u64`
|
||||
///
|
||||
/// `allocate_vec` does an unsafe call to `set_len`
|
||||
/// as other solution are extremely slow in debug mode.
|
||||
pub fn allocate_vec<T>(capacity: usize) -> Vec<T> {
|
||||
let mut v = Vec::with_capacity(capacity);
|
||||
unsafe {
|
||||
v.set_len(capacity);
|
||||
/// For simplicity, tantivy internally handles `i64` as `u64`.
|
||||
/// The mapping is defined by this function.
|
||||
///
|
||||
/// Maps `i64` to `u64` so that
|
||||
/// `-2^63 .. 2^63-1` is mapped
|
||||
/// to
|
||||
/// `0 .. 2^64-1`
|
||||
/// in that order.
|
||||
///
|
||||
/// This is more suited than simply casting (`val as u64`)
|
||||
/// because of bitpacking.
|
||||
///
|
||||
/// Imagine a list of `i64` ranging from -10 to 10.
|
||||
/// When casting negative values, the negative values are projected
|
||||
/// to values over 2^63, and all values end up requiring 64 bits.
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
|
||||
#[inline(always)]
|
||||
pub fn i64_to_u64(val: i64) -> u64 {
|
||||
(val as u64) ^ HIGHEST_BIT
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
||||
#[inline(always)]
|
||||
pub fn u64_to_i64(val: u64) -> i64 {
|
||||
(val ^ HIGHEST_BIT) as i64
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
|
||||
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
|
||||
pub use super::serialize::test::fixed_size_test;
|
||||
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_i64_converter() {
|
||||
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
|
||||
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
|
||||
test_i64_converter_helper(0i64);
|
||||
test_i64_converter_helper(i64::min_value());
|
||||
test_i64_converter_helper(i64::max_value());
|
||||
for i in -1000i64..1000i64 {
|
||||
test_i64_converter_helper(i);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compute_num_bits() {
|
||||
assert_eq!(compute_num_bits(1), 1u8);
|
||||
assert_eq!(compute_num_bits(0), 0u8);
|
||||
assert_eq!(compute_num_bits(2), 2u8);
|
||||
assert_eq!(compute_num_bits(3), 2u8);
|
||||
assert_eq!(compute_num_bits(4), 3u8);
|
||||
assert_eq!(compute_num_bits(255), 8u8);
|
||||
assert_eq!(compute_num_bits(256), 9u8);
|
||||
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,165 +1,210 @@
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use byteorder::LittleEndian as Endianness;
|
||||
use common::Endianness;
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
use std::io;
|
||||
use common::VInt;
|
||||
|
||||
pub trait BinarySerializable : fmt::Debug + Sized {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize>;
|
||||
fn deserialize(reader: &mut Read) -> io::Result<Self>;
|
||||
/// Trait for a simple binary serialization.
|
||||
pub trait BinarySerializable: fmt::Debug + Sized {
|
||||
/// Serialize
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()>;
|
||||
/// Deserialize
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
|
||||
}
|
||||
|
||||
/// `FixedSize` marks a `BinarySerializable` as
|
||||
/// always serializing to the same size.
|
||||
pub trait FixedSize: BinarySerializable {
|
||||
const SIZE_IN_BYTES: usize;
|
||||
}
|
||||
|
||||
impl BinarySerializable for () {
|
||||
fn serialize(&self, _: &mut Write) -> io::Result<usize> {
|
||||
Ok(0)
|
||||
fn serialize<W: Write>(&self, _: &mut W) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
fn deserialize(_: &mut Read) -> io::Result<Self> {
|
||||
fn deserialize<R: Read>(_: &mut R) -> io::Result<Self> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for () {
|
||||
const SIZE_IN_BYTES: usize = 0;
|
||||
}
|
||||
|
||||
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
let mut total_size = try!(VInt(self.len() as u64).serialize(writer));
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
VInt(self.len() as u64).serialize(writer)?;
|
||||
for it in self {
|
||||
total_size += try!(it.serialize(writer));
|
||||
it.serialize(writer)?;
|
||||
}
|
||||
Ok(total_size)
|
||||
Ok(())
|
||||
}
|
||||
fn deserialize(reader: &mut Read) -> io::Result<Vec<T>> {
|
||||
let num_items = try!(VInt::deserialize(reader)).val();
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
|
||||
let num_items = VInt::deserialize(reader)?.val();
|
||||
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
|
||||
for _ in 0..num_items {
|
||||
let item = try!(T::deserialize(reader));
|
||||
let item = T::deserialize(reader)?;
|
||||
items.push(item);
|
||||
}
|
||||
Ok(items)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<Left: BinarySerializable, Right: BinarySerializable> BinarySerializable for (Left, Right) {
|
||||
fn serialize(&self, write: &mut Write) -> io::Result<usize> {
|
||||
Ok(try!(self.0.serialize(write)) + try!(self.1.serialize(write)))
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.0.serialize(write)?;
|
||||
self.1.serialize(write)
|
||||
}
|
||||
fn deserialize(reader: &mut Read) -> io::Result<Self> {
|
||||
Ok( (try!(Left::deserialize(reader)), try!(Right::deserialize(reader))) )
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
Ok((Left::deserialize(reader)?, Right::deserialize(reader)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for u32 {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_u32::<Endianness>(*self)
|
||||
.map(|_| 4)
|
||||
}
|
||||
|
||||
fn deserialize(reader: &mut Read) -> io::Result<u32> {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u32> {
|
||||
reader.read_u32::<Endianness>()
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for u32 {
|
||||
const SIZE_IN_BYTES: usize = 4;
|
||||
}
|
||||
|
||||
impl BinarySerializable for u64 {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_u64::<Endianness>(*self)
|
||||
.map(|_| 8)
|
||||
}
|
||||
fn deserialize(reader: &mut Read) -> io::Result<u64> {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
reader.read_u64::<Endianness>()
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for u64 {
|
||||
const SIZE_IN_BYTES: usize = 8;
|
||||
}
|
||||
|
||||
impl BinarySerializable for i64 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_i64::<Endianness>(*self)
|
||||
}
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
reader.read_i64::<Endianness>()
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for i64 {
|
||||
const SIZE_IN_BYTES: usize = 8;
|
||||
}
|
||||
|
||||
impl BinarySerializable for u8 {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
try!(writer.write_u8(*self));
|
||||
Ok(1)
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_u8(*self)
|
||||
}
|
||||
fn deserialize(reader: &mut Read) -> io::Result<u8> {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<u8> {
|
||||
reader.read_u8()
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for u8 {
|
||||
const SIZE_IN_BYTES: usize = 1;
|
||||
}
|
||||
|
||||
impl BinarySerializable for String {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let data: &[u8] = self.as_bytes();
|
||||
let mut size = try!(VInt(data.len() as u64).serialize(writer));
|
||||
size += data.len();
|
||||
try!(writer.write_all(data));
|
||||
Ok(size)
|
||||
VInt(data.len() as u64).serialize(writer)?;
|
||||
writer.write_all(data)
|
||||
}
|
||||
|
||||
fn deserialize(reader: &mut Read) -> io::Result<String> {
|
||||
let string_length = try!(VInt::deserialize(reader)).val() as usize;
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
|
||||
let string_length = VInt::deserialize(reader)?.val() as usize;
|
||||
let mut result = String::with_capacity(string_length);
|
||||
try!(reader.take(string_length as u64).read_to_string(&mut result));
|
||||
reader
|
||||
.take(string_length as u64)
|
||||
.read_to_string(&mut result)?;
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
pub mod test {
|
||||
|
||||
use common::VInt;
|
||||
use super::*;
|
||||
|
||||
fn serialize_test<T: BinarySerializable + Eq>(v: T, num_bytes: usize) {
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
O::default().serialize(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
|
||||
}
|
||||
|
||||
if num_bytes != 0 {
|
||||
assert_eq!(v.serialize(&mut buffer).unwrap(), num_bytes);
|
||||
assert_eq!(buffer.len(), num_bytes);
|
||||
}
|
||||
else {
|
||||
v.serialize(&mut buffer).unwrap();
|
||||
}
|
||||
fn serialize_test<T: BinarySerializable + Eq>(v: T) -> usize {
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
v.serialize(&mut buffer).unwrap();
|
||||
let num_bytes = buffer.len();
|
||||
let mut cursor = &buffer[..];
|
||||
let deser = T::deserialize(&mut cursor).unwrap();
|
||||
assert_eq!(deser, v);
|
||||
num_bytes
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_u8() {
|
||||
serialize_test(3u8, 1);
|
||||
serialize_test(5u8, 1);
|
||||
fixed_size_test::<u8>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_u32() {
|
||||
serialize_test(3u32, 4);
|
||||
serialize_test(5u32, 4);
|
||||
serialize_test(u32::max_value(), 4);
|
||||
fixed_size_test::<u32>();
|
||||
assert_eq!(4, serialize_test(3u32));
|
||||
assert_eq!(4, serialize_test(5u32));
|
||||
assert_eq!(4, serialize_test(u32::max_value()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_i64() {
|
||||
fixed_size_test::<i64>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_u64() {
|
||||
fixed_size_test::<u64>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_string() {
|
||||
serialize_test(String::from(""), 1);
|
||||
serialize_test(String::from("ぽよぽよ"), 1 + 3*4);
|
||||
serialize_test(String::from("富士さん見える。"), 1 + 3*8);
|
||||
assert_eq!(serialize_test(String::from("")), 1);
|
||||
assert_eq!(serialize_test(String::from("ぽよぽよ")), 1 + 3 * 4);
|
||||
assert_eq!(
|
||||
serialize_test(String::from("富士さん見える。")),
|
||||
1 + 3 * 8
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_vec() {
|
||||
let v: Vec<u8> = Vec::new();
|
||||
serialize_test(v, 1);
|
||||
serialize_test(vec!(1u32, 3u32), 1 + 4*2);
|
||||
assert_eq!(serialize_test(Vec::<u8>::new()), 1);
|
||||
assert_eq!(serialize_test(vec![1u32, 3u32]), 1 + 4 * 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_vint() {
|
||||
for i in 0..10_000 {
|
||||
serialize_test(VInt(i as u64), 0);
|
||||
serialize_test(VInt(i as u64));
|
||||
}
|
||||
serialize_test(VInt(7u64), 1);
|
||||
serialize_test(VInt(127u64), 1);
|
||||
serialize_test(VInt(128u64), 2);
|
||||
serialize_test(VInt(129u64), 2);
|
||||
serialize_test(VInt(1234u64), 2);
|
||||
serialize_test(VInt(16_383), 2);
|
||||
serialize_test(VInt(16_384), 3);
|
||||
serialize_test(VInt(u64::max_value()), 10);
|
||||
assert_eq!(serialize_test(VInt(7u64)), 1);
|
||||
assert_eq!(serialize_test(VInt(127u64)), 1);
|
||||
assert_eq!(serialize_test(VInt(128u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(129u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(1234u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(16_383u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(16_384u64)), 3);
|
||||
assert_eq!(serialize_test(VInt(u64::max_value())), 10);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,11 +10,11 @@ pub struct OpenTimer<'a> {
|
||||
impl<'a> OpenTimer<'a> {
|
||||
/// Starts timing a new named subtask
|
||||
///
|
||||
/// The timer is stopped automatically
|
||||
/// The timer is stopped automatically
|
||||
/// when the `OpenTimer` is dropped.
|
||||
pub fn open(&mut self, name: &'static str) -> OpenTimer {
|
||||
OpenTimer {
|
||||
name: name,
|
||||
name,
|
||||
timer_tree: self.timer_tree,
|
||||
start: PreciseTime::now(),
|
||||
depth: self.depth + 1,
|
||||
@@ -23,17 +23,20 @@ impl<'a> OpenTimer<'a> {
|
||||
}
|
||||
|
||||
impl<'a> Drop for OpenTimer<'a> {
|
||||
fn drop(&mut self,) {
|
||||
self.timer_tree.timings.push(Timing {
|
||||
fn drop(&mut self) {
|
||||
self.timer_tree.timings.push(Timing {
|
||||
name: self.name,
|
||||
duration: self.start.to(PreciseTime::now()).num_microseconds().unwrap(),
|
||||
duration: self.start
|
||||
.to(PreciseTime::now())
|
||||
.num_microseconds()
|
||||
.unwrap(),
|
||||
depth: self.depth,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Timing recording
|
||||
#[derive(Debug, RustcEncodable)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Timing {
|
||||
name: &'static str,
|
||||
duration: i64,
|
||||
@@ -41,22 +44,21 @@ pub struct Timing {
|
||||
}
|
||||
|
||||
/// Timer tree
|
||||
#[derive(Debug, RustcEncodable)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct TimerTree {
|
||||
timings: Vec<Timing>,
|
||||
}
|
||||
|
||||
impl TimerTree {
|
||||
|
||||
/// Returns the total time elapsed in microseconds
|
||||
pub fn total_time(&self,) -> i64 {
|
||||
/// Returns the total time elapsed in microseconds
|
||||
pub fn total_time(&self) -> i64 {
|
||||
self.timings.last().unwrap().duration
|
||||
}
|
||||
|
||||
|
||||
/// Open a new named subtask
|
||||
pub fn open(&mut self, name: &'static str) -> OpenTimer {
|
||||
OpenTimer {
|
||||
name: name,
|
||||
name,
|
||||
timer_tree: self,
|
||||
start: PreciseTime::now(),
|
||||
depth: 0,
|
||||
@@ -72,7 +74,6 @@ impl Default for TimerTree {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -3,59 +3,59 @@ use std::io;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
|
||||
|
||||
|
||||
/// Wrapper over a `u64` that serializes as a variable int.
|
||||
/// Wrapper over a `u64` that serializes as a variable int.
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub struct VInt(pub u64);
|
||||
|
||||
impl VInt {
|
||||
pub fn val(&self,) -> u64 {
|
||||
pub fn val(&self) -> u64 {
|
||||
self.0
|
||||
}
|
||||
|
||||
pub fn deserialize_u64<R: Read>(reader: &mut R) -> io::Result<u64> {
|
||||
VInt::deserialize(reader).map(|vint| vint.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for VInt {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let mut remaining = self.0;
|
||||
let mut written: usize = 0;
|
||||
let mut buffer = [0u8; 10];
|
||||
let mut i = 0;
|
||||
loop {
|
||||
let next_byte: u8 = (remaining % 128u64) as u8;
|
||||
remaining /= 128u64;
|
||||
if remaining == 0u64 {
|
||||
buffer[written] = next_byte | 128u8;
|
||||
written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
buffer[written] = next_byte;
|
||||
written += 1;
|
||||
buffer[i] = next_byte | 128u8;
|
||||
return writer.write_all(&buffer[0..i + 1]);
|
||||
} else {
|
||||
buffer[i] = next_byte;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
try!(writer.write_all(&buffer[0..written]));
|
||||
Ok(written)
|
||||
}
|
||||
|
||||
fn deserialize(reader: &mut Read) -> io::Result<Self> {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let mut bytes = reader.bytes();
|
||||
let mut result = 0u64;
|
||||
let mut shift = 0u64;
|
||||
loop {
|
||||
match bytes.next() {
|
||||
Some(Ok(b)) => {
|
||||
result += ((b % 128u8) as u64) << shift;
|
||||
result += u64::from(b % 128u8) << shift;
|
||||
if b & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
_ => {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "Reach end of buffer"))
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Reach end of buffer",
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(VInt(result))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,159 +0,0 @@
|
||||
use super::{BlockEncoder, BlockDecoder};
|
||||
use super::NUM_DOCS_PER_BLOCK;
|
||||
use compression::{VIntEncoder, VIntDecoder};
|
||||
|
||||
pub struct CompositeEncoder {
|
||||
block_encoder: BlockEncoder,
|
||||
output: Vec<u8>,
|
||||
}
|
||||
|
||||
impl CompositeEncoder {
|
||||
|
||||
pub fn new() -> CompositeEncoder {
|
||||
CompositeEncoder {
|
||||
block_encoder: BlockEncoder::new(),
|
||||
output: Vec::with_capacity(500_000),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compress_sorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
self.output.clear();
|
||||
let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
|
||||
let mut offset = 0u32;
|
||||
for i in 0..num_blocks {
|
||||
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK .. (i + 1) * NUM_DOCS_PER_BLOCK];
|
||||
let block_compressed = self.block_encoder.compress_block_sorted(vals_slice, offset);
|
||||
offset = vals_slice[NUM_DOCS_PER_BLOCK - 1];
|
||||
self.output.extend_from_slice(block_compressed);
|
||||
}
|
||||
let vint_compressed = self.block_encoder.compress_vint_sorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..], offset);
|
||||
self.output.extend_from_slice(vint_compressed);
|
||||
&self.output
|
||||
}
|
||||
|
||||
pub fn compress_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
self.output.clear();
|
||||
let num_blocks = vals.len() / NUM_DOCS_PER_BLOCK;
|
||||
for i in 0..num_blocks {
|
||||
let vals_slice = &vals[i * NUM_DOCS_PER_BLOCK .. (i + 1) * NUM_DOCS_PER_BLOCK];
|
||||
let block_compressed = self.block_encoder.compress_block_unsorted(vals_slice);
|
||||
self.output.extend_from_slice(block_compressed);
|
||||
}
|
||||
let vint_compressed = self.block_encoder.compress_vint_unsorted(&vals[num_blocks * NUM_DOCS_PER_BLOCK..]);
|
||||
self.output.extend_from_slice(vint_compressed);
|
||||
&self.output
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct CompositeDecoder {
|
||||
block_decoder: BlockDecoder,
|
||||
vals: Vec<u32>,
|
||||
}
|
||||
|
||||
|
||||
impl CompositeDecoder {
|
||||
pub fn new() -> CompositeDecoder {
|
||||
CompositeDecoder {
|
||||
block_decoder: BlockDecoder::new(),
|
||||
vals: Vec::with_capacity(500_000),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_sorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] {
|
||||
if uncompressed_len > self.vals.capacity() {
|
||||
let extra_capacity = uncompressed_len - self.vals.capacity();
|
||||
self.vals.reserve(extra_capacity);
|
||||
}
|
||||
let mut offset = 0u32;
|
||||
self.vals.clear();
|
||||
let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
|
||||
for _ in 0..num_blocks {
|
||||
compressed_data = self.block_decoder.uncompress_block_sorted(compressed_data, offset);
|
||||
offset = self.block_decoder.output(NUM_DOCS_PER_BLOCK - 1);
|
||||
self.vals.extend_from_slice(self.block_decoder.output_array());
|
||||
}
|
||||
self.block_decoder.uncompress_vint_sorted(compressed_data, offset, uncompressed_len % NUM_DOCS_PER_BLOCK);
|
||||
self.vals.extend_from_slice(self.block_decoder.output_array());
|
||||
&self.vals
|
||||
}
|
||||
|
||||
pub fn uncompress_unsorted(&mut self, mut compressed_data: &[u8], uncompressed_len: usize) -> &[u32] {
|
||||
self.vals.clear();
|
||||
let num_blocks = uncompressed_len / NUM_DOCS_PER_BLOCK;
|
||||
for _ in 0..num_blocks {
|
||||
compressed_data = self.block_decoder.uncompress_block_unsorted(compressed_data);
|
||||
self.vals.extend_from_slice(self.block_decoder.output_array());
|
||||
}
|
||||
self.block_decoder.uncompress_vint_unsorted(compressed_data, uncompressed_len % NUM_DOCS_PER_BLOCK);
|
||||
self.vals.extend_from_slice(self.block_decoder.output_array());
|
||||
&self.vals
|
||||
}
|
||||
}
|
||||
|
||||
impl Into<Vec<u32>> for CompositeDecoder {
|
||||
fn into(self) -> Vec<u32> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use test::Bencher;
|
||||
use super::*;
|
||||
use compression::tests::generate_array;
|
||||
|
||||
#[test]
|
||||
fn test_composite_unsorted() {
|
||||
let data = generate_array(10_000, 0.1);
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let compressed = encoder.compress_unsorted(&data);
|
||||
assert_eq!(compressed.len(), 19_790);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
let result = decoder.uncompress_unsorted(&compressed, data.len());
|
||||
for i in 0..data.len() {
|
||||
assert_eq!(data[i], result[i]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_composite_sorted() {
|
||||
let data = generate_array(10_000, 0.1);
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let compressed = encoder.compress_sorted(&data);
|
||||
assert_eq!(compressed.len(), 7_822);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
let result = decoder.uncompress_sorted(&compressed, data.len());
|
||||
for i in 0..data.len() {
|
||||
assert_eq!(data[i], result[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
const BENCH_NUM_INTS: usize = 99_968;
|
||||
|
||||
#[bench]
|
||||
fn bench_compress(b: &mut Bencher) {
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let data = generate_array(BENCH_NUM_INTS, 0.1);
|
||||
b.iter(|| {
|
||||
encoder.compress_sorted(&data);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_uncompress(b: &mut Bencher) {
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let data = generate_array(BENCH_NUM_INTS, 0.1);
|
||||
let compressed = encoder.compress_sorted(&data);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_sorted(compressed, BENCH_NUM_INTS);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,127 +0,0 @@
|
||||
use common::bitpacker::compute_num_bits;
|
||||
use common::bitpacker::{BitPacker, BitUnpacker};
|
||||
use std::cmp;
|
||||
use std::io::Write;
|
||||
use super::NUM_DOCS_PER_BLOCK;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
|
||||
pub fn compress_sorted(vals: &mut [u32], mut output: &mut [u8], offset: u32) -> usize {
|
||||
let mut max_delta = 0;
|
||||
{
|
||||
let mut local_offset = offset;
|
||||
for i in 0..NUM_DOCS_PER_BLOCK {
|
||||
let val = vals[i];
|
||||
let delta = val - local_offset;
|
||||
max_delta = cmp::max(max_delta, delta);
|
||||
vals[i] = delta;
|
||||
local_offset = val;
|
||||
}
|
||||
}
|
||||
let num_bits = compute_num_bits(max_delta);
|
||||
output.write_all(&[num_bits]).unwrap();
|
||||
let mut bit_packer = BitPacker::new(num_bits as usize);
|
||||
for val in vals {
|
||||
bit_packer.write(*val, &mut output).unwrap();
|
||||
}
|
||||
1 + bit_packer.close(&mut output).expect("packing in memory should never fail")
|
||||
}
|
||||
|
||||
|
||||
|
||||
pub struct BlockEncoder {
|
||||
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
pub output_len: usize,
|
||||
input_buffer: [u32; NUM_DOCS_PER_BLOCK],
|
||||
}
|
||||
|
||||
impl BlockEncoder {
|
||||
|
||||
pub fn new() -> BlockEncoder {
|
||||
BlockEncoder {
|
||||
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
input_buffer: [0u32; NUM_DOCS_PER_BLOCK],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
|
||||
self.input_buffer.clone_from_slice(vals);
|
||||
let compressed_size = compress_sorted(&mut self.input_buffer, &mut self.output, offset);
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
let compressed_size: usize = {
|
||||
let mut output: &mut [u8] = &mut self.output;
|
||||
let max = vals.iter().cloned().max().expect("compress unsorted called with an empty array");
|
||||
let num_bits = compute_num_bits(max);
|
||||
output.write_all(&[num_bits]).unwrap();
|
||||
let mut bit_packer = BitPacker::new(num_bits as usize);
|
||||
for val in vals {
|
||||
bit_packer.write(*val, &mut output).unwrap();
|
||||
}
|
||||
1 + bit_packer.close(&mut output).expect("packing in memory should never fail")
|
||||
};
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub struct BlockDecoder {
|
||||
pub output: [u32; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
pub output_len: usize,
|
||||
}
|
||||
|
||||
|
||||
impl BlockDecoder {
|
||||
pub fn new() -> BlockDecoder {
|
||||
BlockDecoder::with_val(0u32)
|
||||
}
|
||||
|
||||
pub fn with_val(val: u32) -> BlockDecoder {
|
||||
BlockDecoder {
|
||||
output: [val; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_block_sorted<'a>(&mut self, compressed_data: &'a [u8], mut offset: u32) -> &'a[u8] {
|
||||
let consumed_size = {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
|
||||
for i in 0..NUM_DOCS_PER_BLOCK {
|
||||
let delta = bit_unpacker.get(i);
|
||||
let val = offset + delta;
|
||||
self.output[i] = val;
|
||||
offset = val;
|
||||
}
|
||||
1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8
|
||||
};
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a[u8] {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits as usize);
|
||||
for i in 0..NUM_DOCS_PER_BLOCK {
|
||||
self.output[i] = bit_unpacker.get(i);
|
||||
}
|
||||
let consumed_size = 1 + (num_bits as usize * NUM_DOCS_PER_BLOCK + 7) / 8;
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output_array(&self,) -> &[u32] {
|
||||
&self.output[..self.output_len]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output(&self, idx: usize) -> u32 {
|
||||
self.output[idx]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,164 +1,135 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
|
||||
mod composite;
|
||||
pub use self::composite::{CompositeEncoder, CompositeDecoder};
|
||||
mod stream;
|
||||
|
||||
#[cfg(feature="simdcompression")]
|
||||
mod compression_simd;
|
||||
#[cfg(feature="simdcompression")]
|
||||
pub use self::compression_simd::{BlockEncoder, BlockDecoder};
|
||||
pub use self::stream::CompressedIntStream;
|
||||
|
||||
pub const COMPRESSION_BLOCK_SIZE: usize = 128;
|
||||
|
||||
#[cfg(not(feature="simdcompression"))]
|
||||
mod compression_nosimd;
|
||||
#[cfg(not(feature="simdcompression"))]
|
||||
pub use self::compression_nosimd::{BlockEncoder, BlockDecoder};
|
||||
/// Returns the size in bytes of a compressed block, given `num_bits`.
|
||||
pub fn compressed_block_size(num_bits: u8) -> usize {
|
||||
1 + (num_bits as usize) * 16
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "simdcompression"))]
|
||||
mod pack {
|
||||
mod compression_pack_nosimd;
|
||||
pub use self::compression_pack_nosimd::{BlockDecoder, BlockEncoder};
|
||||
}
|
||||
|
||||
#[cfg(feature = "simdcompression")]
|
||||
mod pack {
|
||||
mod compression_pack_simd;
|
||||
pub use self::compression_pack_simd::{BlockDecoder, BlockEncoder};
|
||||
}
|
||||
|
||||
pub use self::pack::{BlockDecoder, BlockEncoder};
|
||||
|
||||
#[cfg(any(not(feature = "simdcompression"), target_env = "msvc"))]
|
||||
mod vint {
|
||||
mod compression_vint_nosimd;
|
||||
pub(crate) use self::compression_vint_nosimd::*;
|
||||
}
|
||||
|
||||
#[cfg(all(feature = "simdcompression", not(target_env = "msvc")))]
|
||||
mod vint {
|
||||
mod compression_vint_simd;
|
||||
pub(crate) use self::compression_vint_simd::*;
|
||||
}
|
||||
|
||||
pub trait VIntEncoder {
|
||||
/// Compresses an array of `u32` integers,
|
||||
/// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
|
||||
/// and variable bytes encoding.
|
||||
///
|
||||
/// The method takes an array of ints to compress, and returns
|
||||
/// a `&[u8]` representing the compressed data.
|
||||
///
|
||||
/// The method also takes an offset to give the value of the
|
||||
/// hypothetical previous element in the delta-encoding.
|
||||
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8];
|
||||
|
||||
/// Compresses an array of `u32` integers,
|
||||
/// using variable bytes encoding.
|
||||
///
|
||||
/// The method takes an array of ints to compress, and returns
|
||||
/// a `&[u8]` representing the compressed data.
|
||||
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8];
|
||||
}
|
||||
|
||||
pub trait VIntDecoder {
|
||||
fn uncompress_vint_sorted<'a>(&mut self, compressed_data: &'a [u8], offset: u32, num_els: usize) -> &'a [u8];
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> &'a [u8];
|
||||
}
|
||||
|
||||
impl VIntEncoder for BlockEncoder{
|
||||
|
||||
fn compress_vint_sorted(&mut self, input: &[u32], mut offset: u32) -> &[u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v - offset;
|
||||
offset = v;
|
||||
loop {
|
||||
let next_byte: u8 = (to_encode % 128u32) as u8;
|
||||
to_encode /= 128u32;
|
||||
if to_encode == 0u32 {
|
||||
self.output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
self.output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
&self.output[..byte_written]
|
||||
}
|
||||
|
||||
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v;
|
||||
loop {
|
||||
let next_byte: u8 = (to_encode % 128u32) as u8;
|
||||
to_encode /= 128u32;
|
||||
if to_encode == 0u32 {
|
||||
self.output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
self.output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
&self.output[..byte_written]
|
||||
}
|
||||
}
|
||||
|
||||
impl VIntDecoder for BlockDecoder {
|
||||
|
||||
/// Uncompress an array of `u32` integers,
|
||||
/// that were compressed using [delta-encoding](https://en.wikipedia.org/wiki/Delta_encoding)
|
||||
/// and variable bytes encoding.
|
||||
///
|
||||
/// The method takes a number of int to decompress, and returns
|
||||
/// the amount of bytes that were read to decompress them.
|
||||
///
|
||||
/// The method also takes an offset to give the value of the
|
||||
/// hypothetical previous element in the delta-encoding.
|
||||
///
|
||||
/// For instance, if delta encoded are `1, 3, 9`, and the
|
||||
/// `offset` is 5, then the output will be:
|
||||
/// `5 + 1 = 6, 6 + 3= 9, 9 + 9 = 18`
|
||||
fn uncompress_vint_sorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize) -> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
for i in 0..num_els {
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
result += ((cur_byte % 128u8) as u32) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
self.output[i] = result;
|
||||
}
|
||||
self.output_len = num_els;
|
||||
&compressed_data[read_byte..]
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
num_els: usize) -> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
for i in 0..num_els {
|
||||
let mut result = 0u32;
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
result += ((cur_byte % 128u8) as u32) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
self.output[i] = result;
|
||||
}
|
||||
self.output_len = num_els;
|
||||
&compressed_data[read_byte..]
|
||||
}
|
||||
|
||||
num_els: usize,
|
||||
) -> usize;
|
||||
|
||||
/// Uncompress an array of `u32s`, compressed using variable
|
||||
/// byte encoding.
|
||||
///
|
||||
/// The method takes a number of int to decompress, and returns
|
||||
/// the amount of bytes that were read to decompress them.
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize;
|
||||
}
|
||||
|
||||
|
||||
|
||||
impl VIntEncoder for BlockEncoder {
|
||||
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8] {
|
||||
vint::compress_sorted(input, &mut self.output, offset)
|
||||
}
|
||||
|
||||
pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize.
|
||||
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] {
|
||||
vint::compress_unsorted(input, &mut self.output)
|
||||
}
|
||||
}
|
||||
|
||||
impl VIntDecoder for BlockDecoder {
|
||||
fn uncompress_vint_sorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize,
|
||||
) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::XorShiftRng;
|
||||
use super::*;
|
||||
use tests;
|
||||
use test::Bencher;
|
||||
|
||||
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..u32::max_value())
|
||||
.filter(|_| rng.next_f32()< ratio)
|
||||
.take(n)
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
generate_array_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_sorted_block() {
|
||||
let vals: Vec<u32> = (0u32..128u32).map(|i| i*7).collect();
|
||||
let vals: Vec<u32> = (0u32..128u32).map(|i| i * 7).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed_data = encoder.compress_block_sorted(&vals, 0);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
{
|
||||
let remaining_data = decoder.uncompress_block_sorted(compressed_data, 0);
|
||||
assert_eq!(remaining_data.len(), 0);
|
||||
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 0);
|
||||
assert_eq!(consumed_num_bytes, compressed_data.len());
|
||||
}
|
||||
for i in 0..128 {
|
||||
assert_eq!(vals[i], decoder.output(i));
|
||||
@@ -167,33 +138,33 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_encode_sorted_block_with_offset() {
|
||||
let vals: Vec<u32> = (0u32..128u32).map(|i| 11 + i*7).collect();
|
||||
let vals: Vec<u32> = (0u32..128u32).map(|i| 11 + i * 7).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed_data = encoder.compress_block_sorted(&vals, 10);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
{
|
||||
let remaining_data = decoder.uncompress_block_sorted(compressed_data, 10);
|
||||
assert_eq!(remaining_data.len(), 0);
|
||||
let consumed_num_bytes = decoder.uncompress_block_sorted(compressed_data, 10);
|
||||
assert_eq!(consumed_num_bytes, compressed_data.len());
|
||||
}
|
||||
for i in 0..128 {
|
||||
assert_eq!(vals[i], decoder.output(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_encode_sorted_block_with_junk() {
|
||||
let mut compressed: Vec<u8> = Vec::new();
|
||||
let n = 128;
|
||||
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32)*7u32).collect();
|
||||
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed_data = encoder.compress_block_sorted(&vals, 10);
|
||||
compressed.extend_from_slice(compressed_data);
|
||||
compressed.push(173u8);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
{
|
||||
let remaining_data = decoder.uncompress_block_sorted(&compressed, 10);
|
||||
assert_eq!(remaining_data.len(), 1);
|
||||
assert_eq!(remaining_data[0], 173u8);
|
||||
let consumed_num_bytes = decoder.uncompress_block_sorted(&compressed, 10);
|
||||
assert_eq!(consumed_num_bytes, compressed.len() - 1);
|
||||
assert_eq!(compressed[consumed_num_bytes], 173u8);
|
||||
}
|
||||
for i in 0..n {
|
||||
assert_eq!(vals[i], decoder.output(i));
|
||||
@@ -204,72 +175,94 @@ pub mod tests {
|
||||
fn test_encode_unsorted_block_with_junk() {
|
||||
let mut compressed: Vec<u8> = Vec::new();
|
||||
let n = 128;
|
||||
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32)*7u32 % 12).collect();
|
||||
let vals: Vec<u32> = (0..n).map(|i| 11u32 + (i as u32) * 7u32 % 12).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed_data = encoder.compress_block_unsorted(&vals);
|
||||
compressed.extend_from_slice(compressed_data);
|
||||
compressed.push(173u8);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
{
|
||||
let remaining_data = decoder.uncompress_block_unsorted(&compressed);
|
||||
assert_eq!(remaining_data.len(), 1);
|
||||
assert_eq!(remaining_data[0], 173u8);
|
||||
let consumed_num_bytes = decoder.uncompress_block_unsorted(&compressed);
|
||||
assert_eq!(consumed_num_bytes + 1, compressed.len());
|
||||
assert_eq!(compressed[consumed_num_bytes], 173u8);
|
||||
}
|
||||
for i in 0..n {
|
||||
assert_eq!(vals[i], decoder.output(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_encode_vint() {
|
||||
{
|
||||
let expected_length = 123;
|
||||
let expected_length = 154;
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let input: Vec<u32> = (0u32..123u32)
|
||||
.map(|i| 4 + i * 7 / 2)
|
||||
.into_iter()
|
||||
.collect();
|
||||
let input: Vec<u32> = (0u32..123u32).map(|i| 4 + i * 7 / 2).into_iter().collect();
|
||||
for offset in &[0u32, 1u32, 2u32] {
|
||||
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
assert!(encoded_data.len() <= expected_length);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let remaining_data = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
|
||||
assert_eq!(0, remaining_data.len());
|
||||
let consumed_num_bytes =
|
||||
decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
|
||||
assert_eq!(consumed_num_bytes, encoded_data.len());
|
||||
assert_eq!(input, decoder.output_array());
|
||||
}
|
||||
}
|
||||
{
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let input = vec!(3u32, 17u32, 187u32);
|
||||
let encoded_data = encoder.compress_vint_sorted(&input, 0);
|
||||
assert_eq!(encoded_data.len(), 4);
|
||||
assert_eq!(encoded_data[0], 3u8 + 128u8);
|
||||
assert_eq!(encoded_data[1], (17u8 - 3u8) + 128u8);
|
||||
assert_eq!(encoded_data[2], (187u8 - 17u8 - 128u8));
|
||||
assert_eq!(encoded_data[3], (1u8 + 128u8));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[bench]
|
||||
fn bench_compress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = generate_array(NUM_DOCS_PER_BLOCK, 0.1);
|
||||
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
b.iter(|| {
|
||||
encoder.compress_block_sorted(&data, 0u32);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
#[bench]
|
||||
fn bench_uncompress(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = generate_array(NUM_DOCS_PER_BLOCK, 0.1);
|
||||
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||
let compressed = encoder.compress_block_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_block_sorted(compressed, 0u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_all_docs_compression_numbits() {
|
||||
for num_bits in 0..33 {
|
||||
let mut data = [0u32; 128];
|
||||
if num_bits > 0 {
|
||||
data[0] = 1 << (num_bits - 1);
|
||||
}
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed = encoder.compress_block_unsorted(&data);
|
||||
assert_eq!(compressed[0] as usize, num_bits);
|
||||
assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
|
||||
}
|
||||
}
|
||||
|
||||
const NUM_INTS_BENCH_VINT: usize = 10;
|
||||
|
||||
#[bench]
|
||||
fn bench_compress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
b.iter(|| {
|
||||
encoder.compress_vint_sorted(&data, 0u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_uncompress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let compressed = encoder.compress_vint_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
148
src/compression/pack/compression_pack_nosimd.rs
Normal file
148
src/compression/pack/compression_pack_nosimd.rs
Normal file
@@ -0,0 +1,148 @@
|
||||
use common::compute_num_bits;
|
||||
use common::bitpacker::{BitPacker, BitUnpacker};
|
||||
use common::CountingWriter;
|
||||
use std::cmp;
|
||||
use std::io::Write;
|
||||
use super::super::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
pub fn compress_sorted(vals: &mut [u32], output: &mut [u8], offset: u32) -> usize {
|
||||
let mut max_delta = 0;
|
||||
{
|
||||
let mut local_offset = offset;
|
||||
for i in 0..COMPRESSION_BLOCK_SIZE {
|
||||
let val = vals[i];
|
||||
let delta = val - local_offset;
|
||||
max_delta = cmp::max(max_delta, delta);
|
||||
vals[i] = delta;
|
||||
local_offset = val;
|
||||
}
|
||||
}
|
||||
let mut counting_writer = CountingWriter::wrap(output);
|
||||
let num_bits = compute_num_bits(max_delta as u64);
|
||||
counting_writer.write_all(&[num_bits]).unwrap();
|
||||
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for val in vals {
|
||||
bit_packer
|
||||
.write(*val as u64, num_bits, &mut counting_writer)
|
||||
.unwrap();
|
||||
}
|
||||
let compressed_size = counting_writer.written_bytes();
|
||||
assert_eq!(compressed_size, compressed_block_size(num_bits));
|
||||
compressed_size
|
||||
}
|
||||
|
||||
pub struct BlockEncoder {
|
||||
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
pub output_len: usize,
|
||||
input_buffer: [u32; COMPRESSION_BLOCK_SIZE],
|
||||
}
|
||||
|
||||
impl BlockEncoder {
|
||||
pub fn new() -> BlockEncoder {
|
||||
BlockEncoder {
|
||||
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
input_buffer: [0u32; COMPRESSION_BLOCK_SIZE],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
|
||||
self.input_buffer.clone_from_slice(vals);
|
||||
let compressed_size = compress_sorted(&mut self.input_buffer, &mut self.output, offset);
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
let compressed_size = {
|
||||
let output: &mut [u8] = &mut self.output;
|
||||
let max = vals.iter()
|
||||
.cloned()
|
||||
.max()
|
||||
.expect("compress unsorted called with an empty array");
|
||||
let num_bits = compute_num_bits(max as u64);
|
||||
let mut counting_writer = CountingWriter::wrap(output);
|
||||
counting_writer.write_all(&[num_bits]).unwrap();
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for val in vals {
|
||||
bit_packer
|
||||
.write(*val as u64, num_bits, &mut counting_writer)
|
||||
.unwrap();
|
||||
}
|
||||
for _ in vals.len()..COMPRESSION_BLOCK_SIZE {
|
||||
bit_packer
|
||||
.write(vals[0] as u64, num_bits, &mut counting_writer)
|
||||
.unwrap();
|
||||
}
|
||||
bit_packer.flush(&mut counting_writer).expect(
|
||||
"Flushing the bitpacking \
|
||||
in an in RAM buffer should never fail",
|
||||
);
|
||||
// we avoid writing "closing", because we
|
||||
// do not want 7 bytes of padding here.
|
||||
counting_writer.written_bytes()
|
||||
};
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BlockDecoder {
|
||||
pub output: [u32; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
pub output_len: usize,
|
||||
}
|
||||
|
||||
impl BlockDecoder {
|
||||
pub fn new() -> BlockDecoder {
|
||||
BlockDecoder::with_val(0u32)
|
||||
}
|
||||
|
||||
pub fn with_val(val: u32) -> BlockDecoder {
|
||||
BlockDecoder {
|
||||
output: [val; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_block_sorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
mut offset: u32,
|
||||
) -> usize {
|
||||
let consumed_size = {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits);
|
||||
for i in 0..COMPRESSION_BLOCK_SIZE {
|
||||
let delta = bit_unpacker.get(i);
|
||||
let val = offset + delta as u32;
|
||||
self.output[i] = val;
|
||||
offset = val;
|
||||
}
|
||||
compressed_block_size(num_bits)
|
||||
};
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
|
||||
let num_bits = compressed_data[0];
|
||||
let bit_unpacker = BitUnpacker::new(&compressed_data[1..], num_bits);
|
||||
for i in 0..COMPRESSION_BLOCK_SIZE {
|
||||
self.output[i] = bit_unpacker.get(i) as u32;
|
||||
}
|
||||
let consumed_size = 1 + (num_bits as usize * COMPRESSION_BLOCK_SIZE + 7) / 8;
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output_array(&self) -> &[u32] {
|
||||
&self.output[..self.output_len]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output(&self, idx: usize) -> u32 {
|
||||
self.output[idx]
|
||||
}
|
||||
}
|
||||
@@ -1,28 +1,22 @@
|
||||
use super::NUM_DOCS_PER_BLOCK;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
|
||||
|
||||
mod simdcomp {
|
||||
use libc::size_t;
|
||||
|
||||
extern {
|
||||
pub fn compress_sorted(
|
||||
data: *const u32,
|
||||
output: *mut u8,
|
||||
offset: u32) -> size_t;
|
||||
extern "C" {
|
||||
pub fn compress_sorted(data: *const u32, output: *mut u8, offset: u32) -> size_t;
|
||||
|
||||
pub fn uncompress_sorted(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
offset: u32) -> size_t;
|
||||
|
||||
pub fn compress_unsorted(
|
||||
data: *const u32,
|
||||
output: *mut u8) -> size_t;
|
||||
offset: u32,
|
||||
) -> size_t;
|
||||
|
||||
pub fn uncompress_unsorted(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32) -> size_t;
|
||||
pub fn compress_unsorted(data: *const u32, output: *mut u8) -> size_t;
|
||||
|
||||
pub fn uncompress_unsorted(compressed_data: *const u8, output: *mut u32) -> size_t;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,31 +36,28 @@ fn uncompress_unsorted(compressed_data: &[u8], output: &mut [u32]) -> usize {
|
||||
unsafe { simdcomp::uncompress_unsorted(compressed_data.as_ptr(), output.as_mut_ptr()) }
|
||||
}
|
||||
|
||||
|
||||
pub struct BlockEncoder {
|
||||
pub output: [u8; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
pub output_len: usize,
|
||||
}
|
||||
|
||||
impl BlockEncoder {
|
||||
|
||||
pub fn new() -> BlockEncoder {
|
||||
BlockEncoder {
|
||||
output: [0u8; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn compress_block_sorted(&mut self, vals: &[u32], offset: u32) -> &[u8] {
|
||||
let compressed_size = compress_sorted(vals, &mut self.output, offset);
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
|
||||
|
||||
pub fn compress_block_unsorted(&mut self, vals: &[u32]) -> &[u8] {
|
||||
let compressed_size = compress_unsorted(vals, &mut self.output);
|
||||
&self.output[..compressed_size]
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub struct BlockDecoder {
|
||||
@@ -74,40 +65,52 @@ pub struct BlockDecoder {
|
||||
pub output_len: usize,
|
||||
}
|
||||
|
||||
|
||||
impl BlockDecoder {
|
||||
pub fn new() -> BlockDecoder {
|
||||
BlockDecoder::with_val(0u32)
|
||||
}
|
||||
|
||||
|
||||
pub fn with_val(val: u32) -> BlockDecoder {
|
||||
BlockDecoder {
|
||||
output: [val; COMPRESSED_BLOCK_MAX_SIZE],
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn uncompress_block_sorted<'a>(&mut self, compressed_data: &'a [u8], offset: u32) -> &'a[u8] {
|
||||
|
||||
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
|
||||
let consumed_size = uncompress_sorted(compressed_data, &mut self.output, offset);
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> &'a[u8] {
|
||||
|
||||
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
|
||||
let consumed_size = uncompress_unsorted(compressed_data, &mut self.output);
|
||||
self.output_len = NUM_DOCS_PER_BLOCK;
|
||||
&compressed_data[consumed_size..]
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
consumed_size
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
pub fn output_array(&self,) -> &[u32] {
|
||||
pub fn output_array(&self) -> &[u32] {
|
||||
&self.output[..self.output_len]
|
||||
}
|
||||
|
||||
|
||||
#[inline]
|
||||
pub fn output(&self, idx: usize) -> u32 {
|
||||
self.output[idx]
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::BlockEncoder;
|
||||
|
||||
#[test]
|
||||
fn test_all_docs_compression_len() {
|
||||
let data: Vec<u32> = (0u32..128u32).collect();
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let compressed = encoder.compress_block_sorted(&data, 0u32);
|
||||
assert_eq!(compressed.len(), 17);
|
||||
}
|
||||
|
||||
}
|
||||
130
src/compression/stream.rs
Normal file
130
src/compression/stream.rs
Normal file
@@ -0,0 +1,130 @@
|
||||
use compression::BlockDecoder;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use compression::compressed_block_size;
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
|
||||
/// Reads a stream of compressed ints.
|
||||
///
|
||||
/// Tantivy uses `CompressedIntStream` to read
|
||||
/// the position file.
|
||||
/// The `.skip(...)` makes it possible to avoid
|
||||
/// decompressing blocks that are not required.
|
||||
pub struct CompressedIntStream {
|
||||
buffer: SourceRead,
|
||||
block_decoder: BlockDecoder,
|
||||
inner_offset: usize,
|
||||
}
|
||||
|
||||
impl CompressedIntStream {
|
||||
/// Opens a compressed int stream.
|
||||
pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
|
||||
CompressedIntStream {
|
||||
buffer: SourceRead::from(source),
|
||||
block_decoder: BlockDecoder::new(),
|
||||
inner_offset: COMPRESSION_BLOCK_SIZE,
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills a buffer with the next `output.len()` integers,
|
||||
/// and advance the stream by that many els.
|
||||
pub fn read(&mut self, output: &mut [u32]) {
|
||||
let mut num_els: usize = output.len();
|
||||
let mut start: usize = 0;
|
||||
loop {
|
||||
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
|
||||
if num_els >= available {
|
||||
if available > 0 {
|
||||
let uncompressed_block =
|
||||
&self.block_decoder.output_array()[self.inner_offset..];
|
||||
output[start..][..available].clone_from_slice(uncompressed_block);
|
||||
}
|
||||
num_els -= available;
|
||||
start += available;
|
||||
let num_consumed_bytes = self.block_decoder
|
||||
.uncompress_block_unsorted(self.buffer.as_ref());
|
||||
self.buffer.advance(num_consumed_bytes);
|
||||
self.inner_offset = 0;
|
||||
} else {
|
||||
let uncompressed_block = &self.block_decoder.output_array()
|
||||
[self.inner_offset..self.inner_offset + num_els];
|
||||
output[start..][..num_els].clone_from_slice(uncompressed_block);
|
||||
self.inner_offset += num_els;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Skip the next `skip_len` integer.
|
||||
///
|
||||
/// If a full block is skipped, calling
|
||||
/// `.skip(...)` will avoid decompressing it.
|
||||
pub fn skip(&mut self, mut skip_len: usize) {
|
||||
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
|
||||
if available >= skip_len {
|
||||
self.inner_offset += skip_len;
|
||||
} else {
|
||||
skip_len -= available;
|
||||
// entirely skip decompressing some blocks.
|
||||
while skip_len >= COMPRESSION_BLOCK_SIZE {
|
||||
skip_len -= COMPRESSION_BLOCK_SIZE;
|
||||
let num_bits: u8 = self.buffer.as_ref()[0];
|
||||
let block_len = compressed_block_size(num_bits);
|
||||
self.buffer.advance(block_len);
|
||||
}
|
||||
let num_consumed_bytes = self.block_decoder
|
||||
.uncompress_block_unsorted(self.buffer.as_ref());
|
||||
self.buffer.advance(num_consumed_bytes);
|
||||
self.inner_offset = skip_len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use super::CompressedIntStream;
|
||||
use compression::compressed_block_size;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use compression::BlockEncoder;
|
||||
use directory::ReadOnlySource;
|
||||
|
||||
fn create_stream_buffer() -> ReadOnlySource {
|
||||
let mut buffer: Vec<u8> = vec![];
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let vals: Vec<u32> = (0u32..1_025u32).collect();
|
||||
for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) {
|
||||
let compressed_block = encoder.compress_block_unsorted(chunk);
|
||||
let num_bits = compressed_block[0];
|
||||
assert_eq!(compressed_block_size(num_bits), compressed_block.len());
|
||||
buffer.extend_from_slice(compressed_block);
|
||||
}
|
||||
if cfg!(simd) {
|
||||
buffer.extend_from_slice(&[0u8; 7]);
|
||||
}
|
||||
ReadOnlySource::from(buffer)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compressed_int_stream() {
|
||||
let buffer = create_stream_buffer();
|
||||
let mut stream = CompressedIntStream::wrap(buffer);
|
||||
let mut block: [u32; COMPRESSION_BLOCK_SIZE] = [0u32; COMPRESSION_BLOCK_SIZE];
|
||||
|
||||
stream.read(&mut block[0..2]);
|
||||
assert_eq!(block[0], 0);
|
||||
assert_eq!(block[1], 1);
|
||||
stream.skip(5);
|
||||
stream.read(&mut block[0..3]);
|
||||
assert_eq!(block[0], 7);
|
||||
assert_eq!(block[1], 8);
|
||||
assert_eq!(block[2], 9);
|
||||
stream.skip(500);
|
||||
stream.read(&mut block[0..3]);
|
||||
assert_eq!(block[0], 510);
|
||||
assert_eq!(block[1], 511);
|
||||
assert_eq!(block[2], 512);
|
||||
stream.skip(511);
|
||||
stream.read(&mut block[..1]);
|
||||
assert_eq!(block[0], 1024);
|
||||
}
|
||||
}
|
||||
92
src/compression/vint/compression_vint_nosimd.rs
Normal file
92
src/compression/vint/compression_vint_nosimd.rs
Normal file
@@ -0,0 +1,92 @@
|
||||
#[inline(always)]
|
||||
pub(crate) fn compress_sorted<'a>(
|
||||
input: &[u32],
|
||||
output: &'a mut [u8],
|
||||
mut offset: u32,
|
||||
) -> &'a [u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v - offset;
|
||||
offset = v;
|
||||
loop {
|
||||
let next_byte: u8 = (to_encode % 128u32) as u8;
|
||||
to_encode /= 128u32;
|
||||
if to_encode == 0u32 {
|
||||
output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
} else {
|
||||
output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
&output[..byte_written]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v;
|
||||
loop {
|
||||
let next_byte: u8 = (to_encode % 128u32) as u8;
|
||||
to_encode /= 128u32;
|
||||
if to_encode == 0u32 {
|
||||
output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
} else {
|
||||
output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
&output[..byte_written]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32,
|
||||
) -> usize {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
let num_els = output.len();
|
||||
for i in 0..num_els {
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
result += ((cur_byte % 128u8) as u32) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
output[i] = result;
|
||||
}
|
||||
read_byte
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
|
||||
let mut read_byte = 0;
|
||||
let num_els = output.len();
|
||||
for i in 0..num_els {
|
||||
let mut result = 0u32;
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
result += ((cur_byte % 128u8) as u32) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
output[i] = result;
|
||||
}
|
||||
read_byte
|
||||
}
|
||||
72
src/compression/vint/compression_vint_simd.rs
Normal file
72
src/compression/vint/compression_vint_simd.rs
Normal file
@@ -0,0 +1,72 @@
|
||||
mod streamvbyte {
|
||||
|
||||
use libc::size_t;
|
||||
|
||||
extern "C" {
|
||||
pub fn streamvbyte_delta_encode(
|
||||
data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8,
|
||||
offset: u32,
|
||||
) -> size_t;
|
||||
|
||||
pub fn streamvbyte_delta_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: u32,
|
||||
offset: u32,
|
||||
) -> size_t;
|
||||
|
||||
pub fn streamvbyte_encode(data: *const u32, num_els: u32, output: *mut u8) -> size_t;
|
||||
|
||||
pub fn streamvbyte_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: usize,
|
||||
) -> size_t;
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_delta_encode(
|
||||
input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr(),
|
||||
offset,
|
||||
)
|
||||
};
|
||||
&output[..compress_length]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_encode(input.as_ptr(), input.len() as u32, output.as_mut_ptr())
|
||||
};
|
||||
&output[..compress_length]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32,
|
||||
) -> usize {
|
||||
unsafe {
|
||||
streamvbyte::streamvbyte_delta_decode(
|
||||
compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len() as u32,
|
||||
offset,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
|
||||
unsafe {
|
||||
streamvbyte::streamvbyte_decode(compressed_data.as_ptr(), output.as_mut_ptr(), output.len())
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,10 @@
|
||||
use Result;
|
||||
use Error;
|
||||
use error::{ErrorKind, ResultExt};
|
||||
use serde_json;
|
||||
use schema::Schema;
|
||||
use std::sync::Arc;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
use rustc_serialize::json;
|
||||
use core::SegmentId;
|
||||
use directory::{Directory, MmapDirectory, RAMDirectory};
|
||||
use indexer::index_writer::open_index_writer;
|
||||
@@ -18,29 +18,30 @@ use core::SegmentMeta;
|
||||
use super::pool::LeasedItem;
|
||||
use std::path::Path;
|
||||
use core::IndexMeta;
|
||||
use indexer::DirectoryLock;
|
||||
use IndexWriter;
|
||||
use directory::ManagedDirectory;
|
||||
use core::META_FILEPATH;
|
||||
use super::segment::create_segment;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use tokenizer::TokenizerManager;
|
||||
|
||||
const NUM_SEARCHERS: usize = 12;
|
||||
|
||||
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
|
||||
let meta_data = directory.atomic_read(&META_FILEPATH)?;
|
||||
let meta_string = String::from_utf8_lossy(&meta_data);
|
||||
json::decode(&meta_string)
|
||||
.map_err(|e| Error::CorruptedFile(META_FILEPATH.clone(), Box::new(e)))
|
||||
serde_json::from_str(&meta_string).chain_err(|| ErrorKind::CorruptedFile(META_FILEPATH.clone()))
|
||||
}
|
||||
|
||||
/// Tantivy's Search Index
|
||||
/// Search Index
|
||||
pub struct Index {
|
||||
directory: ManagedDirectory,
|
||||
schema: Schema,
|
||||
searcher_pool: Arc<Pool<Searcher>>,
|
||||
tokenizers: TokenizerManager,
|
||||
}
|
||||
|
||||
|
||||
impl Index {
|
||||
/// Creates a new index using the `RAMDirectory`.
|
||||
///
|
||||
@@ -48,20 +49,29 @@ impl Index {
|
||||
/// This should only be used for unit tests.
|
||||
pub fn create_in_ram(schema: Schema) -> Index {
|
||||
let ram_directory = RAMDirectory::create();
|
||||
let directory = ManagedDirectory::new(ram_directory).expect("Creating a managed directory from a brand new RAM directory should never fail.");
|
||||
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") // unwrap is ok here
|
||||
// unwrap is ok here
|
||||
let directory = ManagedDirectory::new(ram_directory).expect(
|
||||
"Creating a managed directory from a brand new RAM directory \
|
||||
should never fail.",
|
||||
);
|
||||
Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail")
|
||||
}
|
||||
|
||||
/// Creates a new index in a given filepath.
|
||||
/// The index will use the `MMapDirectory`.
|
||||
///
|
||||
/// If a previous index was in this directory, then its meta file will be destroyed.
|
||||
pub fn create(directory_path: &Path, schema: Schema) -> Result<Index> {
|
||||
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
///
|
||||
/// The index will use the `MMapDirectory` in a newly created directory.
|
||||
@@ -77,37 +87,36 @@ impl Index {
|
||||
}
|
||||
|
||||
/// Creates a new index given a directory and an `IndexMeta`.
|
||||
fn create_from_metas(directory: ManagedDirectory, metas: IndexMeta) -> Result<Index> {
|
||||
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
|
||||
let schema = metas.schema.clone();
|
||||
let index = Index {
|
||||
directory: directory,
|
||||
schema: schema,
|
||||
directory,
|
||||
schema,
|
||||
searcher_pool: Arc::new(Pool::new()),
|
||||
tokenizers: TokenizerManager::default(),
|
||||
};
|
||||
try!(index.load_searchers());
|
||||
index.load_searchers()?;
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
Index::create_from_metas(directory, IndexMeta::with_schema(schema))
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
pub fn open(directory_path: &Path) -> Result<Index> {
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
let directory = ManagedDirectory::new(mmap_directory)?;
|
||||
let metas = try!(load_metas(&directory));
|
||||
Index::create_from_metas(directory, metas)
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
|
||||
/// Returns the index opstamp.
|
||||
///
|
||||
/// The opstamp is the number of documents that have been added
|
||||
/// from the beginning of time, and until the moment of the last commit.
|
||||
pub fn opstamp(&self) -> u64 {
|
||||
load_metas(self.directory()).unwrap().opstamp
|
||||
/// Reads the index meta file from the directory.
|
||||
pub fn load_metas(&self) -> Result<IndexMeta> {
|
||||
load_metas(self.directory())
|
||||
}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
@@ -125,14 +134,15 @@ impl Index {
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn writer_with_num_threads(&self,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes: usize)
|
||||
-> Result<IndexWriter> {
|
||||
open_index_writer(self, num_threads, heap_size_in_bytes)
|
||||
pub fn writer_with_num_threads(
|
||||
&self,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
|
||||
open_index_writer(self, num_threads, heap_size_in_bytes, directory_lock)
|
||||
}
|
||||
|
||||
|
||||
/// Creates a multithreaded writer
|
||||
/// It just calls `writer_with_num_threads` with the number of cores as `num_threads`
|
||||
///
|
||||
@@ -153,8 +163,7 @@ impl Index {
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
@@ -184,28 +193,29 @@ impl Index {
|
||||
/// Reads the meta.json and returns the list of
|
||||
/// `SegmentMeta` from the last commit.
|
||||
pub fn searchable_segment_metas(&self) -> Result<Vec<SegmentMeta>> {
|
||||
Ok(load_metas(self.directory())?.segments)
|
||||
Ok(self.load_metas()?.segments)
|
||||
}
|
||||
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Creates a new generation of searchers after
|
||||
|
||||
/// a change of the set of searchable indexes.
|
||||
///
|
||||
/// This needs to be called when a new segment has been
|
||||
/// published or after a merge.
|
||||
pub fn load_searchers(&self) -> Result<()> {
|
||||
let searchable_segments = self.searchable_segments()?;
|
||||
let segment_readers: Vec<SegmentReader> = try!(searchable_segments
|
||||
.into_iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect());
|
||||
let segment_readers: Vec<SegmentReader> = searchable_segments
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<Result<_>>()?;
|
||||
let searchers = (0..NUM_SEARCHERS)
|
||||
.map(|_| Searcher::from(segment_readers.clone()))
|
||||
.collect();
|
||||
@@ -228,7 +238,6 @@ impl Index {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl fmt::Debug for Index {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "Index({:?})", self.directory)
|
||||
@@ -240,7 +249,8 @@ impl Clone for Index {
|
||||
Index {
|
||||
directory: self.directory.clone(),
|
||||
schema: self.schema.clone(),
|
||||
searcher_pool: self.searcher_pool.clone(),
|
||||
searcher_pool: Arc::clone(&self.searcher_pool),
|
||||
tokenizers: self.tokenizers.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,27 +1,68 @@
|
||||
use schema::Schema;
|
||||
use core::SegmentMeta;
|
||||
use std::fmt;
|
||||
use serde_json;
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
///
|
||||
/// This object is serialized on disk in the `meta.json` file.
|
||||
/// It keeps information about
|
||||
/// It keeps information about
|
||||
/// * the searchable segments,
|
||||
/// * the index docstamp
|
||||
/// * the index `docstamp`
|
||||
/// * the schema
|
||||
///
|
||||
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct IndexMeta {
|
||||
pub segments: Vec<SegmentMeta>,
|
||||
pub schema: Schema,
|
||||
pub opstamp: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub payload: Option<String>,
|
||||
}
|
||||
|
||||
impl IndexMeta {
|
||||
pub fn with_schema(schema: Schema) -> IndexMeta {
|
||||
IndexMeta {
|
||||
segments: vec!(),
|
||||
schema: schema,
|
||||
segments: vec![],
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
payload: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for IndexMeta {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}",
|
||||
serde_json::ser::to_string(self)
|
||||
.expect("JSON serialization for IndexMeta should never fail.")
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use serde_json;
|
||||
use super::IndexMeta;
|
||||
use schema::{SchemaBuilder, TEXT};
|
||||
|
||||
#[test]
|
||||
fn test_serialize_metas() {
|
||||
let schema = {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
schema_builder.add_text_field("text", TEXT);
|
||||
schema_builder.build()
|
||||
};
|
||||
let index_metas = IndexMeta {
|
||||
segments: Vec::new(),
|
||||
schema: schema,
|
||||
opstamp: 0u64,
|
||||
payload: None,
|
||||
};
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(json, r#"{"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#);
|
||||
}
|
||||
}
|
||||
|
||||
166
src/core/inverted_index_reader.rs
Normal file
166
src/core/inverted_index_reader.rs
Normal file
@@ -0,0 +1,166 @@
|
||||
use directory::{ReadOnlySource, SourceRead};
|
||||
use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
use postings::{BlockSegmentPostings, SegmentPostings};
|
||||
use postings::TermInfo;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use fastfield::DeleteBitSet;
|
||||
use compression::CompressedIntStream;
|
||||
use postings::FreqReadingOption;
|
||||
use schema::FieldType;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
///
|
||||
/// # Note
|
||||
///
|
||||
/// It is safe to delete the segment associated to
|
||||
/// an `InvertedIndexReader`. As long as it is open,
|
||||
/// the `ReadOnlySource` it is relying on should
|
||||
/// stay available.
|
||||
///
|
||||
///
|
||||
/// `InvertedIndexReader` are created by calling
|
||||
/// the `SegmentReader`'s [`.inverted_index(...)`] method
|
||||
pub struct InvertedIndexReader {
|
||||
termdict: TermDictionaryImpl,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
record_option: IndexRecordOption,
|
||||
}
|
||||
|
||||
impl InvertedIndexReader {
|
||||
pub(crate) fn new(
|
||||
termdict: TermDictionaryImpl,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
delete_bitset: DeleteBitSet,
|
||||
record_option: IndexRecordOption,
|
||||
) -> InvertedIndexReader {
|
||||
InvertedIndexReader {
|
||||
termdict,
|
||||
postings_source,
|
||||
positions_source,
|
||||
delete_bitset,
|
||||
record_option,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates an empty `InvertedIndexReader` object, which
|
||||
/// contains no terms at all.
|
||||
pub fn empty(field_type: FieldType) -> InvertedIndexReader {
|
||||
let record_option = field_type
|
||||
.get_index_record_option()
|
||||
.unwrap_or(IndexRecordOption::Basic);
|
||||
InvertedIndexReader::new(
|
||||
TermDictionaryImpl::empty(field_type),
|
||||
ReadOnlySource::empty(),
|
||||
ReadOnlySource::empty(),
|
||||
DeleteBitSet::empty(),
|
||||
record_option,
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns the term info associated with the term.
|
||||
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
|
||||
self.termdict.get(term.value_bytes())
|
||||
}
|
||||
|
||||
/// Return the term dictionary datastructure.
|
||||
pub fn terms(&self) -> &TermDictionaryImpl {
|
||||
&self.termdict
|
||||
}
|
||||
|
||||
/// Resets the block segment to another position of the postings
|
||||
/// file.
|
||||
///
|
||||
/// This is useful for enumerating through a list of terms,
|
||||
/// and consuming the associated posting lists while avoiding
|
||||
/// reallocating a `BlockSegmentPostings`.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This does not reset the positions list.
|
||||
pub fn reset_block_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
block_postings: &mut BlockSegmentPostings,
|
||||
) {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let end_source = self.postings_source.len();
|
||||
let postings_slice = self.postings_source.slice(offset, end_source);
|
||||
let postings_reader = SourceRead::from(postings_slice);
|
||||
block_postings.reset(term_info.doc_freq as usize, postings_reader);
|
||||
}
|
||||
|
||||
/// Returns a block postings given a `term_info`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most user should prefer using `read_postings` instead.
|
||||
pub fn read_block_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
requested_option: IndexRecordOption,
|
||||
) -> BlockSegmentPostings {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = self.postings_source.slice_from(offset);
|
||||
let freq_reading_option = match (self.record_option, requested_option) {
|
||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
|
||||
(_, _) => FreqReadingOption::ReadFreq,
|
||||
};
|
||||
BlockSegmentPostings::from_data(
|
||||
term_info.doc_freq as usize,
|
||||
SourceRead::from(postings_data),
|
||||
freq_reading_option,
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns a posting object given a `term_info`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most user should prefer using `read_postings` instead.
|
||||
pub fn read_postings_from_terminfo(
|
||||
&self,
|
||||
term_info: &TermInfo,
|
||||
option: IndexRecordOption,
|
||||
) -> SegmentPostings {
|
||||
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
|
||||
let delete_bitset = self.delete_bitset.clone();
|
||||
let position_stream = {
|
||||
if option.has_positions() {
|
||||
let position_offset = term_info.positions_offset;
|
||||
let positions_source = self.positions_source.slice_from(position_offset as usize);
|
||||
let mut stream = CompressedIntStream::wrap(positions_source);
|
||||
stream.skip(term_info.positions_inner_offset as usize);
|
||||
Some(stream)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
SegmentPostings::from_block_postings(block_postings, delete_bitset, position_stream)
|
||||
}
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encountered and indexed.
|
||||
///
|
||||
/// If the field was not indexed with the indexing options that cover
|
||||
/// the requested options, the returned `SegmentPostings` the method does not fail
|
||||
/// and returns a `SegmentPostings` with as much information as possible.
|
||||
///
|
||||
/// For instance, requesting `IndexRecordOption::Freq` for a
|
||||
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
|
||||
/// with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
self.get_term_info(term)
|
||||
.map(|term_info| term_info.doc_freq)
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
}
|
||||
@@ -7,8 +7,9 @@ mod segment;
|
||||
mod index_meta;
|
||||
mod pool;
|
||||
mod segment_meta;
|
||||
mod term_iterator;
|
||||
mod inverted_index_reader;
|
||||
|
||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||
pub use self::searcher::Searcher;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
@@ -18,8 +19,6 @@ pub use self::segment::SerializableSegment;
|
||||
pub use self::index::Index;
|
||||
pub use self::segment_meta::SegmentMeta;
|
||||
pub use self::index_meta::IndexMeta;
|
||||
pub use self::term_iterator::TermIterator;
|
||||
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -27,7 +26,7 @@ lazy_static! {
|
||||
/// The meta file contains all the information about the list of segments and the schema
|
||||
/// of the index.
|
||||
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");
|
||||
|
||||
|
||||
/// The managed file contains a list of files that were created by the tantivy
|
||||
/// and will therefore be garbage collected when they are deemed useless by tantivy.
|
||||
///
|
||||
@@ -40,4 +39,4 @@ lazy_static! {
|
||||
///
|
||||
/// If the process is killed and this file remains, it is safe to remove it manually.
|
||||
pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,10 +17,10 @@ pub struct Pool<T> {
|
||||
}
|
||||
|
||||
impl<T> Pool<T> {
|
||||
|
||||
pub fn new() -> Pool<T> {
|
||||
let queue = Arc::new(MsQueue::new());
|
||||
Pool {
|
||||
queue: Arc::new(MsQueue::new()),
|
||||
queue,
|
||||
freshest_generation: AtomicUsize::default(),
|
||||
next_generation: AtomicUsize::default(),
|
||||
}
|
||||
@@ -30,52 +30,52 @@ impl<T> Pool<T> {
|
||||
let next_generation = self.next_generation.fetch_add(1, Ordering::SeqCst) + 1;
|
||||
for item in items {
|
||||
let gen_item = GenerationItem {
|
||||
item: item,
|
||||
item,
|
||||
generation: next_generation,
|
||||
};
|
||||
self.queue.push(gen_item);
|
||||
}
|
||||
self.advertise_generation(next_generation);
|
||||
}
|
||||
|
||||
/// At the exit of this method,
|
||||
|
||||
/// At the exit of this method,
|
||||
/// - freshest_generation has a value greater or equal than generation
|
||||
/// - freshest_generation has a value that has been advertised
|
||||
/// - freshest_generation has
|
||||
/// - freshest_generation has)
|
||||
fn advertise_generation(&self, generation: usize) {
|
||||
// not optimal at all but the easiest to read proof.
|
||||
// not optimal at all but the easiest to read proof.
|
||||
loop {
|
||||
let former_generation = self.freshest_generation.load(Ordering::Acquire);
|
||||
if former_generation >= generation {
|
||||
break;
|
||||
}
|
||||
self.freshest_generation.compare_and_swap(former_generation, generation, Ordering::SeqCst);
|
||||
}
|
||||
self.freshest_generation.compare_and_swap(
|
||||
former_generation,
|
||||
generation,
|
||||
Ordering::SeqCst,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn generation(&self,) -> usize {
|
||||
|
||||
fn generation(&self) -> usize {
|
||||
self.freshest_generation.load(Ordering::Acquire)
|
||||
}
|
||||
|
||||
pub fn acquire(&self,) -> LeasedItem<T> {
|
||||
pub fn acquire(&self) -> LeasedItem<T> {
|
||||
let generation = self.generation();
|
||||
loop {
|
||||
let gen_item = self.queue.pop();
|
||||
if gen_item.generation >= generation {
|
||||
return LeasedItem {
|
||||
gen_item: Some(gen_item),
|
||||
recycle_queue: self.queue.clone(),
|
||||
}
|
||||
}
|
||||
else {
|
||||
recycle_queue: Arc::clone(&self.queue),
|
||||
};
|
||||
} else {
|
||||
// this searcher is obsolete,
|
||||
// removing it from the pool.
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
pub struct LeasedItem<T> {
|
||||
@@ -84,29 +84,33 @@ pub struct LeasedItem<T> {
|
||||
}
|
||||
|
||||
impl<T> Deref for LeasedItem<T> {
|
||||
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
&self.gen_item.as_ref().expect("Unwrapping a leased item should never fail").item // unwrap is safe here
|
||||
&self.gen_item
|
||||
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DerefMut for LeasedItem<T> {
|
||||
fn deref_mut(&mut self) -> &mut T {
|
||||
&mut self.gen_item.as_mut().expect("Unwrapping a mut leased item should never fail").item // unwrap is safe here
|
||||
&mut self.gen_item
|
||||
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Drop for LeasedItem<T> {
|
||||
fn drop(&mut self) {
|
||||
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None).expect("Unwrapping a leased item should never fail");
|
||||
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
|
||||
.expect("Unwrapping a leased item should never fail");
|
||||
self.recycle_queue.push(gen_item);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -127,4 +131,4 @@ mod tests {
|
||||
assert_eq!(*pool.acquire(), 11);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,83 +6,97 @@ use common::TimerTree;
|
||||
use query::Query;
|
||||
use DocId;
|
||||
use DocAddress;
|
||||
use schema::Term;
|
||||
use core::TermIterator;
|
||||
use schema::{Field, Term};
|
||||
use termdict::{TermDictionary, TermMerger};
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
|
||||
use core::InvertedIndexReader;
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
/// It guarantees that the `Segment` will not be removed before
|
||||
/// It guarantees that the `Segment` will not be removed before
|
||||
/// the destruction of the `Searcher`.
|
||||
///
|
||||
///
|
||||
pub struct Searcher {
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
}
|
||||
|
||||
|
||||
impl Searcher {
|
||||
|
||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||
///
|
||||
/// The searcher uses the segment ordinal to route the
|
||||
/// the request to the right `Segment`.
|
||||
/// the request to the right `Segment`.
|
||||
pub fn doc(&self, doc_address: &DocAddress) -> Result<Document> {
|
||||
let DocAddress(segment_local_id, doc_id) = *doc_address;
|
||||
let segment_reader = &self.segment_readers[segment_local_id as usize];
|
||||
segment_reader.doc(doc_id)
|
||||
}
|
||||
|
||||
|
||||
/// Returns the overall number of documents in the index.
|
||||
pub fn num_docs(&self,) -> DocId {
|
||||
pub fn num_docs(&self) -> DocId {
|
||||
self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.num_docs())
|
||||
.fold(0u32, |acc, val| acc + val)
|
||||
.sum::<u32>()
|
||||
}
|
||||
|
||||
|
||||
/// Return the overall number of documents containing
|
||||
/// the given term.
|
||||
/// the given term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.doc_freq(term))
|
||||
.fold(0u32, |acc, val| acc + val)
|
||||
}
|
||||
|
||||
/// Returns a Stream over all of the sorted unique terms of
|
||||
/// the searcher.
|
||||
///
|
||||
/// This includes all of the fields from all of the segment_readers.
|
||||
/// See [TermIterator](struct.TermIterator.html).
|
||||
///
|
||||
/// # Warning
|
||||
/// This API is very likely to change in the future.
|
||||
pub fn terms<'a>(&'a self) -> TermIterator<'a> {
|
||||
TermIterator::from(self.segment_readers())
|
||||
.map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term))
|
||||
.sum::<u32>()
|
||||
}
|
||||
|
||||
/// Return the list of segment readers
|
||||
pub fn segment_readers(&self,) -> &[SegmentReader] {
|
||||
pub fn segment_readers(&self) -> &[SegmentReader] {
|
||||
&self.segment_readers
|
||||
}
|
||||
|
||||
|
||||
/// Returns the segment_reader associated with the given segment_ordinal
|
||||
pub fn segment_reader(&self, segment_ord: usize) -> &SegmentReader {
|
||||
&self.segment_readers[segment_ord]
|
||||
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
|
||||
&self.segment_readers[segment_ord as usize]
|
||||
}
|
||||
|
||||
|
||||
/// Runs a query on the segment readers wrapped by the searcher
|
||||
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<TimerTree> {
|
||||
query.search(self, collector)
|
||||
}
|
||||
|
||||
/// Return the field searcher associated to a `Field`.
|
||||
pub fn field(&self, field: Field) -> FieldSearcher {
|
||||
let inv_index_readers = self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.inverted_index(field))
|
||||
.collect::<Vec<_>>();
|
||||
FieldSearcher::new(inv_index_readers)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FieldSearcher {
|
||||
inv_index_readers: Vec<Arc<InvertedIndexReader>>,
|
||||
}
|
||||
|
||||
impl FieldSearcher {
|
||||
fn new(inv_index_readers: Vec<Arc<InvertedIndexReader>>) -> FieldSearcher {
|
||||
FieldSearcher { inv_index_readers }
|
||||
}
|
||||
|
||||
/// Returns a Stream over all of the sorted unique terms of
|
||||
/// for the given field.
|
||||
pub fn terms(&self) -> TermMerger {
|
||||
let term_streamers: Vec<_> = self.inv_index_readers
|
||||
.iter()
|
||||
.map(|inverted_index| inverted_index.terms().stream())
|
||||
.collect();
|
||||
TermMerger::new(term_streamers)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<SegmentReader>> for Searcher {
|
||||
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
Searcher {
|
||||
segment_readers: segment_readers,
|
||||
}
|
||||
Searcher { segment_readers }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,4 +108,4 @@ impl fmt::Debug for Searcher {
|
||||
.collect::<Vec<_>>();
|
||||
write!(f, "Searcher({:?})", segment_ids)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::path::PathBuf;
|
||||
use schema::Schema;
|
||||
use std::fmt;
|
||||
use core::SegmentId;
|
||||
use directory::{ReadOnlySource, WritePtr, FileProtection};
|
||||
use directory::{FileProtection, ReadOnlySource, WritePtr};
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use super::SegmentComponent;
|
||||
use core::Index;
|
||||
@@ -26,19 +26,20 @@ impl fmt::Debug for Segment {
|
||||
}
|
||||
|
||||
/// Creates a new segment given an `Index` and a `SegmentId`
|
||||
///
|
||||
/// The function is here to make it private outside `tantivy`.
|
||||
///
|
||||
/// The function is here to make it private outside `tantivy`.
|
||||
pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
|
||||
Segment {
|
||||
index: index,
|
||||
meta: meta,
|
||||
}
|
||||
Segment { index, meta }
|
||||
}
|
||||
|
||||
impl Segment {
|
||||
|
||||
/// Returns the index the segment belongs to.
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
}
|
||||
|
||||
/// Returns our index's schema.
|
||||
pub fn schema(&self,) -> Schema {
|
||||
pub fn schema(&self) -> Schema {
|
||||
self.index.schema()
|
||||
}
|
||||
|
||||
@@ -53,19 +54,18 @@ impl Segment {
|
||||
}
|
||||
|
||||
/// Returns the segment's id.
|
||||
pub fn id(&self,) -> SegmentId {
|
||||
pub fn id(&self) -> SegmentId {
|
||||
self.meta.id()
|
||||
}
|
||||
|
||||
/// Returns the relative path of a component of our segment.
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
/// associated to a segment component.
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
self.meta.relative_path(component)
|
||||
}
|
||||
|
||||
|
||||
/// Protects a specific component file from being deleted.
|
||||
///
|
||||
/// Returns a FileProtection object. The file is guaranteed
|
||||
@@ -77,16 +77,22 @@ impl Segment {
|
||||
}
|
||||
|
||||
/// Open one of the component file for a *regular* read.
|
||||
pub fn open_read(&self, component: SegmentComponent) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
pub fn open_read(
|
||||
&self,
|
||||
component: SegmentComponent,
|
||||
) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
let path = self.relative_path(component);
|
||||
let source = try!(self.index.directory().open_read(&path));
|
||||
let source = self.index.directory().open_read(&path)?;
|
||||
Ok(source)
|
||||
}
|
||||
|
||||
/// Open one of the component file for *regular* write.
|
||||
pub fn open_write(&mut self, component: SegmentComponent) -> result::Result<WritePtr, OpenWriteError> {
|
||||
pub fn open_write(
|
||||
&mut self,
|
||||
component: SegmentComponent,
|
||||
) -> result::Result<WritePtr, OpenWriteError> {
|
||||
let path = self.relative_path(component);
|
||||
let write = try!(self.index.directory_mut().open_write(&path));
|
||||
let write = self.index.directory_mut().open_write(&path)?;
|
||||
Ok(write)
|
||||
}
|
||||
}
|
||||
@@ -114,20 +120,20 @@ mod tests {
|
||||
let mut index = Index::create_in_ram(SchemaBuilder::new().build());
|
||||
let segment = index.new_segment();
|
||||
let path = segment.relative_path(SegmentComponent::POSTINGS);
|
||||
|
||||
|
||||
let directory = index.directory_mut();
|
||||
directory.atomic_write(&*path, &vec!(0u8)).unwrap();
|
||||
|
||||
directory.atomic_write(&*path, &vec![0u8]).unwrap();
|
||||
|
||||
let living_files = HashSet::new();
|
||||
{
|
||||
let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
|
||||
assert!(directory.exists(&*path));
|
||||
directory.garbage_collect(living_files.clone());
|
||||
directory.garbage_collect(|| living_files.clone());
|
||||
assert!(directory.exists(&*path));
|
||||
}
|
||||
|
||||
directory.garbage_collect(living_files);
|
||||
directory.garbage_collect(|| living_files);
|
||||
assert!(!directory.exists(&*path));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,27 +1,41 @@
|
||||
/// Enum describing each component of a tantivy segment.
|
||||
/// Each component is stored in its own file,
|
||||
/// using the pattern `segment_uuid`.`component_extension`,
|
||||
/// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension`
|
||||
#[derive(Copy, Clone)]
|
||||
pub enum SegmentComponent {
|
||||
/// Postings (or inverted list). Sorted lists of document ids, associated to terms
|
||||
POSTINGS,
|
||||
/// Positions of terms in each document.
|
||||
POSITIONS,
|
||||
/// Column-oriented random-access storage of fields.
|
||||
FASTFIELDS,
|
||||
/// Stores the sum of the length (in terms) of each field for each document.
|
||||
/// Field norms are stored as a special u64 fast field.
|
||||
FIELDNORMS,
|
||||
/// Dictionary associating `Term`s to `TermInfo`s which is
|
||||
/// simply an address into the `postings` file and the `positions` file.
|
||||
TERMS,
|
||||
/// Row-oriented, LZ4-compressed storage of the documents.
|
||||
/// Accessing a document from the store is relatively slow, as it
|
||||
/// requires to decompress the entire block it belongs to.
|
||||
STORE,
|
||||
DELETE
|
||||
/// Bitset describing which document of the segment is deleted.
|
||||
DELETE,
|
||||
}
|
||||
|
||||
impl SegmentComponent {
|
||||
|
||||
pub fn iterator() -> impl Iterator<Item=&'static SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
/// Iterates through the components.
|
||||
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
SegmentComponent::FASTFIELDS,
|
||||
SegmentComponent::FIELDNORMS,
|
||||
SegmentComponent::TERMS,
|
||||
SegmentComponent::STORE,
|
||||
SegmentComponent::DELETE
|
||||
SegmentComponent::DELETE,
|
||||
];
|
||||
SEGMENT_COMPONENTS.into_iter()
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,30 +1,27 @@
|
||||
use uuid::Uuid;
|
||||
use std::fmt;
|
||||
use rustc_serialize::{Encoder, Decoder, Encodable, Decodable};
|
||||
use std::cmp::{Ordering, Ord};
|
||||
use std::cmp::{Ord, Ordering};
|
||||
|
||||
#[cfg(test)]
|
||||
use std::sync::atomic;
|
||||
|
||||
/// Tantivy SegmentId.
|
||||
/// Uuid identifying a segment.
|
||||
///
|
||||
/// Tantivy's segment are identified
|
||||
/// Tantivy's segment are identified
|
||||
/// by a UUID which is used to prefix the filenames
|
||||
/// of all of the file associated with the segment.
|
||||
///
|
||||
/// In unit test, for reproducability, the SegmentId are
|
||||
/// In unit test, for reproducability, the `SegmentId` are
|
||||
/// simply generated in an autoincrement fashion.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct SegmentId(Uuid);
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
lazy_static! {
|
||||
static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default();
|
||||
static ref EMPTY_ARR: [u8; 8] = [0u8; 8];
|
||||
}
|
||||
|
||||
|
||||
// During tests, we generate the segment id in a autoincrement manner
|
||||
// for consistency of segment id between run.
|
||||
//
|
||||
@@ -47,43 +44,29 @@ impl SegmentId {
|
||||
SegmentId(create_uuid())
|
||||
}
|
||||
|
||||
|
||||
/// Returns a shorter identifier of the segment.
|
||||
///
|
||||
/// We are using UUID4, so only 6 bits are fixed,
|
||||
/// and the rest is random.
|
||||
///
|
||||
/// Picking the first 8 chars is ok to identify
|
||||
/// Picking the first 8 chars is ok to identify
|
||||
/// segments in a display message.
|
||||
pub fn short_uuid_string(&self,) -> String {
|
||||
pub fn short_uuid_string(&self) -> String {
|
||||
(&self.0.simple().to_string()[..8]).to_string()
|
||||
}
|
||||
|
||||
/// Returns a segment uuid string.
|
||||
pub fn uuid_string(&self,) -> String {
|
||||
pub fn uuid_string(&self) -> String {
|
||||
self.0.simple().to_string()
|
||||
}
|
||||
}
|
||||
|
||||
impl Encodable for SegmentId {
|
||||
fn encode<S: Encoder>(&self, s: &mut S) -> Result<(), S::Error> {
|
||||
self.0.encode(s)
|
||||
}
|
||||
}
|
||||
|
||||
impl Decodable for SegmentId {
|
||||
fn decode<D: Decoder>(d: &mut D) -> Result<Self, D::Error> {
|
||||
Uuid::decode(d).map(SegmentId)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "Seg({:?})", self.short_uuid_string())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl PartialOrd for SegmentId {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
|
||||
@@ -3,30 +3,29 @@ use super::SegmentComponent;
|
||||
use std::path::PathBuf;
|
||||
use std::collections::HashSet;
|
||||
|
||||
#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
num_deleted_docs: u32,
|
||||
opstamp: u64,
|
||||
}
|
||||
|
||||
/// SegmentMeta contains simple meta information about a segment.
|
||||
/// `SegmentMeta` contains simple meta information about a segment.
|
||||
///
|
||||
/// For instance the number of docs it contains,
|
||||
/// how many are deleted, etc.
|
||||
#[derive(Clone, Debug, RustcDecodable,RustcEncodable)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct SegmentMeta {
|
||||
segment_id: SegmentId,
|
||||
max_doc: u32,
|
||||
deletes: Option<DeleteMeta>,
|
||||
deletes: Option<DeleteMeta>,
|
||||
}
|
||||
|
||||
impl SegmentMeta {
|
||||
|
||||
/// Creates a new segment meta for
|
||||
/// Creates a new segment meta for
|
||||
/// a segment with no deletes and no documents.
|
||||
pub fn new(segment_id: SegmentId) -> SegmentMeta {
|
||||
SegmentMeta {
|
||||
segment_id: segment_id,
|
||||
segment_id,
|
||||
max_doc: 0,
|
||||
deletes: None,
|
||||
}
|
||||
@@ -53,16 +52,13 @@ impl SegmentMeta {
|
||||
/// and are not used by any segment anymore.
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
SegmentComponent::iterator()
|
||||
.map(|component| {
|
||||
self.relative_path(*component)
|
||||
})
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
|
||||
}
|
||||
|
||||
/// Returns the relative path of a component of our segment.
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
/// associated to a segment component.
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
@@ -73,7 +69,7 @@ impl SegmentMeta {
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => {format!(".{}.del", self.delete_opstamp().unwrap_or(0))},
|
||||
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
});
|
||||
PathBuf::from(path)
|
||||
}
|
||||
@@ -95,9 +91,7 @@ impl SegmentMeta {
|
||||
/// Returns the opstamp of the last delete operation
|
||||
/// taken in account in this segment.
|
||||
pub fn delete_opstamp(&self) -> Option<u64> {
|
||||
self.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.opstamp)
|
||||
self.deletes.as_ref().map(|delete_meta| delete_meta.opstamp)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment meta contains
|
||||
@@ -114,8 +108,8 @@ impl SegmentMeta {
|
||||
#[doc(hidden)]
|
||||
pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) {
|
||||
self.deletes = Some(DeleteMeta {
|
||||
num_deleted_docs: num_deleted_docs,
|
||||
opstamp: opstamp,
|
||||
num_deleted_docs,
|
||||
opstamp,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,27 +2,29 @@ use Result;
|
||||
use core::Segment;
|
||||
use core::SegmentId;
|
||||
use core::SegmentComponent;
|
||||
use schema::Term;
|
||||
use std::sync::RwLock;
|
||||
use common::HasLen;
|
||||
use core::SegmentMeta;
|
||||
use fastfield::delete::DeleteBitSet;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::DeleteBitSet;
|
||||
use store::StoreReader;
|
||||
use schema::Document;
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use std::str;
|
||||
use postings::TermInfo;
|
||||
use datastruct::FstMap;
|
||||
use std::sync::Arc;
|
||||
use std::collections::HashMap;
|
||||
use common::CompositeFile;
|
||||
use std::fmt;
|
||||
use core::InvertedIndexReader;
|
||||
use schema::Field;
|
||||
use postings::SegmentPostingsOption;
|
||||
use postings::SegmentPostings;
|
||||
use fastfield::{U32FastFieldsReader, U32FastFieldReader};
|
||||
use schema::Schema;
|
||||
use schema::FieldType;
|
||||
use postings::FreqHandler;
|
||||
use schema::TextIndexingOptions;
|
||||
use error::ErrorKind;
|
||||
use termdict::TermDictionaryImpl;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Schema;
|
||||
use termdict::TermDictionary;
|
||||
use fastfield::{FastValue, MultiValueIntFastFieldReader};
|
||||
use schema::Cardinality;
|
||||
|
||||
/// Entry point to access all of the datastructures of the `Segment`
|
||||
///
|
||||
@@ -35,17 +37,23 @@ use schema::TextIndexingOptions;
|
||||
/// The segment reader has a very low memory footprint,
|
||||
/// as close to all of the memory data is mmapped.
|
||||
///
|
||||
///
|
||||
/// TODO fix not decoding docfreq
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentReader {
|
||||
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
|
||||
|
||||
segment_id: SegmentId,
|
||||
segment_meta: SegmentMeta,
|
||||
term_infos: Arc<FstMap<TermInfo>>,
|
||||
postings_data: ReadOnlySource,
|
||||
|
||||
termdict_composite: CompositeFile,
|
||||
postings_composite: CompositeFile,
|
||||
positions_composite: CompositeFile,
|
||||
fast_fields_composite: CompositeFile,
|
||||
fieldnorms_composite: CompositeFile,
|
||||
|
||||
store_reader: StoreReader,
|
||||
fast_fields_reader: Arc<U32FastFieldsReader>,
|
||||
fieldnorms_reader: Arc<U32FastFieldsReader>,
|
||||
delete_bitset: DeleteBitSet,
|
||||
positions_data: ReadOnlySource,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
@@ -57,7 +65,7 @@ impl SegmentReader {
|
||||
pub fn max_doc(&self) -> DocId {
|
||||
self.segment_meta.max_doc()
|
||||
}
|
||||
|
||||
|
||||
/// Returns the number of documents.
|
||||
/// Deleted documents are not counted.
|
||||
///
|
||||
@@ -66,7 +74,7 @@ impl SegmentReader {
|
||||
pub fn num_docs(&self) -> DocId {
|
||||
self.segment_meta.num_docs()
|
||||
}
|
||||
|
||||
|
||||
/// Return the number of documents that have been
|
||||
/// deleted in the segment.
|
||||
pub fn num_deleted_docs(&self) -> DocId {
|
||||
@@ -74,104 +82,208 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
/// Accessor to a segment's fast field reader given a field.
|
||||
pub fn get_fast_field_reader(&self, field: Field) -> Option<U32FastFieldReader> {
|
||||
/// Returns the u32 fast value reader if the field
|
||||
/// is a u32 field indexed as "fast".
|
||||
///
|
||||
/// Return None if the field is not a u32 field
|
||||
/// indexed with the fast option.
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
///
|
||||
/// Returns the u64 fast value reader if the field
|
||||
/// is a u64 field indexed as "fast".
|
||||
///
|
||||
/// Return a FastFieldNotAvailableError if the field is not
|
||||
/// declared as a fast field in the schema.
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn fast_field_reader<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match field_entry.field_type() {
|
||||
&FieldType::Str(_) => {
|
||||
warn!("Field <{}> is not a fast field. It is a text field, and fast text fields are not supported yet.", field_entry.name());
|
||||
None
|
||||
},
|
||||
&FieldType::U32(ref u32_options) => {
|
||||
if u32_options.is_fast() {
|
||||
self.fast_fields_reader.get_field(field)
|
||||
}
|
||||
else {
|
||||
warn!("Field <{}> is not defined as a fast field.", field_entry.name());
|
||||
None
|
||||
}
|
||||
},
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
|
||||
{
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
|
||||
/// May panick if the field is not a multivalued fastfield of the type `Item`.
|
||||
pub fn multi_fast_field_reader<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
|
||||
{
|
||||
let idx_reader = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let vals_reader = self.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `FacetReader` associated to a given `Field`.
|
||||
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if field_entry.field_type() != &FieldType::HierarchicalFacet {
|
||||
return Err(ErrorKind::InvalidArgument(format!(
|
||||
"The field {:?} is not a \
|
||||
hierarchical facet.",
|
||||
field_entry
|
||||
)).into());
|
||||
}
|
||||
let term_ords_reader = self.multi_fast_field_reader(field)?;
|
||||
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
|
||||
ErrorKind::InvalidArgument(format!(
|
||||
"The field \"{}\" is a hierarchical \
|
||||
but this segment does not seem to have the field term \
|
||||
dictionary.",
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
let termdict = TermDictionaryImpl::from_source(termdict_source);
|
||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||
Ok(facet_reader)
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
///
|
||||
/// Field norms are the length (in tokens) of the fields.
|
||||
/// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
|
||||
/// It is used in the computation of the [TfIdf]
|
||||
/// (https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
|
||||
///
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<U32FastFieldReader> {
|
||||
self.fieldnorms_reader.get_field(field)
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
pub fn get_fieldnorms_reader(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||
self.fieldnorms_composite
|
||||
.open_read(field)
|
||||
.map(FastFieldReader::open)
|
||||
}
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
pub fn doc_freq(&self, term: &Term) -> u32 {
|
||||
match self.get_term_info(term) {
|
||||
Some(term_info) => term_info.doc_freq,
|
||||
None => 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Accessor to the segment's `StoreReader`.
|
||||
pub fn get_store_reader(&self) -> &StoreReader {
|
||||
&self.store_reader
|
||||
}
|
||||
|
||||
/// Open a new segment for reading.
|
||||
pub fn open(segment: Segment) -> Result<SegmentReader> {
|
||||
pub fn open(segment: &Segment) -> Result<SegmentReader> {
|
||||
let termdict_source = segment.open_read(SegmentComponent::TERMS)?;
|
||||
let termdict_composite = CompositeFile::open(&termdict_source)?;
|
||||
|
||||
let source = try!(segment.open_read(SegmentComponent::TERMS));
|
||||
let term_infos = try!(FstMap::from_source(source));
|
||||
let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE)));
|
||||
let postings_shared_mmap = try!(segment.open_read(SegmentComponent::POSTINGS));
|
||||
|
||||
let fast_field_data = try!(segment.open_read(SegmentComponent::FASTFIELDS));
|
||||
let fast_fields_reader = try!(U32FastFieldsReader::open(fast_field_data));
|
||||
|
||||
let fieldnorms_data = try!(segment.open_read(SegmentComponent::FIELDNORMS));
|
||||
let fieldnorms_reader = try!(U32FastFieldsReader::open(fieldnorms_data));
|
||||
|
||||
let positions_data = segment
|
||||
.open_read(SegmentComponent::POSITIONS)
|
||||
.unwrap_or_else(|_| ReadOnlySource::empty());
|
||||
|
||||
let delete_bitset =
|
||||
if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
DeleteBitSet::open(delete_data)
|
||||
let store_source = segment.open_read(SegmentComponent::STORE)?;
|
||||
let store_reader = StoreReader::from_source(store_source);
|
||||
|
||||
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
|
||||
let postings_composite = CompositeFile::open(&postings_source)?;
|
||||
|
||||
let positions_composite = {
|
||||
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONS) {
|
||||
CompositeFile::open(&source)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
else {
|
||||
DeleteBitSet::empty()
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
|
||||
let delete_bitset = if segment.meta().has_deletes() {
|
||||
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
|
||||
DeleteBitSet::open(delete_data)
|
||||
} else {
|
||||
DeleteBitSet::empty()
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
segment_meta: segment.meta().clone(),
|
||||
postings_data: postings_shared_mmap,
|
||||
term_infos: Arc::new(term_infos),
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_composite,
|
||||
fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_reader: store_reader,
|
||||
fast_fields_reader: Arc::new(fast_fields_reader),
|
||||
fieldnorms_reader: Arc::new(fieldnorms_reader),
|
||||
delete_bitset: delete_bitset,
|
||||
positions_data: positions_data,
|
||||
schema: schema,
|
||||
store_reader,
|
||||
delete_bitset,
|
||||
positions_composite,
|
||||
schema,
|
||||
})
|
||||
}
|
||||
|
||||
/// Return the term dictionary datastructure.
|
||||
pub fn term_infos(&self) -> &FstMap<TermInfo> {
|
||||
&self.term_infos
|
||||
|
||||
/// Returns a field reader associated to the field given in argument.
|
||||
/// If the field was not present in the index during indexing time,
|
||||
/// the InvertedIndexReader is empty.
|
||||
///
|
||||
/// The field reader is in charge of iterating through the
|
||||
/// term dictionary associated to a specific field,
|
||||
/// and opening the posting list associated to any term.
|
||||
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
|
||||
if let Some(inv_idx_reader) = self.inv_idx_reader_cache
|
||||
.read()
|
||||
.expect("Lock poisoned. This should never happen")
|
||||
.get(&field)
|
||||
{
|
||||
return Arc::clone(inv_idx_reader);
|
||||
}
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let record_option_opt = field_type.get_index_record_option();
|
||||
|
||||
if record_option_opt.is_none() {
|
||||
panic!("Field {:?} does not seem indexed.", field_entry.name());
|
||||
}
|
||||
|
||||
let record_option = record_option_opt.unwrap();
|
||||
|
||||
let postings_source_opt = self.postings_composite.open_read(field);
|
||||
|
||||
if postings_source_opt.is_none() {
|
||||
// no documents in the segment contained this field.
|
||||
// As a result, no data is associated to the inverted index.
|
||||
//
|
||||
// Returns an empty inverted index.
|
||||
return Arc::new(InvertedIndexReader::empty(field_type.clone()));
|
||||
}
|
||||
|
||||
let postings_source = postings_source_opt.unwrap();
|
||||
|
||||
let termdict_source = self.termdict_composite
|
||||
.open_read(field)
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
|
||||
let positions_source = self.positions_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
TermDictionaryImpl::from_source(termdict_source),
|
||||
postings_source,
|
||||
positions_source,
|
||||
self.delete_bitset.clone(),
|
||||
record_option,
|
||||
));
|
||||
|
||||
// by releasing the lock in between, we may end up opening the inverting index
|
||||
// twice, but this is fine.
|
||||
self.inv_idx_reader_cache
|
||||
.write()
|
||||
.expect("Field reader cache lock poisoned. This should never happen.")
|
||||
.insert(field, Arc::clone(&inv_idx_reader));
|
||||
|
||||
inv_idx_reader
|
||||
}
|
||||
|
||||
|
||||
/// Returns the document (or to be accurate, its stored field)
|
||||
/// bearing the given doc id.
|
||||
/// This method is slow and should seldom be called from
|
||||
@@ -180,85 +292,6 @@ impl SegmentReader {
|
||||
self.store_reader.get(doc_id)
|
||||
}
|
||||
|
||||
|
||||
/// Returns the segment postings associated with the term, and with the given option,
|
||||
/// or `None` if the term has never been encounterred and indexed.
|
||||
///
|
||||
/// If the field was not indexed with the indexing options that cover
|
||||
/// the requested options, the returned `SegmentPostings` the method does not fail
|
||||
/// and returns a `SegmentPostings` with as much information as possible.
|
||||
///
|
||||
/// For instance, requesting `SegmentPostingsOption::FreqAndPositions` for a `TextIndexingOptions`
|
||||
/// that does not index position will return a `SegmentPostings` with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self, term: &Term, option: SegmentPostingsOption) -> Option<SegmentPostings> {
|
||||
let field = term.field();
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let term_info = get!(self.get_term_info(&term));
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = &self.postings_data[offset..];
|
||||
let freq_handler = match *field_entry.field_type() {
|
||||
FieldType::Str(ref options) => {
|
||||
let indexing_options = options.get_indexing_options();
|
||||
match option {
|
||||
SegmentPostingsOption::NoFreq => {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
SegmentPostingsOption::Freq => {
|
||||
if indexing_options.is_termfreq_enabled() {
|
||||
FreqHandler::new_with_freq()
|
||||
}
|
||||
else {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
}
|
||||
SegmentPostingsOption::FreqAndPositions => {
|
||||
if indexing_options == TextIndexingOptions::TokenizedWithFreqAndPosition {
|
||||
let offseted_position_data = &self.positions_data[term_info.positions_offset as usize ..];
|
||||
FreqHandler::new_with_freq_and_position(offseted_position_data)
|
||||
}
|
||||
else if indexing_options.is_termfreq_enabled()
|
||||
{
|
||||
FreqHandler::new_with_freq()
|
||||
}
|
||||
else {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
FreqHandler::new_without_freq()
|
||||
}
|
||||
};
|
||||
Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, &self.delete_bitset, freq_handler))
|
||||
}
|
||||
|
||||
|
||||
/// Returns the posting list associated with a term.
|
||||
///
|
||||
/// If the term is not found, return None.
|
||||
/// Even when non-null, because of deletes, the posting object
|
||||
/// returned by this method may contain no documents.
|
||||
pub fn read_postings_all_info(&self, term: &Term) -> Option<SegmentPostings> {
|
||||
let field_entry = self.schema.get_field_entry(term.field());
|
||||
let segment_posting_option = match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => {
|
||||
match text_options.get_indexing_options() {
|
||||
TextIndexingOptions::TokenizedWithFreq => SegmentPostingsOption::Freq,
|
||||
TextIndexingOptions::TokenizedWithFreqAndPosition => SegmentPostingsOption::FreqAndPositions,
|
||||
_ => SegmentPostingsOption::NoFreq,
|
||||
}
|
||||
}
|
||||
FieldType::U32(_) => SegmentPostingsOption::NoFreq
|
||||
};
|
||||
self.read_postings(term, segment_posting_option)
|
||||
}
|
||||
|
||||
/// Returns the term info associated with the term.
|
||||
pub fn get_term_info(&self, term: &Term) -> Option<TermInfo> {
|
||||
self.term_infos.get(term.as_slice())
|
||||
}
|
||||
|
||||
/// Returns the segment id
|
||||
pub fn segment_id(&self) -> SegmentId {
|
||||
self.segment_id
|
||||
@@ -270,7 +303,6 @@ impl SegmentReader {
|
||||
&self.delete_bitset
|
||||
}
|
||||
|
||||
|
||||
/// Returns true iff the `doc` is marked
|
||||
/// as deleted.
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
@@ -278,7 +310,6 @@ impl SegmentReader {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl fmt::Debug for SegmentReader {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "SegmentReader({:?})", self.segment_id)
|
||||
|
||||
@@ -1,185 +0,0 @@
|
||||
use fst::Streamer;
|
||||
use std::mem;
|
||||
use std::collections::BinaryHeap;
|
||||
use fst::map::Keys;
|
||||
use schema::Field;
|
||||
use schema::Term;
|
||||
use core::SegmentReader;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
struct HeapItem {
|
||||
term: Term,
|
||||
segment_ord: usize,
|
||||
}
|
||||
|
||||
impl PartialOrd for HeapItem {
|
||||
fn partial_cmp(&self, other: &HeapItem) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for HeapItem {
|
||||
fn cmp(&self, other: &HeapItem) -> Ordering {
|
||||
(&other.term, &other.segment_ord).cmp(&(&self.term, &self.segment_ord))
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a list of sorted term streams,
|
||||
/// returns an iterator over sorted unique terms.
|
||||
///
|
||||
/// The item yield is actually a pair with
|
||||
/// - the term
|
||||
/// - a slice with the ordinal of the segments containing
|
||||
/// the terms.
|
||||
pub struct TermIterator<'a> {
|
||||
key_streams: Vec<Keys<'a>>,
|
||||
heap: BinaryHeap<HeapItem>,
|
||||
// Buffer hosting the list of segment ordinals containing
|
||||
// the current term.
|
||||
current_term: Term,
|
||||
current_segment_ords: Vec<usize>,
|
||||
}
|
||||
|
||||
impl<'a> TermIterator<'a> {
|
||||
fn new(key_streams: Vec<Keys<'a>>) -> TermIterator<'a> {
|
||||
let key_streams_len = key_streams.len();
|
||||
TermIterator {
|
||||
key_streams: key_streams,
|
||||
heap: BinaryHeap::new(),
|
||||
current_term: Term::from_field_text(Field(0), ""),
|
||||
current_segment_ords: (0..key_streams_len).collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Advance the term iterator to the next term.
|
||||
/// Returns true if there is indeed another term
|
||||
/// False if there is none.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
self.advance_segments();
|
||||
if let Some(mut head) = self.heap.pop() {
|
||||
mem::swap(&mut self.current_term, &mut head.term);
|
||||
self.current_segment_ords.push(head.segment_ord);
|
||||
loop {
|
||||
match self.heap.peek() {
|
||||
Some(&ref next_heap_it) if next_heap_it.term == self.current_term => {}
|
||||
_ => { break; }
|
||||
}
|
||||
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
|
||||
self.current_segment_ords.push(next_heap_it.segment_ord);
|
||||
}
|
||||
true
|
||||
}
|
||||
else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns the current term.
|
||||
///
|
||||
/// This method may be called
|
||||
/// iff advance() has been called before
|
||||
/// and "true" was returned.
|
||||
pub fn term(&self) -> &Term {
|
||||
&self.current_term
|
||||
}
|
||||
|
||||
/// Returns the sorted list of segment ordinals
|
||||
/// that include the current term.
|
||||
///
|
||||
/// This method may be called
|
||||
/// iff advance() has been called before
|
||||
/// and "true" was returned.
|
||||
pub fn segment_ords(&self) -> &[usize]{
|
||||
&self.current_segment_ords[..]
|
||||
}
|
||||
|
||||
fn advance_segments(&mut self) {
|
||||
for segment_ord in self.current_segment_ords.drain(..) {
|
||||
if let Some(term) = self.key_streams[segment_ord].next() {
|
||||
self.heap.push(HeapItem {
|
||||
term: Term::from_bytes(term),
|
||||
segment_ord: segment_ord,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'f> Streamer<'a> for TermIterator<'f> {
|
||||
type Item = &'a Term;
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
if self.advance() {
|
||||
Some(&self.current_term)
|
||||
}
|
||||
else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> From<&'a [SegmentReader]> for TermIterator<'a> {
|
||||
fn from(segment_readers: &'a [SegmentReader]) -> TermIterator<'a> {
|
||||
TermIterator::new(
|
||||
segment_readers
|
||||
.iter()
|
||||
.map(|reader| reader.term_infos().keys())
|
||||
.collect()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use schema::{SchemaBuilder, Document, TEXT};
|
||||
use core::Index;
|
||||
|
||||
#[test]
|
||||
fn test_term_iterator() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b d f");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a b c d f");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "e f");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let mut term_it = searcher.terms();
|
||||
let mut terms = String::new();
|
||||
while let Some(term) = term_it.next() {
|
||||
unsafe {
|
||||
terms.push_str(term.text());
|
||||
}
|
||||
}
|
||||
assert_eq!(terms, "abcdef");
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,152 +0,0 @@
|
||||
#![allow(should_implement_trait)]
|
||||
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use fst;
|
||||
use fst::raw::Fst;
|
||||
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
fn convert_fst_error(e: fst::Error) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
}
|
||||
|
||||
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
|
||||
fst_builder: fst::MapBuilder<W>,
|
||||
data: Vec<u8>,
|
||||
_phantom_: PhantomData<V>,
|
||||
}
|
||||
|
||||
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
|
||||
|
||||
pub fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
|
||||
let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error));
|
||||
Ok(FstMapBuilder {
|
||||
fst_builder: fst_builder,
|
||||
data: Vec::new(),
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
/// Horribly unsafe, nobody should ever do that... except me :)
|
||||
///
|
||||
/// If used, it must be used by systematically alternating calls
|
||||
/// to insert_key and insert_value.
|
||||
///
|
||||
/// TODO see if I can bend Rust typesystem to enforce that
|
||||
/// in a nice way.
|
||||
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
|
||||
try!(self.fst_builder
|
||||
.insert(key, self.data.len() as u64)
|
||||
.map_err(convert_fst_error));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Horribly unsafe, nobody should ever do that... except me :)
|
||||
pub fn insert_value(&mut self, value: &V) -> io::Result<()> {
|
||||
try!(value.serialize(&mut self.data));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> {
|
||||
try!(self.fst_builder
|
||||
.insert(key, self.data.len() as u64)
|
||||
.map_err(convert_fst_error));
|
||||
try!(value.serialize(&mut self.data));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self,) -> io::Result<W> {
|
||||
let mut file = try!(
|
||||
self.fst_builder
|
||||
.into_inner()
|
||||
.map_err(convert_fst_error));
|
||||
let footer_size = self.data.len() as u32;
|
||||
try!(file.write_all(&self.data));
|
||||
try!((footer_size as u32).serialize(&mut file));
|
||||
try!(file.flush());
|
||||
Ok(file)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FstMap<V: BinarySerializable> {
|
||||
fst_index: fst::Map,
|
||||
values_mmap: ReadOnlySource,
|
||||
_phantom_: PhantomData<V>,
|
||||
}
|
||||
|
||||
|
||||
fn open_fst_index(source: ReadOnlySource) -> io::Result<fst::Map> {
|
||||
Ok(fst::Map::from(match source {
|
||||
ReadOnlySource::Anonymous(data) => try!(Fst::from_shared_bytes(data.data, data.start, data.len).map_err(convert_fst_error)),
|
||||
ReadOnlySource::Mmap(mmap_readonly) => try!(Fst::from_mmap(mmap_readonly).map_err(convert_fst_error)),
|
||||
}))
|
||||
}
|
||||
|
||||
impl<V: BinarySerializable> FstMap<V> {
|
||||
|
||||
pub fn keys(&self,) -> fst::map::Keys {
|
||||
self.fst_index.keys()
|
||||
}
|
||||
|
||||
pub fn from_source(source: ReadOnlySource) -> io::Result<FstMap<V>> {
|
||||
let total_len = source.len();
|
||||
let length_offset = total_len - 4;
|
||||
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
|
||||
let footer_size = try!(u32::deserialize(&mut split_len_buffer)) as usize;
|
||||
let split_len = length_offset - footer_size;
|
||||
let fst_source = source.slice(0, split_len);
|
||||
let values_source = source.slice(split_len, length_offset);
|
||||
let fst_index = try!(open_fst_index(fst_source));
|
||||
Ok(FstMap {
|
||||
fst_index: fst_index,
|
||||
values_mmap: values_source,
|
||||
_phantom_: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
fn read_value(&self, offset: u64) -> V {
|
||||
let buffer = self.values_mmap.as_slice();
|
||||
let mut cursor = &buffer[(offset as usize)..];
|
||||
V::deserialize(&mut cursor).expect("Data in FST is corrupted")
|
||||
}
|
||||
|
||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<V> {
|
||||
self.fst_index
|
||||
.get(key)
|
||||
.map(|offset| self.read_value(offset))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use directory::{RAMDirectory, Directory};
|
||||
use std::path::PathBuf;
|
||||
use fst::Streamer;
|
||||
|
||||
#[test]
|
||||
fn test_fstmap() {
|
||||
let mut directory = RAMDirectory::create();
|
||||
let path = PathBuf::from("fstmap");
|
||||
{
|
||||
let write = directory.open_write(&path).unwrap();
|
||||
let mut fstmap_builder = FstMapBuilder::new(write).unwrap();
|
||||
fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap();
|
||||
fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap();
|
||||
fstmap_builder.finish().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
let fstmap: FstMap<u32> = FstMap::from_source(source).unwrap();
|
||||
assert_eq!(fstmap.get("abc"), Some(34u32));
|
||||
assert_eq!(fstmap.get("abcd"), Some(346u32));
|
||||
let mut keys = fstmap.keys();
|
||||
assert_eq!(keys.next().unwrap(), "abc".as_bytes());
|
||||
assert_eq!(keys.next().unwrap(), "abcd".as_bytes());
|
||||
assert_eq!(keys.next(), None);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,7 +1,4 @@
|
||||
mod fstmap;
|
||||
mod skip;
|
||||
pub mod stacker;
|
||||
|
||||
pub use self::fstmap::FstMapBuilder;
|
||||
pub use self::fstmap::FstMap;
|
||||
pub use self::skip::{SkipListBuilder, SkipList};
|
||||
pub use self::skip::{SkipList, SkipListBuilder};
|
||||
|
||||
@@ -6,17 +6,15 @@ mod skiplist;
|
||||
pub use self::skiplist_builder::SkipListBuilder;
|
||||
pub use self::skiplist::SkipList;
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use super::{SkipList, SkipListBuilder};
|
||||
|
||||
#[test]
|
||||
fn test_skiplist() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
|
||||
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(8);
|
||||
skip_list_builder.insert(2, &3).unwrap();
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
let mut skip_list: SkipList<u32> = SkipList::from(output.as_slice());
|
||||
@@ -26,7 +24,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_skiplist2() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
|
||||
let skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(8);
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
let mut skip_list: SkipList<u32> = SkipList::from(output.as_slice());
|
||||
assert_eq!(skip_list.next(), None);
|
||||
@@ -73,7 +71,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_skiplist5() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
|
||||
skip_list_builder.insert(2, &()).unwrap();
|
||||
skip_list_builder.insert(3, &()).unwrap();
|
||||
skip_list_builder.insert(5, &()).unwrap();
|
||||
@@ -105,7 +103,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_skiplist7() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
|
||||
for i in 0..1000 {
|
||||
skip_list_builder.insert(i, &()).unwrap();
|
||||
}
|
||||
@@ -114,44 +112,57 @@ mod tests {
|
||||
let mut skip_list: SkipList<()> = SkipList::from(output.as_slice());
|
||||
assert_eq!(skip_list.next().unwrap(), (0, ()));
|
||||
skip_list.seek(431);
|
||||
assert_eq!(skip_list.next().unwrap(), (431,()) );
|
||||
assert_eq!(skip_list.next().unwrap(), (431, ()));
|
||||
skip_list.seek(1003);
|
||||
assert_eq!(skip_list.next().unwrap(), (1004,()) );
|
||||
assert_eq!(skip_list.next().unwrap(), (1004, ()));
|
||||
assert_eq!(skip_list.next(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skiplist8() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(10);
|
||||
let mut skip_list_builder: SkipListBuilder<u64> = SkipListBuilder::new(8);
|
||||
skip_list_builder.insert(2, &3).unwrap();
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 13);
|
||||
assert_eq!(output.len(), 11);
|
||||
assert_eq!(output[0], 1u8 + 128u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skiplist9() {
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<u32> = SkipListBuilder::new(3);
|
||||
for i in 0..9 {
|
||||
let mut skip_list_builder: SkipListBuilder<u64> = SkipListBuilder::new(4);
|
||||
for i in 0..4 * 4 * 4 {
|
||||
skip_list_builder.insert(i, &i).unwrap();
|
||||
}
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 117);
|
||||
assert_eq!(output[0], 3u8 + 128u8);
|
||||
assert_eq!(output.len(), 774);
|
||||
assert_eq!(output[0], 4u8 + 128u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skiplist10() {
|
||||
// checking that void gets serialized to nothing.
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(3);
|
||||
for i in 0..9 {
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
|
||||
for i in 0..((4 * 4 * 4) - 1) {
|
||||
skip_list_builder.insert(i, &()).unwrap();
|
||||
}
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 81);
|
||||
assert_eq!(output.len(), 230);
|
||||
assert_eq!(output[0], 128u8 + 3u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skiplist11() {
|
||||
// checking that void gets serialized to nothing.
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let mut skip_list_builder: SkipListBuilder<()> = SkipListBuilder::new(4);
|
||||
for i in 0..(4 * 4) {
|
||||
skip_list_builder.insert(i, &()).unwrap();
|
||||
}
|
||||
skip_list_builder.write::<Vec<u8>>(&mut output).unwrap();
|
||||
assert_eq!(output.len(), 65);
|
||||
assert_eq!(output[0], 128u8 + 3u8);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use common::BinarySerializable;
|
||||
use common::{BinarySerializable, VInt};
|
||||
use std::marker::PhantomData;
|
||||
use DocId;
|
||||
use std::cmp::max;
|
||||
|
||||
static EMPTY: [u8; 0] = [];
|
||||
@@ -8,130 +7,127 @@ static EMPTY: [u8; 0] = [];
|
||||
struct Layer<'a, T> {
|
||||
data: &'a [u8],
|
||||
cursor: &'a [u8],
|
||||
next_id: DocId,
|
||||
next_id: Option<u64>,
|
||||
_phantom_: PhantomData<T>,
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> Iterator for Layer<'a, T> {
|
||||
type Item = (u64, T);
|
||||
|
||||
type Item = (DocId, T);
|
||||
|
||||
fn next(&mut self,)-> Option<(DocId, T)> {
|
||||
if self.next_id == u32::max_value() {
|
||||
None
|
||||
}
|
||||
else {
|
||||
fn next(&mut self) -> Option<(u64, T)> {
|
||||
if let Some(cur_id) = self.next_id {
|
||||
let cur_val = T::deserialize(&mut self.cursor).unwrap();
|
||||
let cur_id = self.next_id;
|
||||
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
|
||||
self.next_id = VInt::deserialize_u64(&mut self.cursor).ok();
|
||||
Some((cur_id, cur_val))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> From<&'a [u8]> for Layer<'a, T> {
|
||||
fn from(data: &'a [u8]) -> Layer<'a, T> {
|
||||
let mut cursor = data;
|
||||
let next_id = u32::deserialize(&mut cursor).unwrap_or(u32::max_value());
|
||||
let mut cursor = data;
|
||||
let next_id = VInt::deserialize_u64(&mut cursor).ok();
|
||||
Layer {
|
||||
data: data,
|
||||
cursor: cursor,
|
||||
next_id: next_id,
|
||||
data,
|
||||
cursor,
|
||||
next_id,
|
||||
_phantom_: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> Layer<'a, T> {
|
||||
|
||||
fn empty() -> Layer<'a, T> {
|
||||
Layer {
|
||||
data: &EMPTY,
|
||||
cursor: &EMPTY,
|
||||
next_id: DocId::max_value(),
|
||||
next_id: None,
|
||||
_phantom_: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
fn seek_offset(&mut self, offset: usize) {
|
||||
fn seek_offset(&mut self, offset: usize) {
|
||||
self.cursor = &self.data[offset..];
|
||||
self.next_id = u32::deserialize(&mut self.cursor).unwrap_or(u32::max_value());
|
||||
self.next_id = VInt::deserialize_u64(&mut self.cursor).ok();
|
||||
}
|
||||
|
||||
|
||||
// Returns the last element (key, val)
|
||||
// such that (key < doc_id)
|
||||
//
|
||||
// If there is no such element anymore,
|
||||
// returns None.
|
||||
fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
|
||||
let mut val = None;
|
||||
while self.next_id < doc_id {
|
||||
match self.next() {
|
||||
None => { break; },
|
||||
v => { val = v; }
|
||||
//
|
||||
// If the element exists, it will be returned
|
||||
// at the next call to `.next()`.
|
||||
fn seek(&mut self, key: u64) -> Option<(u64, T)> {
|
||||
let mut result: Option<(u64, T)> = None;
|
||||
loop {
|
||||
if let Some(next_id) = self.next_id {
|
||||
if next_id < key {
|
||||
if let Some(v) = self.next() {
|
||||
result = Some(v);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
val
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct SkipList<'a, T: BinarySerializable> {
|
||||
data_layer: Layer<'a, T>,
|
||||
skip_layers: Vec<Layer<'a, u32>>,
|
||||
skip_layers: Vec<Layer<'a, u64>>,
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> Iterator for SkipList<'a, T> {
|
||||
type Item = (u64, T);
|
||||
|
||||
type Item = (DocId, T);
|
||||
|
||||
fn next(&mut self,)-> Option<(DocId, T)> {
|
||||
fn next(&mut self) -> Option<(u64, T)> {
|
||||
self.data_layer.next()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T: BinarySerializable> SkipList<'a, T> {
|
||||
|
||||
pub fn seek(&mut self, doc_id: DocId) -> Option<(DocId, T)> {
|
||||
let mut next_layer_skip: Option<(DocId, u32)> = None;
|
||||
pub fn seek(&mut self, key: u64) -> Option<(u64, T)> {
|
||||
let mut next_layer_skip: Option<(u64, u64)> = None;
|
||||
for skip_layer in &mut self.skip_layers {
|
||||
if let Some((_, offset)) = next_layer_skip {
|
||||
skip_layer.seek_offset(offset as usize);
|
||||
}
|
||||
next_layer_skip = skip_layer.seek(doc_id);
|
||||
}
|
||||
if let Some((_, offset)) = next_layer_skip {
|
||||
self.data_layer.seek_offset(offset as usize);
|
||||
}
|
||||
self.data_layer.seek(doc_id)
|
||||
next_layer_skip = skip_layer.seek(key);
|
||||
}
|
||||
if let Some((_, offset)) = next_layer_skip {
|
||||
self.data_layer.seek_offset(offset as usize);
|
||||
}
|
||||
self.data_layer.seek(key)
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
impl<'a, T: BinarySerializable> From<&'a [u8]> for SkipList<'a, T> {
|
||||
|
||||
fn from(mut data: &'a [u8]) -> SkipList<'a, T> {
|
||||
let offsets: Vec<u32> = Vec::deserialize(&mut data).unwrap();
|
||||
let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|el| el.0)
|
||||
.collect();
|
||||
let num_layers = offsets.len();
|
||||
let layers_data: &[u8] = data;
|
||||
let data_layer: Layer<'a, T> =
|
||||
if num_layers == 0 { Layer::empty() }
|
||||
else {
|
||||
let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize];
|
||||
Layer::from(first_layer_data)
|
||||
};
|
||||
let data_layer: Layer<'a, T> = if num_layers == 0 {
|
||||
Layer::empty()
|
||||
} else {
|
||||
let first_layer_data: &[u8] = &layers_data[..offsets[0] as usize];
|
||||
Layer::from(first_layer_data)
|
||||
};
|
||||
let skip_layers = (0..max(1, num_layers) - 1)
|
||||
.map(|i| (offsets[i] as usize, offsets[i + 1] as usize))
|
||||
.map(|(start, stop)| {
|
||||
Layer::from(&layers_data[start..stop])
|
||||
})
|
||||
.map(|(start, stop)| Layer::from(&layers_data[start..stop]))
|
||||
.collect();
|
||||
SkipList {
|
||||
skip_layers: skip_layers,
|
||||
data_layer: data_layer,
|
||||
skip_layers,
|
||||
data_layer,
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,71 +1,65 @@
|
||||
use std::io::Write;
|
||||
use common::BinarySerializable;
|
||||
use common::{BinarySerializable, VInt, is_power_of_2};
|
||||
use std::marker::PhantomData;
|
||||
use DocId;
|
||||
use std::io;
|
||||
|
||||
struct LayerBuilder<T: BinarySerializable> {
|
||||
period: usize,
|
||||
period_mask: usize,
|
||||
buffer: Vec<u8>,
|
||||
remaining: usize,
|
||||
len: usize,
|
||||
_phantom_: PhantomData<T>,
|
||||
}
|
||||
|
||||
impl<T: BinarySerializable> LayerBuilder<T> {
|
||||
|
||||
fn written_size(&self,) -> usize {
|
||||
fn written_size(&self) -> usize {
|
||||
self.buffer.len()
|
||||
}
|
||||
|
||||
fn write(&self, output: &mut Write) -> Result<(), io::Error> {
|
||||
try!(output.write_all(&self.buffer));
|
||||
output.write_all(&self.buffer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn with_period(period: usize) -> LayerBuilder<T> {
|
||||
assert!(is_power_of_2(period), "The period has to be a power of 2.");
|
||||
LayerBuilder {
|
||||
period: period,
|
||||
period_mask: (period - 1),
|
||||
buffer: Vec::new(),
|
||||
remaining: period,
|
||||
len: 0,
|
||||
_phantom_: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
fn insert(&mut self, doc_id: DocId, value: &T) -> io::Result<Option<(DocId, u32)>> {
|
||||
self.remaining -= 1;
|
||||
fn insert(&mut self, key: u64, value: &T) -> io::Result<Option<(u64, u64)>> {
|
||||
self.len += 1;
|
||||
let offset = self.written_size() as u32;
|
||||
try!(doc_id.serialize(&mut self.buffer));
|
||||
try!(value.serialize(&mut self.buffer));
|
||||
Ok(if self.remaining == 0 {
|
||||
self.remaining = self.period;
|
||||
Some((doc_id, offset))
|
||||
}
|
||||
else { None })
|
||||
let offset = self.written_size() as u64;
|
||||
VInt(key).serialize(&mut self.buffer)?;
|
||||
value.serialize(&mut self.buffer)?;
|
||||
let emit_skip_info = (self.period_mask & self.len) == 0;
|
||||
if emit_skip_info {
|
||||
Ok(Some((key, offset)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub struct SkipListBuilder<T: BinarySerializable> {
|
||||
period: usize,
|
||||
data_layer: LayerBuilder<T>,
|
||||
skip_layers: Vec<LayerBuilder<u32>>,
|
||||
skip_layers: Vec<LayerBuilder<u64>>,
|
||||
}
|
||||
|
||||
|
||||
impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
|
||||
pub fn new(period: usize) -> SkipListBuilder<T> {
|
||||
SkipListBuilder {
|
||||
period: period,
|
||||
period,
|
||||
data_layer: LayerBuilder::with_period(period),
|
||||
skip_layers: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_skip_layer(&mut self, layer_id: usize) -> &mut LayerBuilder<u32> {
|
||||
fn get_skip_layer(&mut self, layer_id: usize) -> &mut LayerBuilder<u64> {
|
||||
if layer_id == self.skip_layers.len() {
|
||||
let layer_builder = LayerBuilder::with_period(self.period);
|
||||
self.skip_layers.push(layer_builder);
|
||||
@@ -73,34 +67,32 @@ impl<T: BinarySerializable> SkipListBuilder<T> {
|
||||
&mut self.skip_layers[layer_id]
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, doc_id: DocId, dest: &T) -> io::Result<()> {
|
||||
pub fn insert(&mut self, key: u64, dest: &T) -> io::Result<()> {
|
||||
let mut layer_id = 0;
|
||||
let mut skip_pointer = try!(self.data_layer.insert(doc_id, dest));
|
||||
let mut skip_pointer = self.data_layer.insert(key, dest)?;
|
||||
loop {
|
||||
skip_pointer = match skip_pointer {
|
||||
Some((skip_doc_id, skip_offset)) =>
|
||||
try!(self
|
||||
.get_skip_layer(layer_id)
|
||||
.insert(skip_doc_id, &skip_offset)),
|
||||
None => { return Ok(()); }
|
||||
Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id)
|
||||
.insert(skip_doc_id, &skip_offset)?,
|
||||
None => {
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
layer_id += 1;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write<W: Write>(self, output: &mut Write) -> io::Result<()> {
|
||||
let mut size: u32 = 0;
|
||||
let mut layer_sizes: Vec<u32> = Vec::new();
|
||||
size += self.data_layer.buffer.len() as u32;
|
||||
layer_sizes.push(size);
|
||||
pub fn write<W: Write>(self, output: &mut W) -> io::Result<()> {
|
||||
let mut size: u64 = self.data_layer.buffer.len() as u64;
|
||||
let mut layer_sizes = vec![VInt(size)];
|
||||
for layer in self.skip_layers.iter().rev() {
|
||||
size += layer.buffer.len() as u32;
|
||||
layer_sizes.push(size);
|
||||
size += layer.buffer.len() as u64;
|
||||
layer_sizes.push(VInt(size));
|
||||
}
|
||||
try!(layer_sizes.serialize(output));
|
||||
try!(self.data_layer.write(output));
|
||||
layer_sizes.serialize(output)?;
|
||||
self.data_layer.write(output)?;
|
||||
for layer in self.skip_layers.iter().rev() {
|
||||
try!(layer.write(output));
|
||||
layer.write(output)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::mem;
|
||||
use super::heap::{Heap, HeapAllocable};
|
||||
|
||||
|
||||
#[inline]
|
||||
pub fn is_power_of_2(val: u32) -> bool {
|
||||
val & (val - 1) == 0
|
||||
@@ -9,11 +8,10 @@ pub fn is_power_of_2(val: u32) -> bool {
|
||||
|
||||
#[inline]
|
||||
pub fn jump_needed(val: u32) -> bool {
|
||||
val > 3 && is_power_of_2(val)
|
||||
val > 3 && is_power_of_2(val)
|
||||
}
|
||||
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ExpUnrolledLinkedList {
|
||||
len: u32,
|
||||
end: u32,
|
||||
@@ -24,10 +22,9 @@ pub struct ExpUnrolledLinkedList {
|
||||
}
|
||||
|
||||
impl ExpUnrolledLinkedList {
|
||||
|
||||
pub fn iter<'a>(&self, addr: u32, heap: &'a Heap) -> ExpUnrolledLinkedListIterator<'a> {
|
||||
ExpUnrolledLinkedListIterator {
|
||||
heap: heap,
|
||||
heap,
|
||||
addr: addr + 2u32 * (mem::size_of::<u32>() as u32),
|
||||
len: self.len,
|
||||
consumed: 0,
|
||||
@@ -42,16 +39,21 @@ impl ExpUnrolledLinkedList {
|
||||
// the next block as a size of (length so far),
|
||||
// and we need to add 1u32 to store the pointer
|
||||
// to the next element.
|
||||
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
|
||||
let new_block_size: usize = (self.len as usize + 1) * mem::size_of::<u32>();
|
||||
let new_block_addr: u32 = heap.allocate_space(new_block_size);
|
||||
heap.set(self.end, &new_block_addr);
|
||||
self.end = new_block_addr;
|
||||
self.end = new_block_addr;
|
||||
}
|
||||
heap.set(self.end, &val);
|
||||
self.end += mem::size_of::<u32>() as u32;
|
||||
}
|
||||
}
|
||||
|
||||
impl HeapAllocable for u32 {
|
||||
fn with_addr(_addr: u32) -> u32 {
|
||||
0u32
|
||||
}
|
||||
}
|
||||
|
||||
impl HeapAllocable for ExpUnrolledLinkedList {
|
||||
fn with_addr(addr: u32) -> ExpUnrolledLinkedList {
|
||||
@@ -77,33 +79,26 @@ pub struct ExpUnrolledLinkedListIterator<'a> {
|
||||
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self,) -> Option<u32> {
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.consumed == self.len {
|
||||
None
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
let addr: u32;
|
||||
self.consumed += 1;
|
||||
if jump_needed(self.consumed) {
|
||||
addr = *self.heap.get_mut_ref(self.addr);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
addr = self.addr;
|
||||
}
|
||||
self.addr = addr + mem::size_of::<u32>() as u32;
|
||||
Some(*self.heap.get_mut_ref(addr))
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
use super::*;
|
||||
use super::super::heap::Heap;
|
||||
use test::Bencher;
|
||||
@@ -147,7 +142,7 @@ mod tests {
|
||||
|
||||
#[bench]
|
||||
fn bench_push_stack(bench: &mut Bencher) {
|
||||
let heap = Heap::with_capacity(64_000_000);
|
||||
let heap = Heap::with_capacity(64_000_000);
|
||||
bench.iter(|| {
|
||||
let mut stacks = Vec::with_capacity(100);
|
||||
for _ in 0..NUM_STACK {
|
||||
@@ -163,4 +158,4 @@ mod tests {
|
||||
heap.clear();
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,26 +1,77 @@
|
||||
use std::iter;
|
||||
use std::marker::PhantomData;
|
||||
use super::heap::{Heap, HeapAllocable, BytesRef};
|
||||
use std::mem;
|
||||
use postings::UnorderedTermId;
|
||||
use super::heap::{BytesRef, Heap, HeapAllocable};
|
||||
|
||||
mod murmurhash2 {
|
||||
|
||||
const SEED: u32 = 3_242_157_231u32;
|
||||
|
||||
#[inline(always)]
|
||||
pub fn murmurhash2(key: &[u8]) -> u32 {
|
||||
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
|
||||
let m: u32 = 0x5bd1_e995;
|
||||
let r = 24;
|
||||
let len = key.len() as u32;
|
||||
|
||||
/// dbj2 hash function
|
||||
fn djb2(key: &[u8]) -> u64 {
|
||||
let mut state: u64 = 5381;
|
||||
for &b in key {
|
||||
state = (state << 5).wrapping_add(state).wrapping_add(b as u64);
|
||||
let mut h: u32 = SEED ^ len;
|
||||
let num_blocks = len >> 2;
|
||||
for _ in 0..num_blocks {
|
||||
let mut k: u32 = unsafe { *key_ptr };
|
||||
k = k.wrapping_mul(m);
|
||||
k ^= k >> r;
|
||||
k = k.wrapping_mul(m);
|
||||
k = k.wrapping_mul(m);
|
||||
h ^= k;
|
||||
key_ptr = key_ptr.wrapping_offset(1);
|
||||
}
|
||||
|
||||
// Handle the last few bytes of the input array
|
||||
let remaining = len & 3;
|
||||
let key_ptr_u8: *const u8 = key_ptr as *const u8;
|
||||
match remaining {
|
||||
3 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(2)) } << 16;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
}
|
||||
2 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
}
|
||||
1 => {
|
||||
h ^= unsafe { u32::from(*key_ptr_u8) };
|
||||
h = h.wrapping_mul(m);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(m);
|
||||
h ^ (h >> 15)
|
||||
}
|
||||
state
|
||||
}
|
||||
|
||||
impl Default for BytesRef {
|
||||
fn default() -> BytesRef {
|
||||
BytesRef {
|
||||
start: 0u32,
|
||||
stop: 0u32,
|
||||
}
|
||||
}
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
/// - the hash table "table" itself.
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
pub(crate) fn split_memory(per_thread_memory_budget: usize) -> (usize, usize) {
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
let compute_table_size = |num_bits: usize| (1 << num_bits) * mem::size_of::<KeyValue>();
|
||||
let table_num_bits: usize = (1..)
|
||||
.into_iter()
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
.expect(&format!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
));
|
||||
let table_size = compute_table_size(table_num_bits);
|
||||
let heap_size = per_thread_memory_budget - table_size;
|
||||
(heap_size, table_num_bits)
|
||||
}
|
||||
|
||||
/// `KeyValue` is the item stored in the hash table.
|
||||
@@ -29,27 +80,21 @@ impl Default for BytesRef {
|
||||
///
|
||||
/// The key and the value are actually stored contiguously.
|
||||
/// For this reason, the (start, stop) information is actually redundant
|
||||
/// and can be simplified in the future
|
||||
/// and can be simplified in the future
|
||||
#[derive(Copy, Clone, Default)]
|
||||
struct KeyValue {
|
||||
key: BytesRef,
|
||||
value_addr: u32,
|
||||
key_value_addr: BytesRef,
|
||||
hash: u32,
|
||||
}
|
||||
|
||||
impl KeyValue {
|
||||
fn is_empty(&self,) -> bool {
|
||||
self.key.stop == 0u32
|
||||
fn is_empty(&self) -> bool {
|
||||
self.key_value_addr.is_null()
|
||||
}
|
||||
}
|
||||
|
||||
pub enum Entry {
|
||||
Vacant(usize),
|
||||
Occupied(u32),
|
||||
}
|
||||
|
||||
|
||||
/// Customized `HashMap` with string keys
|
||||
///
|
||||
///
|
||||
/// This `HashMap` takes String as keys. Keys are
|
||||
/// stored in a user defined heap.
|
||||
///
|
||||
@@ -57,114 +102,115 @@ pub enum Entry {
|
||||
/// the computation of the hash of the key twice,
|
||||
/// or copying the key as long as there is no insert.
|
||||
///
|
||||
pub struct HashMap<'a, V> where V: HeapAllocable {
|
||||
pub struct TermHashMap<'a> {
|
||||
table: Box<[KeyValue]>,
|
||||
heap: &'a Heap,
|
||||
_phantom: PhantomData<V>,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
}
|
||||
|
||||
impl<'a, V> HashMap<'a, V> where V: HeapAllocable {
|
||||
struct QuadraticProbing {
|
||||
hash: usize,
|
||||
i: usize,
|
||||
mask: usize,
|
||||
}
|
||||
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> HashMap<'a, V> {
|
||||
impl QuadraticProbing {
|
||||
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
|
||||
QuadraticProbing {
|
||||
hash,
|
||||
i: 0,
|
||||
mask,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_probe(&mut self) -> usize {
|
||||
self.i += 1;
|
||||
(self.hash + self.i * self.i) & self.mask
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TermHashMap<'a> {
|
||||
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
|
||||
let table_size = 1 << num_bucket_power_of_2;
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
|
||||
.take(table_size)
|
||||
.collect();
|
||||
HashMap {
|
||||
let table: Vec<KeyValue> = iter::repeat(KeyValue::default()).take(table_size).collect();
|
||||
TermHashMap {
|
||||
table: table.into_boxed_slice(),
|
||||
heap: heap,
|
||||
_phantom: PhantomData,
|
||||
heap,
|
||||
mask: table_size - 1,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn bucket(&self, key: &[u8]) -> usize {
|
||||
let hash: u64 = djb2(key);
|
||||
(hash as usize) & self.mask
|
||||
fn probe(&self, hash: u32) -> QuadraticProbing {
|
||||
QuadraticProbing::compute(hash as usize, self.mask)
|
||||
}
|
||||
|
||||
fn get_key(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
self.heap.get_slice(bytes_ref)
|
||||
pub fn is_saturated(&self) -> bool {
|
||||
self.table.len() < self.occupied.len() * 3
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, key_bytes: &[u8], bucket: usize, addr: u32) -> u32 {
|
||||
#[inline(never)]
|
||||
fn get_key_value(&self, bytes_ref: BytesRef) -> (&[u8], u32) {
|
||||
let key_bytes: &[u8] = self.heap.get_slice(bytes_ref);
|
||||
let expull_addr: u32 = bytes_ref.addr() + 2 + key_bytes.len() as u32;
|
||||
(key_bytes, expull_addr)
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
|
||||
self.occupied.push(bucket);
|
||||
self.table[bucket] = KeyValue {
|
||||
key: self.heap.allocate_and_set(key_bytes),
|
||||
value_addr: addr,
|
||||
key_value_addr, hash
|
||||
};
|
||||
addr
|
||||
}
|
||||
|
||||
pub fn iter<'b: 'a>(&'b self,) -> impl Iterator<Item=(&'a [u8], (u32, &'a V))> + 'b {
|
||||
let heap: &'a Heap = self.heap;
|
||||
let table: &'b [KeyValue] = &self.table;
|
||||
self.occupied
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(move |bucket: usize| {
|
||||
let kv = table[bucket];
|
||||
let addr = kv.value_addr;
|
||||
let v: &V = heap.get_mut_ref::<V>(addr);
|
||||
(heap.get_slice(kv.key), (addr, v))
|
||||
})
|
||||
// .map(move |addr: u32| (heap.get_mut_ref::<V>(addr)) )
|
||||
}
|
||||
|
||||
pub fn values_mut<'b: 'a>(&'b self,) -> impl Iterator<Item=&'a mut V> + 'b {
|
||||
let heap: &'a Heap = self.heap;
|
||||
let table: &'b [KeyValue] = &self.table;
|
||||
self.occupied
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(move |bucket: usize| table[bucket].value_addr)
|
||||
.map(move |addr: u32| heap.get_mut_ref::<V>(addr))
|
||||
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32, UnorderedTermId)> + 'b {
|
||||
self.occupied.iter().cloned().map(move |bucket: usize| {
|
||||
let kv = self.table[bucket];
|
||||
let (key, offset) = self.get_key_value(kv.key_value_addr);
|
||||
(key, offset, bucket as UnorderedTermId)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_or_create<S: AsRef<[u8]>>(&mut self, key: S) -> &mut V {
|
||||
let entry = self.lookup(key.as_ref());
|
||||
match entry {
|
||||
Entry::Occupied(addr) => {
|
||||
self.heap.get_mut_ref(addr)
|
||||
}
|
||||
Entry::Vacant(bucket) => {
|
||||
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
|
||||
self.set_bucket(key.as_ref(), bucket, addr);
|
||||
val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn lookup<S: AsRef<[u8]>>(&self, key: S) -> Entry {
|
||||
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
|
||||
&mut self,
|
||||
key: S,
|
||||
) -> (UnorderedTermId, &mut V) {
|
||||
let key_bytes: &[u8] = key.as_ref();
|
||||
let mut bucket = self.bucket(key_bytes);
|
||||
let hash = murmurhash2::murmurhash2(key.as_ref());
|
||||
let mut probe = self.probe(hash);
|
||||
loop {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
return Entry::Vacant(bucket);
|
||||
let key_bytes_ref = self.heap.allocate_and_set(key_bytes);
|
||||
let (addr, val): (u32, &mut V) = self.heap.allocate_object();
|
||||
assert_eq!(addr, key_bytes_ref.addr() + 2 + key_bytes.len() as u32);
|
||||
self.set_bucket(hash, key_bytes_ref, bucket);
|
||||
return (bucket as UnorderedTermId, val);
|
||||
} else if kv.hash == hash {
|
||||
let (stored_key, expull_addr): (&[u8], u32) = self.get_key_value(kv.key_value_addr);
|
||||
if stored_key == key_bytes {
|
||||
return (
|
||||
bucket as UnorderedTermId,
|
||||
self.heap.get_mut_ref(expull_addr),
|
||||
);
|
||||
}
|
||||
}
|
||||
if self.get_key(kv.key) == key_bytes {
|
||||
return Entry::Occupied(kv.value_addr);
|
||||
}
|
||||
bucket = (bucket + 1) & self.mask;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
use super::*;
|
||||
use super::super::heap::{Heap, HeapAllocable};
|
||||
use super::djb2;
|
||||
use super::murmurhash2::murmurhash2;
|
||||
use test::Bencher;
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::Hasher;
|
||||
use std::collections::HashSet;
|
||||
use super::split_memory;
|
||||
|
||||
struct TestValue {
|
||||
val: u32,
|
||||
@@ -180,54 +226,84 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(split_memory(100_000), (67232, 12));
|
||||
assert_eq!(split_memory(1_000_000), (737856, 15));
|
||||
assert_eq!(split_memory(10_000_000), (7902848, 18));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hash_map() {
|
||||
let heap = Heap::with_capacity(2_000_000);
|
||||
let mut hash_map: HashMap<TestValue> = HashMap::new(18, &heap);
|
||||
let mut hash_map: TermHashMap = TermHashMap::new(18, &heap);
|
||||
{
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc");
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc").1;
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 3u32;
|
||||
|
||||
}
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd");
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
|
||||
assert_eq!(v.val, 0u32);
|
||||
v.val = 4u32;
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc");
|
||||
let v: &mut TestValue = hash_map.get_or_create("abc").1;
|
||||
assert_eq!(v.val, 3u32);
|
||||
}
|
||||
{
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd");
|
||||
let v: &mut TestValue = hash_map.get_or_create("abcd").1;
|
||||
assert_eq!(v.val, 4u32);
|
||||
}
|
||||
let mut iter_values = hash_map.values_mut();
|
||||
assert_eq!(iter_values.next().unwrap().val, 3u32);
|
||||
assert_eq!(iter_values.next().unwrap().val, 4u32);
|
||||
assert!(!iter_values.next().is_some());
|
||||
let mut iter_values = hash_map.iter();
|
||||
{
|
||||
let (_, addr, _) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 3u32);
|
||||
}
|
||||
{
|
||||
let (_, addr, _) = iter_values.next().unwrap();
|
||||
let val: &TestValue = heap.get_ref(addr);
|
||||
assert_eq!(val.val, 4u32);
|
||||
}
|
||||
assert!(iter_values.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur() {
|
||||
let s1 = "abcdef";
|
||||
let s2 = "abcdeg";
|
||||
for i in 0..5 {
|
||||
assert_eq!(
|
||||
murmurhash2(&s1[i..5].as_bytes()),
|
||||
murmurhash2(&s2[i..5].as_bytes())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_murmur_collisions() {
|
||||
let mut set: HashSet<u32> = HashSet::default();
|
||||
for i in 0..10_000 {
|
||||
let s = format!("hash{}", i);
|
||||
let hash = murmurhash2(s.as_bytes());
|
||||
set.insert(hash);
|
||||
}
|
||||
assert_eq!(set.len(), 10_000);
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_djb2(bench: &mut Bencher) {
|
||||
let v = String::from("abwer");
|
||||
bench.iter(|| {
|
||||
djb2(v.as_bytes())
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_siphasher(bench: &mut Bencher) {
|
||||
let v = String::from("abwer");
|
||||
bench.iter(|| {
|
||||
let mut h = DefaultHasher::new();
|
||||
h.write(v.as_bytes());
|
||||
h.finish()
|
||||
fn bench_murmurhash_2(b: &mut Bencher) {
|
||||
let keys: Vec<&'static str> =
|
||||
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
|
||||
b.iter(|| {
|
||||
keys.iter()
|
||||
.map(|&s| s.as_bytes())
|
||||
.map(murmurhash2::murmurhash2)
|
||||
.map(|h| h as u64)
|
||||
.last()
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -1,13 +1,29 @@
|
||||
use std::cell::UnsafeCell;
|
||||
use std::mem;
|
||||
use common::allocate_vec;
|
||||
use std::ptr;
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
|
||||
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
|
||||
///
|
||||
/// The slice will encode the length of the `&[u8]` slice
|
||||
/// on 16-bits, and then the data is encoded.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct BytesRef {
|
||||
pub start: u32,
|
||||
pub stop: u32,
|
||||
pub struct BytesRef(u32);
|
||||
|
||||
impl BytesRef {
|
||||
pub fn is_null(&self) -> bool {
|
||||
self.0 == u32::max_value()
|
||||
}
|
||||
|
||||
pub fn addr(&self) -> u32 {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BytesRef {
|
||||
fn default() -> BytesRef {
|
||||
BytesRef(u32::max_value())
|
||||
}
|
||||
}
|
||||
|
||||
/// Object that can be allocated in tantivy's custom `Heap`.
|
||||
@@ -20,20 +36,19 @@ pub struct Heap {
|
||||
inner: UnsafeCell<InnerHeap>,
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(mut_from_ref))]
|
||||
impl Heap {
|
||||
/// Creates a new heap with a given capacity
|
||||
pub fn with_capacity(num_bytes: usize) -> Heap {
|
||||
Heap {
|
||||
inner: UnsafeCell::new(
|
||||
InnerHeap::with_capacity(num_bytes)
|
||||
),
|
||||
inner: UnsafeCell::new(InnerHeap::with_capacity(num_bytes)),
|
||||
}
|
||||
}
|
||||
|
||||
fn inner(&self,) -> &mut InnerHeap {
|
||||
unsafe { &mut *self.inner.get() }
|
||||
fn inner(&self) -> &mut InnerHeap {
|
||||
unsafe { &mut *self.inner.get() }
|
||||
}
|
||||
|
||||
|
||||
/// Clears the heap. All the underlying data is lost.
|
||||
///
|
||||
/// This heap does not support deallocation.
|
||||
@@ -41,19 +56,9 @@ impl Heap {
|
||||
pub fn clear(&self) {
|
||||
self.inner().clear();
|
||||
}
|
||||
|
||||
/// Return the heap capacity.
|
||||
pub fn capacity(&self,) -> u32 {
|
||||
self.inner().capacity()
|
||||
}
|
||||
|
||||
/// Return the amount of memory that has been allocated so far.
|
||||
pub fn len(&self,) -> u32 {
|
||||
self.inner().len()
|
||||
}
|
||||
|
||||
|
||||
/// Return amount of free space, in bytes.
|
||||
pub fn num_free_bytes(&self,) -> u32 {
|
||||
pub fn num_free_bytes(&self) -> u32 {
|
||||
self.inner().num_free_bytes()
|
||||
}
|
||||
|
||||
@@ -62,42 +67,43 @@ impl Heap {
|
||||
pub fn allocate_space(&self, num_bytes: usize) -> u32 {
|
||||
self.inner().allocate_space(num_bytes)
|
||||
}
|
||||
|
||||
|
||||
/// Allocate an object in the heap
|
||||
pub fn allocate_object<V: HeapAllocable>(&self,) -> (u32, &mut V) {
|
||||
pub fn allocate_object<V: HeapAllocable>(&self) -> (u32, &mut V) {
|
||||
let addr = self.inner().allocate_space(mem::size_of::<V>());
|
||||
let v: V = V::with_addr(addr);
|
||||
self.inner().set(addr, &v);
|
||||
(addr, self.inner().get_mut_ref(addr))
|
||||
}
|
||||
|
||||
|
||||
/// Stores a `&[u8]` in the heap and returns the destination BytesRef.
|
||||
pub fn allocate_and_set(&self, data: &[u8]) -> BytesRef {
|
||||
self.inner().allocate_and_set(data)
|
||||
}
|
||||
|
||||
|
||||
/// Fetches the `&[u8]` stored on the slice defined by the `BytesRef`
|
||||
/// given as argumetn
|
||||
pub fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
self.inner().get_slice(bytes_ref.start, bytes_ref.stop)
|
||||
self.inner().get_slice(bytes_ref)
|
||||
}
|
||||
|
||||
|
||||
/// Stores an item's data in the heap, at the given `address`.
|
||||
pub fn set<Item>(&self, addr: u32, val: &Item) {
|
||||
self.inner().set(addr, val);
|
||||
}
|
||||
|
||||
|
||||
/// Returns a mutable reference for an object at a given Item.
|
||||
pub fn get_mut_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.inner().get_mut_ref(addr)
|
||||
}
|
||||
|
||||
pub fn get_ref<Item>(&self, addr: u32) -> &Item {
|
||||
self.inner().get_mut_ref(addr)
|
||||
/// Returns a mutable reference to an `Item` at a given `addr`.
|
||||
#[cfg(test)]
|
||||
pub fn get_ref<Item>(&self, addr: u32) -> &mut Item {
|
||||
self.get_mut_ref(addr)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
struct InnerHeap {
|
||||
buffer: Vec<u8>,
|
||||
buffer_len: u32,
|
||||
@@ -105,13 +111,11 @@ struct InnerHeap {
|
||||
next_heap: Option<Box<InnerHeap>>,
|
||||
}
|
||||
|
||||
|
||||
impl InnerHeap {
|
||||
|
||||
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
|
||||
let buffer: Vec<u8> = allocate_vec(num_bytes);
|
||||
let buffer: Vec<u8> = vec![0u8; num_bytes];
|
||||
InnerHeap {
|
||||
buffer: buffer,
|
||||
buffer,
|
||||
buffer_len: num_bytes as u32,
|
||||
next_heap: None,
|
||||
used: 0u32,
|
||||
@@ -123,23 +127,14 @@ impl InnerHeap {
|
||||
self.next_heap = None;
|
||||
}
|
||||
|
||||
pub fn capacity(&self,) -> u32 {
|
||||
self.buffer.len() as u32
|
||||
}
|
||||
|
||||
pub fn len(&self,) -> u32 {
|
||||
self.used
|
||||
}
|
||||
|
||||
// Returns the number of free bytes. If the buffer
|
||||
// has reached it's capacity and overflowed to another buffer, return 0.
|
||||
pub fn num_free_bytes(&self,) -> u32 {
|
||||
pub fn num_free_bytes(&self) -> u32 {
|
||||
if self.next_heap.is_some() {
|
||||
0u32
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
self.buffer_len - self.used
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
|
||||
@@ -147,74 +142,85 @@ impl InnerHeap {
|
||||
self.used += num_bytes as u32;
|
||||
if self.used <= self.buffer_len {
|
||||
addr
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
if self.next_heap.is_none() {
|
||||
warn!("Exceeded heap size. The margin was apparently unsufficient. The segment will be committed right after indexing this very last document.");
|
||||
info!(
|
||||
r#"Exceeded heap size. The segment will be committed right
|
||||
after indexing this document."#,
|
||||
);
|
||||
self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
|
||||
}
|
||||
self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
|
||||
|
||||
fn get_slice(&self, bytes_ref: BytesRef) -> &[u8] {
|
||||
let start = bytes_ref.0;
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap.as_ref().unwrap().get_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
&self.buffer[start as usize..stop as usize]
|
||||
self.next_heap
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.get_slice(BytesRef(start - self.buffer_len))
|
||||
} else {
|
||||
let start = start as usize;
|
||||
let len = NativeEndian::read_u16(&self.buffer[start..start + 2]) as usize;
|
||||
&self.buffer[start + 2..start + 2 + len]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
|
||||
if start >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
|
||||
} else {
|
||||
&mut self.buffer[start as usize..stop as usize]
|
||||
}
|
||||
}
|
||||
|
||||
fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
|
||||
let start = self.allocate_space(data.len());
|
||||
let stop = start + data.len() as u32;
|
||||
self.get_mut_slice(start, stop).clone_from_slice(data);
|
||||
BytesRef {
|
||||
start: start as u32,
|
||||
stop: stop as u32,
|
||||
}
|
||||
assert!(data.len() < u16::max_value() as usize);
|
||||
let total_len = 2 + data.len();
|
||||
let start = self.allocate_space(total_len);
|
||||
let total_buff = self.get_mut_slice(start, start + total_len as u32);
|
||||
NativeEndian::write_u16(&mut total_buff[0..2], data.len() as u16);
|
||||
total_buff[2..].clone_from_slice(data);
|
||||
BytesRef(start)
|
||||
}
|
||||
|
||||
fn get_mut(&mut self, addr: u32) -> *mut u8 {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().get_mut(addr - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut(addr - self.buffer_len)
|
||||
} else {
|
||||
let addr_isize = addr as isize;
|
||||
unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
fn get_mut_ref<Item>(&mut self, addr: u32) -> &mut Item {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len)
|
||||
}
|
||||
else {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.get_mut_ref(addr - self.buffer_len)
|
||||
} else {
|
||||
let v_ptr_u8 = self.get_mut(addr) as *mut u8;
|
||||
let v_ptr = v_ptr_u8 as *mut Item;
|
||||
unsafe { &mut *v_ptr }
|
||||
}
|
||||
}
|
||||
|
||||
fn set<Item>(&mut self, addr: u32, val: &Item) {
|
||||
pub fn set<Item>(&mut self, addr: u32, val: &Item) {
|
||||
if addr >= self.buffer_len {
|
||||
self.next_heap.as_mut().unwrap().set(addr - self.buffer_len, val);
|
||||
}
|
||||
else {
|
||||
self.next_heap
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.set(addr - self.buffer_len, val);
|
||||
} else {
|
||||
let v_ptr: *const Item = val as *const Item;
|
||||
let v_ptr_u8: *const u8 = v_ptr as *const u8;
|
||||
debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
|
||||
@@ -224,4 +230,4 @@ impl InnerHeap {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,46 +1,43 @@
|
||||
mod hashmap;
|
||||
pub(crate) mod hashmap;
|
||||
mod heap;
|
||||
mod expull;
|
||||
|
||||
pub use self::heap::{Heap, HeapAllocable};
|
||||
pub use self::expull::ExpUnrolledLinkedList;
|
||||
pub use self::hashmap::{HashMap, Entry};
|
||||
|
||||
|
||||
|
||||
pub use self::hashmap::TermHashMap;
|
||||
|
||||
#[test]
|
||||
fn test_unrolled_linked_list() {
|
||||
use std::collections;
|
||||
let heap = Heap::with_capacity(30_000_000);
|
||||
{
|
||||
heap.clear();
|
||||
let mut ks: Vec<usize> = (1..5).map(|k| k * 100).collect();
|
||||
ks.push(2);
|
||||
ks.push(3);
|
||||
for k in (1..5).map(|k| k * 100) {
|
||||
let mut hashmap: HashMap<ExpUnrolledLinkedList> = HashMap::new(10, &heap);
|
||||
for k in (1..5).map(|k| k * 100) {
|
||||
let mut hashmap: TermHashMap = TermHashMap::new(10, &heap);
|
||||
for j in 0..k {
|
||||
for i in 0..500 {
|
||||
let mut list = hashmap.get_or_create(i.to_string());
|
||||
list.push(i*j, &heap);
|
||||
let v: &mut ExpUnrolledLinkedList = hashmap.get_or_create(i.to_string()).1;
|
||||
v.push(i * j, &heap);
|
||||
}
|
||||
}
|
||||
let mut map_addr: collections::HashMap<Vec<u8>, u32> = collections::HashMap::new();
|
||||
for (key, addr, _) in hashmap.iter() {
|
||||
map_addr.insert(Vec::from(key), addr);
|
||||
}
|
||||
|
||||
for i in 0..500 {
|
||||
match hashmap.lookup(i.to_string()) {
|
||||
Entry::Occupied(addr) => {
|
||||
let v: &mut ExpUnrolledLinkedList = heap.get_mut_ref(addr);
|
||||
let mut it = v.iter(addr, &heap);
|
||||
for j in 0..k {
|
||||
assert_eq!(it.next().unwrap(), i*j);
|
||||
}
|
||||
assert!(!it.next().is_some());
|
||||
}
|
||||
_ => {
|
||||
panic!("should never happen");
|
||||
}
|
||||
let key: String = i.to_string();
|
||||
let addr: u32 = *map_addr.get(key.as_bytes()).unwrap();
|
||||
let exp_pull: &ExpUnrolledLinkedList = heap.get_ref(addr);
|
||||
let mut it = exp_pull.iter(addr, &heap);
|
||||
for j in 0..k {
|
||||
assert_eq!(it.next().unwrap(), i * j);
|
||||
}
|
||||
assert!(!it.next().is_some());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,38 +1,37 @@
|
||||
use std::marker::Send;
|
||||
use std::fmt;
|
||||
use std::path::Path;
|
||||
use directory::error::{OpenReadError, DeleteError, OpenWriteError};
|
||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use std::result;
|
||||
use std::io;
|
||||
use std::marker::Sync;
|
||||
|
||||
/// Write-once read many (WORM) abstraction for where
|
||||
/// tantivy's data should be stored.
|
||||
/// tantivy's data should be stored.
|
||||
///
|
||||
/// There are currently two implementations of `Directory`
|
||||
///
|
||||
///
|
||||
/// - The [`MMapDirectory`](struct.MmapDirectory.html), this
|
||||
/// should be your default choice.
|
||||
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
|
||||
/// should be your default choice.
|
||||
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
|
||||
/// should be used mostly for tests.
|
||||
///
|
||||
///
|
||||
pub trait Directory: fmt::Debug + Send + Sync + 'static {
|
||||
|
||||
/// Opens a virtual file for read.
|
||||
///
|
||||
///
|
||||
/// Once a virtual file is open, its data may not
|
||||
/// change.
|
||||
///
|
||||
/// Specifically, subsequent writes or flushes should
|
||||
/// have no effect on the returned `ReadOnlySource` object.
|
||||
/// have no effect on the returned `ReadOnlySource` object.
|
||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError>;
|
||||
|
||||
/// Removes a file
|
||||
///
|
||||
/// Removing a file will not affect an eventual
|
||||
/// existing ReadOnlySource pointing to it.
|
||||
///
|
||||
///
|
||||
/// Removing a nonexistent file, yields a
|
||||
/// `DeleteError::DoesNotExist`.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError>;
|
||||
@@ -40,18 +39,18 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
|
||||
/// Returns true iff the file exists
|
||||
fn exists(&self, path: &Path) -> bool;
|
||||
|
||||
/// Opens a writer for the *virtual file* associated with
|
||||
/// Opens a writer for the *virtual file* associated with
|
||||
/// a Path.
|
||||
///
|
||||
/// Right after this call, the file should be created
|
||||
/// and any subsequent call to `open_read` for the
|
||||
/// and any subsequent call to `open_read` for the
|
||||
/// same path should return a `ReadOnlySource`.
|
||||
///
|
||||
///
|
||||
/// Write operations may be aggressively buffered.
|
||||
/// The client of this trait is responsible for calling flush
|
||||
/// to ensure that subsequent `read` operations
|
||||
/// to ensure that subsequent `read` operations
|
||||
/// will take into account preceding `write` operations.
|
||||
///
|
||||
///
|
||||
/// Flush operation should also be persistent.
|
||||
///
|
||||
/// The user shall not rely on `Drop` triggering `flush`.
|
||||
@@ -60,7 +59,7 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
|
||||
///
|
||||
/// The file may not previously exist.
|
||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError>;
|
||||
|
||||
|
||||
/// Reads the full content file that has been written using
|
||||
/// atomic_write.
|
||||
///
|
||||
@@ -68,17 +67,13 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
|
||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError>;
|
||||
|
||||
/// Atomically replace the content of a file with data.
|
||||
///
|
||||
///
|
||||
/// This calls ensure that reads can never *observe*
|
||||
/// a partially written file.
|
||||
///
|
||||
///
|
||||
/// The file may or may not previously exist.
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
|
||||
|
||||
/// Clones the directory and boxes the clone
|
||||
|
||||
/// Clones the directory and boxes the clone
|
||||
fn box_clone(&self) -> Box<Directory>;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,52 +1,214 @@
|
||||
use std::error::Error as StdError;
|
||||
use std::path::PathBuf;
|
||||
use std::io;
|
||||
use std::fmt;
|
||||
|
||||
/// General IO error with an optional path to the offending file.
|
||||
#[derive(Debug)]
|
||||
pub struct IOError {
|
||||
path: Option<PathBuf>,
|
||||
err: io::Error,
|
||||
}
|
||||
|
||||
impl fmt::Display for IOError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self.path {
|
||||
Some(ref path) => write!(f, "io error occurred on path '{:?}': '{}'", path, self.err),
|
||||
None => write!(f, "io error occurred: '{}'", self.err),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StdError for IOError {
|
||||
fn description(&self) -> &str {
|
||||
"io error occurred"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
Some(&self.err)
|
||||
}
|
||||
}
|
||||
|
||||
impl IOError {
|
||||
pub(crate) fn with_path(path: PathBuf, err: io::Error) -> Self {
|
||||
IOError {
|
||||
path: Some(path),
|
||||
err,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<io::Error> for IOError {
|
||||
fn from(err: io::Error) -> IOError {
|
||||
IOError { path: None, err }
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that may occur when opening a directory
|
||||
#[derive(Debug)]
|
||||
pub enum OpenDirectoryError {
|
||||
/// The underlying directory does not exists.
|
||||
/// The underlying directory does not exists.
|
||||
DoesNotExist(PathBuf),
|
||||
/// The path exists but is not a directory.
|
||||
NotADirectory(PathBuf),
|
||||
}
|
||||
|
||||
impl fmt::Display for OpenDirectoryError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match *self {
|
||||
OpenDirectoryError::DoesNotExist(ref path) => {
|
||||
write!(f, "the underlying directory '{:?}' does not exist", path)
|
||||
}
|
||||
OpenDirectoryError::NotADirectory(ref path) => {
|
||||
write!(f, "the path '{:?}' exists but is not a directory", path)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StdError for OpenDirectoryError {
|
||||
fn description(&self) -> &str {
|
||||
"error occurred while opening a directory"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that may occur when starting to write in a file
|
||||
#[derive(Debug)]
|
||||
pub enum OpenWriteError {
|
||||
/// Our directory is WORM, writing an existing file is forbidden.
|
||||
/// Checkout the `Directory` documentation.
|
||||
/// Checkout the `Directory` documentation.
|
||||
FileAlreadyExists(PathBuf),
|
||||
/// Any kind of IO error that happens when
|
||||
/// Any kind of IO error that happens when
|
||||
/// writing in the underlying IO device.
|
||||
IOError(io::Error),
|
||||
IOError(IOError),
|
||||
}
|
||||
|
||||
impl From<io::Error> for OpenWriteError {
|
||||
fn from(err: io::Error) -> OpenWriteError {
|
||||
impl From<IOError> for OpenWriteError {
|
||||
fn from(err: IOError) -> OpenWriteError {
|
||||
OpenWriteError::IOError(err)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for OpenWriteError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match *self {
|
||||
OpenWriteError::FileAlreadyExists(ref path) => {
|
||||
write!(f, "the file '{:?}' already exists", path)
|
||||
}
|
||||
OpenWriteError::IOError(ref err) => write!(
|
||||
f,
|
||||
"an io error occurred while opening a file for writing: '{}'",
|
||||
err
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StdError for OpenWriteError {
|
||||
fn description(&self) -> &str {
|
||||
"error occurred while opening a file for writing"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
match *self {
|
||||
OpenWriteError::FileAlreadyExists(_) => None,
|
||||
OpenWriteError::IOError(ref err) => Some(err),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that may occur when accessing a file read
|
||||
#[derive(Debug)]
|
||||
pub enum OpenReadError {
|
||||
/// The file does not exists.
|
||||
FileDoesNotExist(PathBuf),
|
||||
/// Any kind of IO error that happens when
|
||||
/// Any kind of IO error that happens when
|
||||
/// interacting with the underlying IO device.
|
||||
IOError(io::Error),
|
||||
IOError(IOError),
|
||||
}
|
||||
|
||||
impl From<IOError> for OpenReadError {
|
||||
fn from(err: IOError) -> OpenReadError {
|
||||
OpenReadError::IOError(err)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for OpenReadError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match *self {
|
||||
OpenReadError::FileDoesNotExist(ref path) => {
|
||||
write!(f, "the file '{:?}' does not exist", path)
|
||||
}
|
||||
OpenReadError::IOError(ref err) => write!(
|
||||
f,
|
||||
"an io error occurred while opening a file for reading: '{}'",
|
||||
err
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StdError for OpenReadError {
|
||||
fn description(&self) -> &str {
|
||||
"error occurred while opening a file for reading"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
match *self {
|
||||
OpenReadError::FileDoesNotExist(_) => None,
|
||||
OpenReadError::IOError(ref err) => Some(err),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that may occur when trying to delete a file
|
||||
#[derive(Debug)]
|
||||
pub enum DeleteError {
|
||||
/// The file does not exists.
|
||||
FileDoesNotExist(PathBuf),
|
||||
/// Any kind of IO error that happens when
|
||||
/// Any kind of IO error that happens when
|
||||
/// interacting with the underlying IO device.
|
||||
IOError(io::Error),
|
||||
/// The file may not be deleted because it is
|
||||
IOError(IOError),
|
||||
/// The file may not be deleted because it is
|
||||
/// protected.
|
||||
FileProtected(PathBuf),
|
||||
}
|
||||
|
||||
impl From<IOError> for DeleteError {
|
||||
fn from(err: IOError) -> DeleteError {
|
||||
DeleteError::IOError(err)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DeleteError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match *self {
|
||||
DeleteError::FileDoesNotExist(ref path) => {
|
||||
write!(f, "the file '{:?}' does not exist", path)
|
||||
}
|
||||
DeleteError::FileProtected(ref path) => {
|
||||
write!(f, "the file '{:?}' is protected and can't be deleted", path)
|
||||
}
|
||||
DeleteError::IOError(ref err) => {
|
||||
write!(f, "an io error occurred while deleting a file: '{}'", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl StdError for DeleteError {
|
||||
fn description(&self) -> &str {
|
||||
"error occurred while deleting a file"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
match *self {
|
||||
DeleteError::FileDoesNotExist(_) | DeleteError::FileProtected(_) => None,
|
||||
DeleteError::IOError(ref err) => Some(err),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,23 +1,23 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
use directory::error::{OpenReadError, DeleteError, OpenWriteError};
|
||||
use serde_json;
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use std::result;
|
||||
use std::io;
|
||||
use Directory;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::collections::HashSet;
|
||||
use std::sync::RwLockWriteGuard;
|
||||
use std::io::Write;
|
||||
use rustc_serialize::json;
|
||||
use core::MANAGED_FILEPATH;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use Result;
|
||||
use Error;
|
||||
use error::{ErrorKind, Result, ResultExt};
|
||||
|
||||
/// Wrapper of directories that keeps track of files created by Tantivy.
|
||||
///
|
||||
/// A managed directory is just a wrapper of a directory
|
||||
/// that keeps a (persisted) list of the files that
|
||||
/// that keeps a (persisted) list of the files that
|
||||
/// have been created (and not deleted) by tantivy so far.
|
||||
///
|
||||
/// Thanks to this list, it implements a `garbage_collect` method
|
||||
@@ -35,7 +35,6 @@ struct MetaInformation {
|
||||
protected_files: HashMap<PathBuf, usize>,
|
||||
}
|
||||
|
||||
|
||||
/// A `FileProtection` prevents the garbage collection of a file.
|
||||
///
|
||||
/// See `ManagedDirectory.protect_file_from_delete`.
|
||||
@@ -45,19 +44,18 @@ pub struct FileProtection {
|
||||
}
|
||||
|
||||
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
|
||||
let mut meta_informations_wlock = directory.meta_informations
|
||||
let mut meta_informations_wlock = directory
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
if let Some(counter_ref_mut) = meta_informations_wlock
|
||||
.protected_files
|
||||
.get_mut(path) {
|
||||
if let Some(counter_ref_mut) = meta_informations_wlock.protected_files.get_mut(path) {
|
||||
(*counter_ref_mut) -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for FileProtection {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
|
||||
write!(formatter, "FileProtectionFor({:?})", self.path)
|
||||
write!(formatter, "FileProtectionFor({:?})", self.path)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -67,33 +65,39 @@ impl Drop for FileProtection {
|
||||
}
|
||||
}
|
||||
|
||||
impl ManagedDirectory {
|
||||
/// Saves the file containing the list of existing files
|
||||
/// that were created by tantivy.
|
||||
fn save_managed_paths(
|
||||
directory: &mut Directory,
|
||||
wlock: &RwLockWriteGuard<MetaInformation>,
|
||||
) -> io::Result<()> {
|
||||
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
|
||||
write!(&mut w, "\n")?;
|
||||
directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl ManagedDirectory {
|
||||
/// Wraps a directory as managed directory.
|
||||
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
|
||||
match directory.atomic_read(&MANAGED_FILEPATH) {
|
||||
Ok(data) => {
|
||||
let managed_files_json = String::from_utf8_lossy(&data);
|
||||
let managed_files: HashSet<PathBuf> = json::decode(&managed_files_json)
|
||||
.map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?;
|
||||
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
|
||||
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
meta_informations: Arc::new(RwLock::new(
|
||||
MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files: HashMap::default()
|
||||
})),
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
managed_paths: managed_files,
|
||||
protected_files: HashMap::default(),
|
||||
})),
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::FileDoesNotExist(_)) => {
|
||||
Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
meta_informations: Arc::default(),
|
||||
})
|
||||
}
|
||||
Err(OpenReadError::IOError(e)) => {
|
||||
Err(From::from(e))
|
||||
}
|
||||
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
|
||||
directory: box directory,
|
||||
meta_informations: Arc::default(),
|
||||
}),
|
||||
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,27 +105,40 @@ impl ManagedDirectory {
|
||||
///
|
||||
/// Removes the files that were created by `tantivy` and are not
|
||||
/// used by any segment anymore.
|
||||
///
|
||||
///
|
||||
/// * `living_files` - List of files that are still used by the index.
|
||||
///
|
||||
/// This method does not panick nor returns errors.
|
||||
/// If a file cannot be deleted (for permission reasons for instance)
|
||||
/// an error is simply logged, and the file remains in the list of managed
|
||||
/// files.
|
||||
pub fn garbage_collect(&mut self, living_files: HashSet<PathBuf>) {
|
||||
let mut files_to_delete = vec!();
|
||||
{ // releasing the lock as .delete() will use it too.
|
||||
pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
|
||||
info!("Garbage collect");
|
||||
let mut files_to_delete = vec![];
|
||||
{
|
||||
// releasing the lock as .delete() will use it too.
|
||||
let meta_informations_rlock = self.meta_informations
|
||||
.read()
|
||||
.expect("Managed directory rlock poisoned in garbage collect.");
|
||||
|
||||
// It is crucial to get the living files after acquiring the
|
||||
// read lock of meta informations. That way, we
|
||||
// avoid the following scenario.
|
||||
//
|
||||
// 1) we get the list of living files.
|
||||
// 2) someone creates a new file.
|
||||
// 3) we start garbage collection and remove this file
|
||||
// even though it is a living file.
|
||||
let living_files = get_living_files();
|
||||
|
||||
for managed_path in &meta_informations_rlock.managed_paths {
|
||||
if !living_files.contains(managed_path) {
|
||||
files_to_delete.push(managed_path.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut deleted_files = vec!();
|
||||
|
||||
let mut deleted_files = vec![];
|
||||
{
|
||||
for file_to_delete in files_to_delete {
|
||||
match self.delete(&file_to_delete) {
|
||||
@@ -130,13 +147,14 @@ impl ManagedDirectory {
|
||||
deleted_files.push(file_to_delete);
|
||||
}
|
||||
Err(file_error) => {
|
||||
error!("Failed to delete {:?}", file_to_delete);
|
||||
match file_error {
|
||||
DeleteError::FileDoesNotExist(_) => {
|
||||
deleted_files.push(file_to_delete);
|
||||
}
|
||||
DeleteError::IOError(_) => {
|
||||
if !cfg!(target_os = "windows") {
|
||||
// On windows, delete is expected to fail if the file
|
||||
// is mmapped.
|
||||
error!("Failed to delete {:?}", file_to_delete);
|
||||
}
|
||||
}
|
||||
@@ -144,38 +162,34 @@ impl ManagedDirectory {
|
||||
// this is expected.
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if !deleted_files.is_empty() {
|
||||
// update the list of managed files by removing
|
||||
// update the list of managed files by removing
|
||||
// the file that were removed.
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed directory wlock poisoned (2).");
|
||||
{
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed directory wlock poisoned (2).");
|
||||
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
|
||||
for delete_file in &deleted_files {
|
||||
managed_paths_write.remove(delete_file);
|
||||
}
|
||||
}
|
||||
if let Err(_) = self.save_managed_paths() {
|
||||
if save_managed_paths(self.directory.as_mut(), &meta_informations_wlock).is_err() {
|
||||
error!("Failed to save the list of managed files.");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Protects a file from being garbage collected.
|
||||
///
|
||||
/// The method returns a `FileProtection` object.
|
||||
/// The file will not be garbage collected as long as the
|
||||
/// `FileProtection` object is kept alive.
|
||||
/// `FileProtection` object is kept alive.
|
||||
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
|
||||
let pathbuf = path.to_owned();
|
||||
{
|
||||
@@ -193,52 +207,33 @@ impl ManagedDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Saves the file containing the list of existing files
|
||||
/// that were created by tantivy.
|
||||
fn save_managed_paths(&mut self,) -> io::Result<()> {
|
||||
let managed_paths;
|
||||
{
|
||||
let meta_informations_rlock = self.meta_informations
|
||||
.read()
|
||||
.expect("Managed file lock poisoned");
|
||||
managed_paths = meta_informations_rlock.managed_paths.clone();
|
||||
}
|
||||
let mut w = vec!();
|
||||
try!(write!(&mut w, "{}\n", json::as_pretty_json(&managed_paths)));
|
||||
self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Registers a file as managed
|
||||
///
|
||||
/// This method must be called before the file is
|
||||
///
|
||||
/// This method must be called before the file is
|
||||
/// actually created to ensure that a failure between
|
||||
/// registering the filepath and creating the file
|
||||
/// will not lead to garbage files that will
|
||||
/// will not lead to garbage files that will
|
||||
/// never get removed.
|
||||
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
|
||||
let has_changed = {
|
||||
let mut meta_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
meta_wlock.managed_paths.insert(filepath.to_owned())
|
||||
};
|
||||
let mut meta_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
|
||||
if has_changed {
|
||||
self.save_managed_paths()?;
|
||||
save_managed_paths(self.directory.as_mut(), &meta_wlock)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Directory for ManagedDirectory {
|
||||
|
||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
self.directory.open_read(path)
|
||||
}
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> result::Result<WritePtr, OpenWriteError> {
|
||||
self.register_file_as_managed(path)?;
|
||||
self.register_file_as_managed(path)
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
self.directory.open_write(path)
|
||||
}
|
||||
|
||||
@@ -258,7 +253,7 @@ impl Directory for ManagedDirectory {
|
||||
.expect("poisoned lock in managed directory meta");
|
||||
if let Some(counter) = metas_rlock.protected_files.get(path) {
|
||||
if *counter > 0 {
|
||||
return Err(DeleteError::FileProtected(path.to_owned()))
|
||||
return Err(DeleteError::FileProtected(path.to_owned()));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -268,34 +263,30 @@ impl Directory for ManagedDirectory {
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
self.directory.exists(path)
|
||||
}
|
||||
|
||||
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
box self.clone()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl Clone for ManagedDirectory {
|
||||
fn clone(&self) -> ManagedDirectory {
|
||||
ManagedDirectory {
|
||||
directory: self.directory.box_clone(),
|
||||
meta_informations: self.meta_informations.clone(),
|
||||
meta_informations: Arc::clone(&self.meta_informations),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use directory::MmapDirectory;
|
||||
use std::path::Path;
|
||||
use std::path::Path;
|
||||
use std::io::Write;
|
||||
use tempdir::TempDir;
|
||||
|
||||
|
||||
lazy_static! {
|
||||
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
|
||||
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
|
||||
@@ -313,18 +304,18 @@ mod tests {
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
{
|
||||
managed_directory.atomic_write(*TEST_PATH2, &vec!(0u8,1u8)).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = [TEST_PATH1.to_owned()]
|
||||
.into_iter()
|
||||
.cloned()
|
||||
.collect();
|
||||
managed_directory.garbage_collect(living_files);
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
@@ -340,13 +331,13 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(living_files);
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -357,11 +348,13 @@ mod tests {
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
|
||||
managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
managed_directory.garbage_collect(living_files.clone());
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, gc should try and fail the file as it is mmapped.
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
@@ -369,16 +362,13 @@ mod tests {
|
||||
drop(_mmap_read);
|
||||
// The file should still be in the list of managed file and
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(living_files);
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
} else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_managed_directory_protect() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
@@ -387,19 +377,19 @@ mod tests {
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
|
||||
managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
{
|
||||
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
|
||||
managed_directory.garbage_collect(living_files.clone());
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
|
||||
managed_directory.garbage_collect(living_files.clone());
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
use atomicwrites;
|
||||
use common::make_io_err;
|
||||
use directory::Directory;
|
||||
use directory::error::{OpenWriteError, OpenReadError, DeleteError, OpenDirectoryError};
|
||||
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use directory::ReadOnlySource;
|
||||
use directory::shared_vec_slice::SharedVecSlice;
|
||||
use directory::WritePtr;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use memmap::{Mmap, Protection};
|
||||
use std::collections::hash_map::Entry as HashMapEntry;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::From;
|
||||
@@ -15,57 +14,47 @@ use std::fs::{self, File};
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::io::{BufWriter, Read, Write};
|
||||
use std::mem;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::Weak;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
|
||||
let convert_file_error = |err: io::Error| {
|
||||
if err.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.clone())
|
||||
/// Returns None iff the file exists, can be read, but is empty (and hence
|
||||
/// cannot be mmapped).
|
||||
///
|
||||
fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
|
||||
let file = File::open(full_path).map_err(|e| {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.to_owned())
|
||||
} else {
|
||||
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
|
||||
}
|
||||
else {
|
||||
OpenReadError::IOError(err)
|
||||
}
|
||||
};
|
||||
let file = File::open(&full_path).map_err(convert_file_error)?;
|
||||
let meta_data = file
|
||||
.metadata()
|
||||
.map_err(|e| OpenReadError::IOError(e))?;
|
||||
})?;
|
||||
|
||||
let meta_data = file.metadata()
|
||||
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
|
||||
if meta_data.len() == 0 {
|
||||
// if the file size is 0, it will not be possible
|
||||
// to mmap the file, so we return an anonymous mmap_cache
|
||||
// if the file size is 0, it will not be possible
|
||||
// to mmap the file, so we return None
|
||||
// instead.
|
||||
return Ok(None)
|
||||
return Ok(None);
|
||||
}
|
||||
match Mmap::open(&file, Protection::Read) {
|
||||
Ok(mmap) => {
|
||||
Ok(Some(Arc::new(mmap)))
|
||||
}
|
||||
Err(e) => {
|
||||
Err(OpenReadError::IOError(e))
|
||||
}
|
||||
}
|
||||
|
||||
MmapReadOnly::open(&file)
|
||||
.map(Some)
|
||||
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
||||
}
|
||||
|
||||
#[derive(Default,Clone,Debug,RustcDecodable,RustcEncodable)]
|
||||
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct CacheCounters {
|
||||
// Number of time the cache prevents to call `mmap`
|
||||
pub hit: usize,
|
||||
// Number of time tantivy had to call `mmap`
|
||||
// as no entry was in the cache.
|
||||
pub miss_empty: usize,
|
||||
// Number of time tantivy had to call `mmap`
|
||||
// as the entry in the cache was evinced.
|
||||
pub miss_weak: usize,
|
||||
pub miss: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct CacheInfo {
|
||||
pub counters: CacheCounters,
|
||||
pub mmapped: Vec<PathBuf>,
|
||||
@@ -73,80 +62,45 @@ pub struct CacheInfo {
|
||||
|
||||
struct MmapCache {
|
||||
counters: CacheCounters,
|
||||
cache: HashMap<PathBuf, Weak<Mmap>>,
|
||||
purge_weak_limit: usize,
|
||||
cache: HashMap<PathBuf, MmapReadOnly>,
|
||||
}
|
||||
|
||||
const STARTING_PURGE_WEAK_LIMIT: usize = 1_000;
|
||||
|
||||
impl Default for MmapCache {
|
||||
fn default() -> MmapCache {
|
||||
MmapCache {
|
||||
counters: CacheCounters::default(),
|
||||
cache: HashMap::new(),
|
||||
purge_weak_limit: STARTING_PURGE_WEAK_LIMIT,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl MmapCache {
|
||||
|
||||
fn cleanup(&mut self) {
|
||||
let previous_cache_size = self.cache.len();
|
||||
let mut new_cache = HashMap::new();
|
||||
mem::swap(&mut new_cache, &mut self.cache);
|
||||
self.cache = new_cache
|
||||
.into_iter()
|
||||
.filter(|&(_, ref weak_ref)| weak_ref.upgrade().is_some())
|
||||
.collect();
|
||||
if self.cache.len() == previous_cache_size {
|
||||
self.purge_weak_limit *= 2;
|
||||
}
|
||||
/// Removes a `MmapReadOnly` entry from the mmap cache.
|
||||
fn discard_from_cache(&mut self, full_path: &Path) -> bool {
|
||||
self.cache.remove(full_path).is_some()
|
||||
}
|
||||
|
||||
fn get_info(&mut self) -> CacheInfo {
|
||||
self.cleanup();
|
||||
let paths: Vec<PathBuf> = self.cache.keys()
|
||||
.cloned()
|
||||
.collect();
|
||||
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
|
||||
CacheInfo {
|
||||
counters: self.counters.clone(),
|
||||
mmapped: paths,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mmap(&mut self, full_path: PathBuf) -> Result<Option<Arc<Mmap>>, OpenReadError> {
|
||||
// if we exceed this limit, then we go through the weak
|
||||
// and remove those that are obsolete.
|
||||
if self.cache.len() > self.purge_weak_limit {
|
||||
self.cleanup();
|
||||
}
|
||||
Ok(match self.cache.entry(full_path.clone()) {
|
||||
HashMapEntry::Occupied(mut occupied_entry) => {
|
||||
if let Some(mmap_arc) = occupied_entry.get().upgrade() {
|
||||
self.counters.hit += 1;
|
||||
Some(mmap_arc.clone())
|
||||
}
|
||||
else {
|
||||
// The entry exists but the weak ref has been destroyed.
|
||||
self.counters.miss_weak += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
occupied_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
}
|
||||
else {
|
||||
None
|
||||
}
|
||||
}
|
||||
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<MmapReadOnly>, OpenReadError> {
|
||||
Ok(match self.cache.entry(full_path.to_owned()) {
|
||||
HashMapEntry::Occupied(occupied_entry) => {
|
||||
let mmap = occupied_entry.get();
|
||||
self.counters.hit += 1;
|
||||
Some(mmap.clone())
|
||||
}
|
||||
HashMapEntry::Vacant(vacant_entry) => {
|
||||
self.counters.miss_empty += 1;
|
||||
if let Some(mmap_arc) = open_mmap(&full_path)? {
|
||||
vacant_entry.insert(Arc::downgrade(&mmap_arc));
|
||||
Some(mmap_arc)
|
||||
}
|
||||
else {
|
||||
self.counters.miss += 1;
|
||||
if let Some(mmap) = open_mmap(full_path)? {
|
||||
vacant_entry.insert(mmap.clone());
|
||||
Some(mmap)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
@@ -156,35 +110,33 @@ impl MmapCache {
|
||||
|
||||
/// Directory storing data in files, read via mmap.
|
||||
///
|
||||
/// The Mmap object are cached to limit the
|
||||
/// system calls.
|
||||
/// The Mmap object are cached to limit the
|
||||
/// system calls.
|
||||
#[derive(Clone)]
|
||||
pub struct MmapDirectory {
|
||||
root_path: PathBuf,
|
||||
mmap_cache: Arc<RwLock<MmapCache>>,
|
||||
_temp_directory: Arc<Option<TempDir>>,
|
||||
|
||||
}
|
||||
|
||||
impl fmt::Debug for MmapDirectory {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "MmapDirectory({:?})", self.root_path)
|
||||
}
|
||||
write!(f, "MmapDirectory({:?})", self.root_path)
|
||||
}
|
||||
}
|
||||
|
||||
impl MmapDirectory {
|
||||
|
||||
/// Creates a new MmapDirectory in a temporary directory.
|
||||
///
|
||||
/// This is mostly useful to test the MmapDirectory itself.
|
||||
/// For your unit tests, prefer the RAMDirectory.
|
||||
/// For your unit tests, prefer the RAMDirectory.
|
||||
pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
|
||||
let tempdir = try!(TempDir::new("index"));
|
||||
let tempdir = TempDir::new("index")?;
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let directory = MmapDirectory {
|
||||
root_path: PathBuf::from(tempdir_path),
|
||||
root_path: tempdir_path,
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(Some(tempdir))
|
||||
_temp_directory: Arc::new(Some(tempdir)),
|
||||
};
|
||||
Ok(directory)
|
||||
}
|
||||
@@ -193,18 +145,21 @@ impl MmapDirectory {
|
||||
///
|
||||
/// Returns an error if the `directory_path` does not
|
||||
/// exist or if it is not a directory.
|
||||
pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let directory_path: &Path = directory_path.as_ref();
|
||||
if !directory_path.exists() {
|
||||
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(directory_path)))
|
||||
}
|
||||
else if !directory_path.is_dir() {
|
||||
Err(OpenDirectoryError::NotADirectory(PathBuf::from(directory_path)))
|
||||
}
|
||||
else {
|
||||
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
|
||||
directory_path,
|
||||
)))
|
||||
} else if !directory_path.is_dir() {
|
||||
Err(OpenDirectoryError::NotADirectory(PathBuf::from(
|
||||
directory_path,
|
||||
)))
|
||||
} else {
|
||||
Ok(MmapDirectory {
|
||||
root_path: PathBuf::from(directory_path),
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(None)
|
||||
_temp_directory: Arc::new(None),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -232,18 +187,20 @@ impl MmapDirectory {
|
||||
use std::os::windows::fs::OpenOptionsExt;
|
||||
use winapi::winbase;
|
||||
|
||||
open_opts.write(true)
|
||||
open_opts
|
||||
.write(true)
|
||||
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
|
||||
}
|
||||
|
||||
let fd = try!(open_opts.open(&self.root_path));
|
||||
try!(fd.sync_all());
|
||||
let fd = open_opts.open(&self.root_path)?;
|
||||
fd.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns some statistical information
|
||||
/// about the Mmap cache.
|
||||
///
|
||||
/// The `MmapDirectory` embeds a `MmapDirectory`
|
||||
///
|
||||
/// The `MmapDirectory` embeds a `MmapDirectory`
|
||||
/// to avoid multiplying the `mmap` system calls.
|
||||
pub fn get_cache_info(&mut self) -> CacheInfo {
|
||||
self.mmap_cache
|
||||
@@ -251,12 +208,10 @@ impl MmapDirectory {
|
||||
.expect("Mmap cache lock is poisoned.")
|
||||
.get_info()
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/// This Write wraps a File, but has the specificity of
|
||||
/// call `sync_all` on flush.
|
||||
/// This Write wraps a File, but has the specificity of
|
||||
/// call `sync_all` on flush.
|
||||
struct SafeFileWriter(File);
|
||||
|
||||
impl SafeFileWriter {
|
||||
@@ -266,13 +221,12 @@ impl SafeFileWriter {
|
||||
}
|
||||
|
||||
impl Write for SafeFileWriter {
|
||||
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
self.0.write(buf)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
try!(self.0.flush());
|
||||
self.0.flush()?;
|
||||
self.0.sync_all()
|
||||
}
|
||||
}
|
||||
@@ -283,80 +237,83 @@ impl Seek for SafeFileWriter {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl Directory for MmapDirectory {
|
||||
|
||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
debug!("Open Read {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
let mut mmap_cache = self.mmap_cache
|
||||
.write()
|
||||
.map_err(|_| OpenReadError::IOError(
|
||||
make_io_err(format!("Failed to acquired write lock on mmap cache while reading {:?}", path))
|
||||
))?;
|
||||
|
||||
Ok(mmap_cache.get_mmap(full_path)?
|
||||
.map(MmapReadOnly::from)
|
||||
|
||||
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquired write lock \
|
||||
on mmap cache while reading {:?}",
|
||||
path
|
||||
);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
|
||||
Ok(mmap_cache
|
||||
.get_mmap(&full_path)?
|
||||
.map(ReadOnlySource::Mmap)
|
||||
.unwrap_or(ReadOnlySource::Anonymous(SharedVecSlice::empty()))
|
||||
)
|
||||
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
|
||||
}
|
||||
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
||||
debug!("Open Write {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
|
||||
let open_res = OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(full_path);
|
||||
|
||||
let mut file = try!(
|
||||
open_res.map_err(|err| {
|
||||
if err.kind() == io::ErrorKind::AlreadyExists {
|
||||
OpenWriteError::FileAlreadyExists(PathBuf::from(path))
|
||||
}
|
||||
else {
|
||||
OpenWriteError::IOError(err)
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
|
||||
let mut file = open_res.map_err(|err| {
|
||||
if err.kind() == io::ErrorKind::AlreadyExists {
|
||||
OpenWriteError::FileAlreadyExists(path.to_owned())
|
||||
} else {
|
||||
IOError::with_path(path.to_owned(), err).into()
|
||||
}
|
||||
})?;
|
||||
|
||||
// making sure the file is created.
|
||||
try!(file.flush());
|
||||
|
||||
file.flush()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
|
||||
// Apparetntly, on some filesystem syncing the parent
|
||||
// directory is required.
|
||||
try!(self.sync_directory());
|
||||
|
||||
self.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
|
||||
let writer = SafeFileWriter::new(file);
|
||||
Ok(BufWriter::new(Box::new(writer)))
|
||||
}
|
||||
|
||||
/// Any entry associated to the path in the mmap will be
|
||||
/// removed before the file is deleted.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
debug!("Deleting file {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
let mut mmap_cache = try!(self.mmap_cache
|
||||
.write()
|
||||
.map_err(|_|
|
||||
DeleteError::IOError(make_io_err(format!("Failed to acquired write lock on mmap cache while deleting {:?}", path))))
|
||||
);
|
||||
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquired write lock \
|
||||
on mmap cache while deleting {:?}",
|
||||
path
|
||||
);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
mmap_cache.discard_from_cache(path);
|
||||
|
||||
// Removing the entry in the MMap cache.
|
||||
// The munmap will appear on Drop,
|
||||
// when the last reference is gone.
|
||||
mmap_cache.cache.remove(&full_path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => {
|
||||
self.sync_directory()
|
||||
.map_err(|e| DeleteError::IOError(e))
|
||||
}
|
||||
Ok(_) => self.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Err(DeleteError::FileDoesNotExist(path.to_owned()))
|
||||
}
|
||||
else {
|
||||
Err(DeleteError::IOError(e))
|
||||
} else {
|
||||
Err(IOError::with_path(path.to_owned(), e).into())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -373,39 +330,32 @@ impl Directory for MmapDirectory {
|
||||
match File::open(&full_path) {
|
||||
Ok(mut file) => {
|
||||
file.read_to_end(&mut buffer)
|
||||
.map_err(|e| OpenReadError::IOError(e))?;
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e))?;
|
||||
Ok(buffer)
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
|
||||
}
|
||||
else {
|
||||
Err(OpenReadError::IOError(e))
|
||||
} else {
|
||||
Err(IOError::with_path(path.to_owned(), e).into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
debug!("Atomic Write {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
let meta_file = atomicwrites::AtomicFile::new(full_path, atomicwrites::AllowOverwrite);
|
||||
try!(meta_file.write(|f| {
|
||||
f.write_all(data)
|
||||
}));
|
||||
meta_file.write(|f| f.write_all(data))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn box_clone(&self,) -> Box<Directory> {
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -437,7 +387,8 @@ mod tests {
|
||||
// here we test if the cache releases
|
||||
// mmaps correctly.
|
||||
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
|
||||
let paths: Vec<PathBuf> = (0..10)
|
||||
let num_paths = 10;
|
||||
let paths: Vec<PathBuf> = (0..num_paths)
|
||||
.map(|i| PathBuf::from(&*format!("file_{}", i)))
|
||||
.collect();
|
||||
{
|
||||
@@ -448,53 +399,25 @@ mod tests {
|
||||
}
|
||||
}
|
||||
{
|
||||
for path in &paths {
|
||||
{
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 1);
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
||||
}
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss_empty, 10);
|
||||
|
||||
|
||||
{
|
||||
// test weak miss
|
||||
// the first pass create the weak refs.
|
||||
for path in &paths {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
}
|
||||
// ... the second hits the weak refs.
|
||||
for path in &paths {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
}
|
||||
let cache_info = mmap_directory.get_cache_info();
|
||||
assert_eq!(cache_info.counters.miss_empty, 20);
|
||||
assert_eq!(cache_info.counters.miss_weak, 10);
|
||||
}
|
||||
|
||||
{
|
||||
let mut saved_readmmaps = vec!();
|
||||
// Keeps reference alive
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
let r = mmap_directory.open_read(path).unwrap();
|
||||
saved_readmmaps.push(r);
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
|
||||
}
|
||||
let cache_info = mmap_directory.get_cache_info();
|
||||
println!("{:?}", cache_info);
|
||||
assert_eq!(cache_info.counters.miss_empty, 30);
|
||||
assert_eq!(cache_info.counters.miss_weak, 10);
|
||||
assert_eq!(cache_info.mmapped.len(), 10);
|
||||
|
||||
for saved_readmmap in saved_readmmaps {
|
||||
assert_eq!(saved_readmmap.as_slice(), content);
|
||||
for path in paths.iter() {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
|
||||
}
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
mmap_directory.delete(path).unwrap();
|
||||
assert_eq!(
|
||||
mmap_directory.get_cache_info().mmapped.len(),
|
||||
num_paths - i - 1
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
/*!
|
||||
|
||||
WORM directory abstraction.
|
||||
|
||||
*/
|
||||
mod mmap_directory;
|
||||
mod ram_directory;
|
||||
mod directory;
|
||||
@@ -8,14 +13,15 @@ mod managed_directory;
|
||||
/// Errors specific to the directory module.
|
||||
pub mod error;
|
||||
|
||||
use std::io::{Write, Seek};
|
||||
use std::io::{BufWriter, Seek, Write};
|
||||
|
||||
use std::io::BufWriter;
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
pub use self::directory::Directory;
|
||||
pub use self::ram_directory::RAMDirectory;
|
||||
pub use self::mmap_directory::MmapDirectory;
|
||||
pub use self::managed_directory::{ManagedDirectory, FileProtection};
|
||||
|
||||
pub(crate) use self::read_only_source::SourceRead;
|
||||
pub(crate) use self::managed_directory::{FileProtection, ManagedDirectory};
|
||||
|
||||
/// Synonym of Seek + Write
|
||||
pub trait SeekableWrite: Seek + Write {}
|
||||
@@ -31,8 +37,8 @@ pub type WritePtr = BufWriter<Box<SeekableWrite>>;
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use std::path::Path;
|
||||
use std::io::{Write, Seek, SeekFrom};
|
||||
use std::path::Path;
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
|
||||
lazy_static! {
|
||||
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
|
||||
@@ -65,7 +71,7 @@ mod tests {
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
write_file.write_all(&[4]).unwrap();
|
||||
write_file.write_all(&[3]).unwrap();
|
||||
write_file.write_all(&[7,3,5]).unwrap();
|
||||
write_file.write_all(&[7, 3, 5]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||
@@ -81,9 +87,9 @@ mod tests {
|
||||
{
|
||||
{
|
||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||
write_file.write_all(&[4, 3, 7,3,5]).unwrap();
|
||||
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
|
||||
write_file.seek(SeekFrom::Start(0)).unwrap();
|
||||
write_file.write_all(&[3,1]).unwrap();
|
||||
write_file.write_all(&[3, 1]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||
@@ -98,7 +104,6 @@ mod tests {
|
||||
{
|
||||
directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
|
||||
}
|
||||
{
|
||||
assert!(directory.open_write(*TEST_PATH).is_err());
|
||||
@@ -111,9 +116,6 @@ mod tests {
|
||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||
let _w = directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
if let Err(e) = directory.open_read(*TEST_PATH) {
|
||||
println!("{:?}", e);
|
||||
}
|
||||
assert!(directory.open_read(*TEST_PATH).is_ok());
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
}
|
||||
|
||||
@@ -1,24 +1,24 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::io::{self, BufWriter, Cursor, Write, Seek, SeekFrom};
|
||||
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use common::make_io_err;
|
||||
use directory::{Directory, ReadOnlySource};
|
||||
use directory::error::{OpenWriteError, OpenReadError, DeleteError};
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use directory::WritePtr;
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
|
||||
/// Writer associated with the `RAMDirectory`
|
||||
///
|
||||
///
|
||||
/// The Writer just writes a buffer.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// On drop, if the writer was left in a *dirty* state.
|
||||
/// That is, if flush was not called after the last call
|
||||
/// to write.
|
||||
/// to write.
|
||||
///
|
||||
struct VecWriter {
|
||||
path: PathBuf,
|
||||
@@ -32,7 +32,7 @@ impl VecWriter {
|
||||
VecWriter {
|
||||
path: path_buf,
|
||||
data: Cursor::new(Vec::new()),
|
||||
shared_directory: shared_directory,
|
||||
shared_directory,
|
||||
is_flushed: true,
|
||||
}
|
||||
}
|
||||
@@ -40,8 +40,11 @@ impl VecWriter {
|
||||
|
||||
impl Drop for VecWriter {
|
||||
fn drop(&mut self) {
|
||||
if !self.is_flushed {
|
||||
panic!("You forgot to flush {:?} before its writter got Drop. Do not rely on drop.", self.path)
|
||||
if !self.is_flushed {
|
||||
panic!(
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
|
||||
self.path
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -55,13 +58,14 @@ impl Seek for VecWriter {
|
||||
impl Write for VecWriter {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
self.is_flushed = false;
|
||||
try!(self.data.write_all(buf));
|
||||
self.data.write_all(buf)?;
|
||||
Ok(buf.len())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.is_flushed = true;
|
||||
try!(self.shared_directory.write(self.path.clone(), self.data.get_ref()));
|
||||
self.shared_directory
|
||||
.write(self.path.clone(), self.data.get_ref())?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -69,38 +73,40 @@ impl Write for VecWriter {
|
||||
#[derive(Clone)]
|
||||
struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, Arc<Vec<u8>>>>>);
|
||||
|
||||
|
||||
|
||||
impl InnerDirectory {
|
||||
|
||||
fn new() -> InnerDirectory {
|
||||
InnerDirectory(Arc::new(RwLock::new(HashMap::new())))
|
||||
}
|
||||
|
||||
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
|
||||
let mut map = try!(
|
||||
self.0
|
||||
.write()
|
||||
.map_err(|_| make_io_err(format!("Failed to lock the directory, when trying to write {:?}", path)))
|
||||
);
|
||||
let mut map = self.0.write().map_err(|_| {
|
||||
make_io_err(format!(
|
||||
"Failed to lock the directory, when trying to write {:?}",
|
||||
path
|
||||
))
|
||||
})?;
|
||||
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
|
||||
Ok(prev_value.is_some())
|
||||
}
|
||||
|
||||
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
|
||||
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
|
||||
self.0
|
||||
.read()
|
||||
.map_err(|_| {
|
||||
let io_err = make_io_err(format!("Failed to acquire read lock for the directory, when trying to read {:?}", path));
|
||||
OpenReadError::IOError(io_err)
|
||||
let msg = format!(
|
||||
"Failed to acquire read lock for the \
|
||||
directory when trying to read {:?}",
|
||||
path
|
||||
);
|
||||
let io_err = make_io_err(msg);
|
||||
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
.and_then(|readable_map| {
|
||||
readable_map
|
||||
.get(path)
|
||||
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
||||
.map(|data| {
|
||||
ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))
|
||||
})
|
||||
.get(path)
|
||||
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
||||
.map(Arc::clone)
|
||||
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data)))
|
||||
})
|
||||
}
|
||||
|
||||
@@ -108,18 +114,17 @@ impl InnerDirectory {
|
||||
self.0
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
let io_err = make_io_err(format!("Failed to acquire write lock for the directory, when trying to delete {:?}", path));
|
||||
DeleteError::IOError(io_err)
|
||||
let msg = format!(
|
||||
"Failed to acquire write lock for the \
|
||||
directory when trying to delete {:?}",
|
||||
path
|
||||
);
|
||||
let io_err = make_io_err(msg);
|
||||
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
.and_then(|mut writable_map| {
|
||||
match writable_map.remove(path) {
|
||||
Some(_) => {
|
||||
Ok(())
|
||||
},
|
||||
None => {
|
||||
Err(DeleteError::FileDoesNotExist(PathBuf::from(path)))
|
||||
}
|
||||
}
|
||||
.and_then(|mut writable_map| match writable_map.remove(path) {
|
||||
Some(_) => Ok(()),
|
||||
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -129,16 +134,14 @@ impl InnerDirectory {
|
||||
.expect("Failed to get read lock directory.")
|
||||
.contains_key(path)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl fmt::Debug for RAMDirectory {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "RAMDirectory")
|
||||
}
|
||||
write!(f, "RAMDirectory")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// A Directory storing everything in anonymous memory.
|
||||
///
|
||||
/// It is mainly meant for unit testing.
|
||||
@@ -150,11 +153,10 @@ pub struct RAMDirectory {
|
||||
}
|
||||
|
||||
impl RAMDirectory {
|
||||
|
||||
/// Constructor
|
||||
pub fn create() -> RAMDirectory {
|
||||
RAMDirectory {
|
||||
fs: InnerDirectory::new()
|
||||
fs: InnerDirectory::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -163,15 +165,19 @@ impl Directory for RAMDirectory {
|
||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
self.fs.open_read(path)
|
||||
}
|
||||
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
||||
let path_buf = PathBuf::from(path);
|
||||
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
|
||||
let exists = self.fs
|
||||
.write(path_buf.clone(), &Vec::new())
|
||||
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
|
||||
|
||||
// force the creation of the file to mimic the MMap directory.
|
||||
if try!(self.fs.write(path_buf.clone(), &Vec::new())) {
|
||||
if exists {
|
||||
Err(OpenWriteError::FileAlreadyExists(path_buf))
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
Ok(BufWriter::new(Box::new(vec_writer)))
|
||||
}
|
||||
}
|
||||
@@ -180,28 +186,25 @@ impl Directory for RAMDirectory {
|
||||
self.fs.delete(path)
|
||||
}
|
||||
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
self.fs.exists(path)
|
||||
}
|
||||
|
||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
||||
let read = self.open_read(path)?;
|
||||
Ok(read.as_slice()
|
||||
.to_owned())
|
||||
Ok(read.as_slice().to_owned())
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
let path_buf = PathBuf::from(path);
|
||||
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
try!(self.fs.write(path_buf, &Vec::new()));
|
||||
try!(vec_writer.write_all(data));
|
||||
try!(vec_writer.flush());
|
||||
self.fs.write(path_buf, &Vec::new())?;
|
||||
vec_writer.write_all(data)?;
|
||||
vec_writer.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn box_clone(&self,) -> Box<Directory> {
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -2,10 +2,12 @@ use fst::raw::MmapReadOnly;
|
||||
use std::ops::Deref;
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
use common::HasLen;
|
||||
|
||||
use std::slice;
|
||||
use std::io::{self, Read};
|
||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
///
|
||||
/// These read objects are only in charge to deliver
|
||||
/// the data in the form of a constant read-only `&[u8]`.
|
||||
/// Whatever happens to the directory file, the data
|
||||
@@ -13,12 +15,14 @@ use common::HasLen;
|
||||
pub enum ReadOnlySource {
|
||||
/// Mmap source of data
|
||||
Mmap(MmapReadOnly),
|
||||
/// Wrapping a `Vec<u8>`
|
||||
/// Wrapping a `Vec<u8>`
|
||||
Anonymous(SharedVecSlice),
|
||||
}
|
||||
|
||||
unsafe impl StableDeref for ReadOnlySource {}
|
||||
unsafe impl CloneStableDeref for ReadOnlySource {}
|
||||
|
||||
impl Deref for ReadOnlySource {
|
||||
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &[u8] {
|
||||
@@ -27,35 +31,38 @@ impl Deref for ReadOnlySource {
|
||||
}
|
||||
|
||||
impl ReadOnlySource {
|
||||
|
||||
/// Creates an empty ReadOnlySource
|
||||
pub fn empty() -> ReadOnlySource {
|
||||
ReadOnlySource::Anonymous(SharedVecSlice::empty())
|
||||
}
|
||||
|
||||
/// Returns the data underlying the ReadOnlySource object.
|
||||
pub fn as_slice(&self,) -> &[u8] {
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
match *self {
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe {
|
||||
mmap_read_only.as_slice()
|
||||
},
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => {
|
||||
shared_vec.as_slice()
|
||||
},
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a ReadOnlySource that is just a
|
||||
/// Splits into 2 `ReadOnlySource`, at the offset given
|
||||
/// as an argument.
|
||||
pub fn split(self, addr: usize) -> (ReadOnlySource, ReadOnlySource) {
|
||||
let left = self.slice(0, addr);
|
||||
let right = self.slice_from(addr);
|
||||
(left, right)
|
||||
}
|
||||
|
||||
/// Creates a ReadOnlySource that is just a
|
||||
/// view over a slice of the data.
|
||||
///
|
||||
///
|
||||
/// Keep in mind that any living slice extends
|
||||
/// the lifetime of the original ReadOnlySource,
|
||||
///
|
||||
///
|
||||
/// For instance, if `ReadOnlySource` wraps 500MB
|
||||
/// worth of data in anonymous memory, and only a
|
||||
/// 1KB slice is remaining, the whole `500MBs`
|
||||
/// 1KB slice is remaining, the whole `500MBs`
|
||||
/// are retained in memory.
|
||||
pub fn slice(&self, from_offset:usize, to_offset:usize) -> ReadOnlySource {
|
||||
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
|
||||
match *self {
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => {
|
||||
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
|
||||
@@ -63,13 +70,30 @@ impl ReadOnlySource {
|
||||
}
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => {
|
||||
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Like `.slice(...)` but enforcing only the `from`
|
||||
/// boundary.
|
||||
///
|
||||
/// Equivalent to `.slice(from_offset, self.len())`
|
||||
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
|
||||
let len = self.len();
|
||||
self.slice(from_offset, len)
|
||||
}
|
||||
|
||||
/// Like `.slice(...)` but enforcing only the `to`
|
||||
/// boundary.
|
||||
///
|
||||
/// Equivalent to `.slice(0, to_offset)`
|
||||
pub fn slice_to(&self, to_offset: usize) -> ReadOnlySource {
|
||||
self.slice(0, to_offset)
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for ReadOnlySource {
|
||||
fn len(&self,) -> usize {
|
||||
fn len(&self) -> usize {
|
||||
self.as_slice().len()
|
||||
}
|
||||
}
|
||||
@@ -79,3 +103,48 @@ impl Clone for ReadOnlySource {
|
||||
self.slice(0, self.len())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<u8>> for ReadOnlySource {
|
||||
fn from(data: Vec<u8>) -> ReadOnlySource {
|
||||
let shared_data = SharedVecSlice::from(data);
|
||||
ReadOnlySource::Anonymous(shared_data)
|
||||
}
|
||||
}
|
||||
|
||||
/// Acts as a owning cursor over the data backed up by a `ReadOnlySource`
|
||||
pub(crate) struct SourceRead {
|
||||
_data_owner: ReadOnlySource,
|
||||
cursor: &'static [u8],
|
||||
}
|
||||
|
||||
impl SourceRead {
|
||||
// Advance the cursor by a given number of bytes.
|
||||
pub fn advance(&mut self, len: usize) {
|
||||
self.cursor = &self.cursor[len..];
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for SourceRead {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.cursor
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ReadOnlySource> for SourceRead {
|
||||
// Creates a new `SourceRead` from a given `ReadOnlySource`
|
||||
fn from(source: ReadOnlySource) -> SourceRead {
|
||||
let len = source.len();
|
||||
let slice_ptr = source.as_slice().as_ptr();
|
||||
let static_slice = unsafe { slice::from_raw_parts(slice_ptr, len) };
|
||||
SourceRead {
|
||||
_data_owner: source,
|
||||
cursor: static_slice,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for SourceRead {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
self.cursor.read(buf)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,15 +1,13 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SharedVecSlice {
|
||||
pub data: Arc<Vec<u8>>,
|
||||
pub start: usize,
|
||||
pub len: usize
|
||||
pub start: usize,
|
||||
pub len: usize,
|
||||
}
|
||||
|
||||
impl SharedVecSlice {
|
||||
|
||||
pub fn empty() -> SharedVecSlice {
|
||||
SharedVecSlice::new(Arc::new(Vec::new()))
|
||||
}
|
||||
@@ -17,21 +15,27 @@ impl SharedVecSlice {
|
||||
pub fn new(data: Arc<Vec<u8>>) -> SharedVecSlice {
|
||||
let data_len = data.len();
|
||||
SharedVecSlice {
|
||||
data: data,
|
||||
data,
|
||||
start: 0,
|
||||
len: data_len,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_slice(&self,) -> &[u8] {
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
&self.data[self.start..self.start + self.len]
|
||||
}
|
||||
|
||||
pub fn slice(&self, from_offset: usize, to_offset:usize) -> SharedVecSlice {
|
||||
pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice {
|
||||
SharedVecSlice {
|
||||
data: self.data.clone(),
|
||||
data: Arc::clone(&self.data),
|
||||
start: self.start + from_offset,
|
||||
len: to_offset - from_offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<u8>> for SharedVecSlice {
|
||||
fn from(data: Vec<u8>) -> SharedVecSlice {
|
||||
SharedVecSlice::new(Arc::new(data))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use DocId;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use common::BitSet;
|
||||
|
||||
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
@@ -16,7 +16,6 @@ pub enum SkipResult {
|
||||
End,
|
||||
}
|
||||
|
||||
|
||||
/// Represents an iterable set of sorted doc ids.
|
||||
pub trait DocSet {
|
||||
/// Goes to the next element.
|
||||
@@ -35,6 +34,9 @@ pub trait DocSet {
|
||||
/// More specifically, if the docset is already positionned on the target
|
||||
/// skipping will advance to the next position and return SkipResult::Overstep.
|
||||
///
|
||||
/// If `.skip_next()` oversteps, then the docset must be positionned correctly
|
||||
/// on an existing document. In other words, `.doc()` should return the first document
|
||||
/// greater than `DocId`.
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
@@ -52,21 +54,57 @@ pub trait DocSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills a given mutable buffer with the next doc ids from the
|
||||
/// `DocSet`
|
||||
///
|
||||
/// If that many `DocId`s are available, the method should
|
||||
/// fill the entire buffer and return the length of the buffer.
|
||||
///
|
||||
/// If we reach the end of the `DocSet` before filling
|
||||
/// it entirely, then the buffer is filled up to this point, and
|
||||
/// return value is the number of elements that were filled.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This method is only here for specific high-performance
|
||||
/// use case where batching. The normal way to
|
||||
/// go through the `DocId`'s is to call `.advance()`.
|
||||
fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize {
|
||||
for (i, buffer_val) in buffer.iter_mut().enumerate() {
|
||||
if self.advance() {
|
||||
*buffer_val = self.doc();
|
||||
} else {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
buffer.len()
|
||||
}
|
||||
|
||||
/// Returns the current document
|
||||
fn doc(&self) -> DocId;
|
||||
|
||||
/// Advances the cursor to the next document
|
||||
/// None is returned if the iterator has `DocSet`
|
||||
/// has already been entirely consumed.
|
||||
fn next(&mut self) -> Option<DocId> {
|
||||
if self.advance() {
|
||||
Some(self.doc())
|
||||
} else {
|
||||
None
|
||||
/// Returns a best-effort hint of the
|
||||
/// length of the docset.
|
||||
fn size_hint(&self) -> u32;
|
||||
|
||||
/// Appends all docs to a `bitset`.
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
while self.advance() {
|
||||
bitset.insert(self.doc());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number documents matching.
|
||||
///
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
fn count(&mut self) -> u32 {
|
||||
let mut count = 0u32;
|
||||
while self.advance() {
|
||||
count += 1u32;
|
||||
}
|
||||
count
|
||||
}
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
fn advance(&mut self) -> bool {
|
||||
@@ -83,21 +121,19 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
let unboxed: &TDocSet = self.borrow();
|
||||
unboxed.doc()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, TDocSet: DocSet> DocSet for &'a mut TDocSet {
|
||||
fn advance(&mut self) -> bool {
|
||||
let unref: &mut TDocSet = *self;
|
||||
unref.advance()
|
||||
fn size_hint(&self) -> u32 {
|
||||
let unboxed: &TDocSet = self.borrow();
|
||||
unboxed.size_hint()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
let unref: &mut TDocSet = *self;
|
||||
unref.skip_next(target)
|
||||
fn count(&mut self) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
let unref: &TDocSet = *self;
|
||||
unref.doc()
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.append_to_bitset(bitset);
|
||||
}
|
||||
}
|
||||
133
src/error.rs
133
src/error.rs
@@ -1,97 +1,138 @@
|
||||
#![allow(enum_variant_names)]
|
||||
|
||||
/// Definition of Tantivy's error and result.
|
||||
//! Definition of Tantivy's error and result.
|
||||
|
||||
use std::io;
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::error;
|
||||
use std::sync::PoisonError;
|
||||
use directory::error::{OpenReadError, OpenWriteError, OpenDirectoryError};
|
||||
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use query;
|
||||
use schema;
|
||||
use fastfield::FastFieldNotAvailableError;
|
||||
use serde_json;
|
||||
|
||||
error_chain!(
|
||||
errors {
|
||||
/// Path does not exist.
|
||||
PathDoesNotExist(buf: PathBuf) {
|
||||
description("path does not exist")
|
||||
display("path does not exist: '{:?}'", buf)
|
||||
}
|
||||
/// File already exists, this is a problem when we try to write into a new file.
|
||||
FileAlreadyExists(buf: PathBuf) {
|
||||
description("file already exists")
|
||||
display("file already exists: '{:?}'", buf)
|
||||
}
|
||||
/// IO Error.
|
||||
IOError(err: IOError) {
|
||||
description("an IO error occurred")
|
||||
display("an IO error occurred: '{}'", err)
|
||||
}
|
||||
/// The data within is corrupted.
|
||||
///
|
||||
/// For instance, it contains invalid JSON.
|
||||
CorruptedFile(buf: PathBuf) {
|
||||
description("file contains corrupted data")
|
||||
display("file contains corrupted data: '{:?}'", buf)
|
||||
}
|
||||
/// A thread holding the locked panicked and poisoned the lock.
|
||||
Poisoned {
|
||||
description("a thread holding the locked panicked and poisoned the lock")
|
||||
}
|
||||
/// Invalid argument was passed by the user.
|
||||
InvalidArgument(arg: String) {
|
||||
description("an invalid argument was passed")
|
||||
display("an invalid argument was passed: '{}'", arg)
|
||||
}
|
||||
/// An Error happened in one of the thread.
|
||||
ErrorInThread(err: String) {
|
||||
description("an error occurred in a thread")
|
||||
display("an error occurred in a thread: '{}'", err)
|
||||
}
|
||||
/// An Error appeared related to the lack of a field.
|
||||
SchemaError(field: String) {
|
||||
description("a schema field is missing")
|
||||
display("a schema field is missing: '{}'", field)
|
||||
}
|
||||
/// Tried to access a fastfield reader for a field not configured accordingly.
|
||||
FastFieldError(err: FastFieldNotAvailableError) {
|
||||
description("fast field not available")
|
||||
display("fast field not available: '{:?}'", err)
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
impl From<FastFieldNotAvailableError> for Error {
|
||||
fn from(fastfield_error: FastFieldNotAvailableError) -> Error {
|
||||
ErrorKind::FastFieldError(fastfield_error).into()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Generic tantivy error.
|
||||
///
|
||||
/// Any specialized error return in tantivy can be converted in `tantivy::Error`.
|
||||
#[derive(Debug)]
|
||||
pub enum Error {
|
||||
/// Path does not exist.
|
||||
PathDoesNotExist(PathBuf),
|
||||
/// File already exists, this is a problem when we try to write into a new file.
|
||||
FileAlreadyExists(PathBuf),
|
||||
/// IO Error
|
||||
IOError(io::Error),
|
||||
/// A thread holding the locked panicked and poisoned the lock.
|
||||
Poisoned,
|
||||
/// The data within is corrupted.
|
||||
///
|
||||
/// For instance, it contains invalid JSON.
|
||||
CorruptedFile(PathBuf, Box<error::Error + Send + Sync>),
|
||||
/// Invalid argument was passed by the user.
|
||||
InvalidArgument(String),
|
||||
/// An Error happened in one of the thread
|
||||
ErrorInThread(String),
|
||||
/// An Error appeared related to the lack of a field.
|
||||
SchemaError(String),
|
||||
|
||||
impl From<IOError> for Error {
|
||||
fn from(io_error: IOError) -> Error {
|
||||
ErrorKind::IOError(io_error).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<io::Error> for Error {
|
||||
fn from(io_error: io::Error) -> Error {
|
||||
Error::IOError(io_error)
|
||||
ErrorKind::IOError(io_error.into()).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<query::QueryParserError> for Error {
|
||||
fn from(parsing_error: query::QueryParserError) -> Error {
|
||||
Error::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
|
||||
ErrorKind::InvalidArgument(format!("Query is invalid. {:?}", parsing_error)).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl<Guard> From<PoisonError<Guard>> for Error {
|
||||
fn from(_: PoisonError<Guard>) -> Error {
|
||||
Error::Poisoned
|
||||
ErrorKind::Poisoned.into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OpenReadError> for Error {
|
||||
fn from(error: OpenReadError) -> Error {
|
||||
match error {
|
||||
OpenReadError::FileDoesNotExist(filepath) => Error::PathDoesNotExist(filepath),
|
||||
OpenReadError::IOError(io_error) => Error::IOError(io_error),
|
||||
OpenReadError::FileDoesNotExist(filepath) => {
|
||||
ErrorKind::PathDoesNotExist(filepath).into()
|
||||
}
|
||||
OpenReadError::IOError(io_error) => ErrorKind::IOError(io_error).into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<schema::DocParsingError> for Error {
|
||||
fn from(error: schema::DocParsingError) -> Error {
|
||||
Error::InvalidArgument(format!("Failed to parse document {:?}", error))
|
||||
ErrorKind::InvalidArgument(format!("Failed to parse document {:?}", error)).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OpenWriteError> for Error {
|
||||
fn from(error: OpenWriteError) -> Error {
|
||||
match error {
|
||||
OpenWriteError::FileAlreadyExists(filepath) =>
|
||||
Error::FileAlreadyExists(filepath),
|
||||
OpenWriteError::IOError(io_error) =>
|
||||
Error::IOError(io_error),
|
||||
}
|
||||
OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath),
|
||||
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
|
||||
}.into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OpenDirectoryError> for Error {
|
||||
fn from(error: OpenDirectoryError) -> Error {
|
||||
match error {
|
||||
OpenDirectoryError::DoesNotExist(directory_path) =>
|
||||
Error::PathDoesNotExist(directory_path),
|
||||
OpenDirectoryError::NotADirectory(directory_path) =>
|
||||
Error::InvalidArgument(format!("{:?} is not a directory", directory_path)),
|
||||
OpenDirectoryError::DoesNotExist(directory_path) => {
|
||||
ErrorKind::PathDoesNotExist(directory_path).into()
|
||||
}
|
||||
OpenDirectoryError::NotADirectory(directory_path) => ErrorKind::InvalidArgument(
|
||||
format!("{:?} is not a directory", directory_path),
|
||||
).into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for Error {
|
||||
fn from(error: serde_json::Error) -> Error {
|
||||
let io_err = io::Error::from(error);
|
||||
ErrorKind::IOError(io_err.into()).into()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,9 @@ use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use common::HasLen;
|
||||
|
||||
/// Write a delete `BitSet`
|
||||
///
|
||||
/// where `delete_bitset` is the set of deleted `DocId`.
|
||||
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
|
||||
let max_doc = delete_bitset.capacity();
|
||||
let mut byte = 0u8;
|
||||
@@ -18,8 +21,7 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io:
|
||||
writer.write_all(&[byte])?;
|
||||
shift = 0;
|
||||
byte = 0;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
shift += 1;
|
||||
}
|
||||
}
|
||||
@@ -29,26 +31,27 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io:
|
||||
writer.flush()
|
||||
}
|
||||
|
||||
/// Set of deleted `DocId`s.
|
||||
#[derive(Clone)]
|
||||
pub struct DeleteBitSet {
|
||||
data: ReadOnlySource,
|
||||
len: usize,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
impl DeleteBitSet {
|
||||
|
||||
/// Opens a delete bitset given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
|
||||
let num_deleted: usize = data
|
||||
.as_slice()
|
||||
let num_deleted: usize = data.as_slice()
|
||||
.iter()
|
||||
.map(|b| b.count_ones() as usize)
|
||||
.sum();
|
||||
DeleteBitSet {
|
||||
data: data,
|
||||
data,
|
||||
len: num_deleted,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an empty delete bit set.
|
||||
pub fn empty() -> DeleteBitSet {
|
||||
DeleteBitSet {
|
||||
data: ReadOnlySource::empty(),
|
||||
@@ -56,25 +59,25 @@ impl DeleteBitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true iff the segment has some deleted documents.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.len() > 0
|
||||
}
|
||||
|
||||
/// Returns true iff the document is deleted.
|
||||
#[inline]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
if self.len == 0 {
|
||||
false
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = (*self.data)[byte_offset as usize];
|
||||
let shift = (doc & 7u32) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
impl HasLen for DeleteBitSet {
|
||||
fn len(&self) -> usize {
|
||||
self.len
|
||||
@@ -124,4 +127,4 @@ mod tests {
|
||||
test_delete_bitset_helper(&bitset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
24
src/fastfield/error.rs
Normal file
24
src/fastfield/error.rs
Normal file
@@ -0,0 +1,24 @@
|
||||
use std::result;
|
||||
use schema::FieldEntry;
|
||||
|
||||
/// `FastFieldNotAvailableError` is returned when the
|
||||
/// user requested for a fast field reader, and the field was not
|
||||
/// defined in the schema as a fast field.
|
||||
#[derive(Debug)]
|
||||
pub struct FastFieldNotAvailableError {
|
||||
field_name: String,
|
||||
}
|
||||
|
||||
impl FastFieldNotAvailableError {
|
||||
/// Creates a `FastFieldNotAvailable` error.
|
||||
/// `field_entry` is the configuration of the field
|
||||
/// for which fast fields are not available.
|
||||
pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError {
|
||||
FastFieldNotAvailableError {
|
||||
field_name: field_entry.name().to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Result when trying to access a fast field reader.
|
||||
pub type Result<R> = result::Result<R, FastFieldNotAvailableError>;
|
||||
68
src/fastfield/facet_reader.rs
Normal file
68
src/fastfield/facet_reader.rs
Normal file
@@ -0,0 +1,68 @@
|
||||
use super::MultiValueIntFastFieldReader;
|
||||
use DocId;
|
||||
use termdict::TermOrdinal;
|
||||
use schema::Facet;
|
||||
use termdict::{TermDictionary, TermDictionaryImpl};
|
||||
|
||||
/// The facet reader makes it possible to access the list of
|
||||
/// facets associated to a given document in a specific
|
||||
/// segment.
|
||||
///
|
||||
/// Rather than manipulating `Facet` object directly, the API
|
||||
/// exposes those in the form of list of `Facet` ordinal.
|
||||
///
|
||||
/// A segment ordinal can then be translated into a facet via
|
||||
/// `.facet_from_ord(...)`.
|
||||
///
|
||||
/// Facet ordinals are defined as their position in the sorted
|
||||
/// list of facets. This ordinal is segment local and
|
||||
/// only makes sense for a given segment.
|
||||
pub struct FacetReader {
|
||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||
term_dict: TermDictionaryImpl,
|
||||
}
|
||||
|
||||
impl FacetReader {
|
||||
/// Creates a new `FacetReader`.
|
||||
///
|
||||
/// A facet reader just wraps :
|
||||
/// - a `MultiValueIntFastFieldReader` that makes it possible to
|
||||
/// access the list of facet ords for a given document.
|
||||
/// - a `TermDictionaryImpl` that helps associating a facet to
|
||||
/// an ordinal and vice versa.
|
||||
pub fn new(
|
||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||
term_dict: TermDictionaryImpl,
|
||||
) -> FacetReader {
|
||||
FacetReader {
|
||||
term_ords,
|
||||
term_dict,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the size of the sets of facets in the segment.
|
||||
/// This does not take in account the documents that may be marked
|
||||
/// as deleted.
|
||||
///
|
||||
/// `Facet` ordinals range from `0` to `num_facets() - 1`.
|
||||
pub fn num_facets(&self) -> usize {
|
||||
self.term_dict.num_terms()
|
||||
}
|
||||
|
||||
/// Accessor for the facet term dictionary.
|
||||
pub fn facet_dict(&self) -> &TermDictionaryImpl {
|
||||
&self.term_dict
|
||||
}
|
||||
|
||||
/// Given a term ordinal returns the term associated to it.
|
||||
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
|
||||
let found_term = self.term_dict
|
||||
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
|
||||
assert!(found_term, "Term ordinal {} no found.", facet_ord);
|
||||
}
|
||||
|
||||
/// Return the list of facet ordinals associated to a document.
|
||||
pub fn facet_ords(&mut self, doc: DocId, output: &mut Vec<u64>) {
|
||||
self.term_ords.get_vals(doc, output);
|
||||
}
|
||||
}
|
||||
@@ -1,62 +1,159 @@
|
||||
/// Fast field module
|
||||
///
|
||||
/// Fast fields are the equivalent of `DocValues` in `Lucene`.
|
||||
/// Fast fields are stored in column-oriented fashion and allow fast
|
||||
/// random access given a `DocId`.
|
||||
///
|
||||
/// Their performance is comparable to that of an array lookup.
|
||||
/// They are useful when a field is required for all or most of
|
||||
/// the `DocSet` : for instance for scoring, grouping, filtering, or facetting.
|
||||
///
|
||||
/// Currently only u32 fastfield are supported.
|
||||
/*!
|
||||
Column oriented field storage for tantivy.
|
||||
|
||||
It is the equivalent of `Lucene`'s `DocValues`.
|
||||
|
||||
Fast fields is a column-oriented fashion storage of `tantivy`.
|
||||
|
||||
It is designed for the fast random access of some document
|
||||
fields given a document id.
|
||||
|
||||
`FastField` are useful when a field is required for all or most of
|
||||
the `DocSet` : for instance for scoring, grouping, filtering, or faceting.
|
||||
|
||||
|
||||
Fields have to be declared as `FAST` in the schema.
|
||||
Currently only 64-bits integers (signed or unsigned) are
|
||||
supported.
|
||||
|
||||
They are stored in a bit-packed fashion so that their
|
||||
memory usage is directly linear with the amplitude of the
|
||||
values stored.
|
||||
|
||||
Read access performance is comparable to that of an array lookup.
|
||||
*/
|
||||
|
||||
use common;
|
||||
use schema::Cardinality;
|
||||
use schema::FieldType;
|
||||
use schema::Value;
|
||||
pub use self::delete::DeleteBitSet;
|
||||
pub use self::delete::write_delete_bitset;
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::MultiValueIntFastFieldReader;
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
|
||||
mod reader;
|
||||
mod writer;
|
||||
mod serializer;
|
||||
pub mod delete;
|
||||
mod error;
|
||||
mod delete;
|
||||
mod facet_reader;
|
||||
mod multivalued;
|
||||
|
||||
pub use self::writer::{U32FastFieldsWriter, U32FastFieldWriter};
|
||||
pub use self::reader::{U32FastFieldsReader, U32FastFieldReader};
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
/// Trait for types that are allowed for fast fields: (u64 or i64).
|
||||
pub trait FastValue: Default + Clone + Copy {
|
||||
/// Converts a value from u64
|
||||
///
|
||||
/// Internally all fast field values are encoded as u64.
|
||||
fn from_u64(val: u64) -> Self;
|
||||
|
||||
/// Converts a value to u64.
|
||||
///
|
||||
/// Internally all fast field values are encoded as u64.
|
||||
fn to_u64(&self) -> u64;
|
||||
|
||||
/// Returns the fast field cardinality that can be extracted from the given
|
||||
/// `FieldType`.
|
||||
///
|
||||
/// If the type is not a fast field, `None` is returned.
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality>;
|
||||
|
||||
/// Cast value to `u64`.
|
||||
/// The value is just reinterpreted in memory.
|
||||
fn as_u64(&self) -> u64;
|
||||
}
|
||||
|
||||
impl FastValue for u64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
val
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
FieldType::HierarchicalFacet => Some(Cardinality::MultiValues),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for i64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
common::u64_to_i64(val)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
common::i64_to_u64(*self)
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::I64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self as u64
|
||||
}
|
||||
}
|
||||
|
||||
fn value_to_u64(value: &Value) -> u64 {
|
||||
match *value {
|
||||
Value::U64(ref val) => *val,
|
||||
Value::I64(ref val) => common::i64_to_u64(*val),
|
||||
_ => panic!("Expected a u64/i64 field, got {:?} ", value),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use schema::Field;
|
||||
use std::path::Path;
|
||||
use directory::{Directory, WritePtr, RAMDirectory};
|
||||
use schema::Document;
|
||||
use schema::{Schema, SchemaBuilder};
|
||||
use schema::FAST;
|
||||
use test::Bencher;
|
||||
use test;
|
||||
|
||||
use common::CompositeFile;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::FastFieldReader;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::XorShiftRng;
|
||||
use schema::{Schema, SchemaBuilder};
|
||||
use schema::Document;
|
||||
use schema::FAST;
|
||||
use schema::Field;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use super::*;
|
||||
use test;
|
||||
use test::Bencher;
|
||||
|
||||
lazy_static! {
|
||||
static ref SCHEMA: Schema = {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
schema_builder.add_u32_field("field", FAST);
|
||||
schema_builder.add_u64_field("field", FAST);
|
||||
schema_builder.build()
|
||||
};
|
||||
static ref FIELD: Field = {
|
||||
static ref FIELD: Field = {
|
||||
SCHEMA.get_field("field").unwrap()
|
||||
};
|
||||
}
|
||||
|
||||
fn add_single_field_doc(fast_field_writers: &mut U32FastFieldsWriter, field: Field, value: u32) {
|
||||
let mut doc = Document::default();
|
||||
doc.add_u32(field, value);
|
||||
fast_field_writers.add_document(&doc);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield() {
|
||||
let test_fastfield = U32FastFieldReader::from(vec!(100,200,300));
|
||||
let test_fastfield = FastFieldReader::<u64>::from(vec![100, 200, 300]);
|
||||
assert_eq!(test_fastfield.get(0), 100);
|
||||
assert_eq!(test_fastfield.get(1), 200);
|
||||
assert_eq!(test_fastfield.get(2), 300);
|
||||
assert_eq!(test_fastfield.get(2), 300);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -65,24 +162,27 @@ mod tests {
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 13u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 14u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 2u32);
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>13u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>14u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>2u64));
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 20 as usize);
|
||||
assert_eq!(source.len(), 36 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 13u32);
|
||||
assert_eq!(fast_field_reader.get(1), 14u32);
|
||||
assert_eq!(fast_field_reader.get(2), 2u32);
|
||||
let composite_file = CompositeFile::open(&source).unwrap();
|
||||
let field_source = composite_file.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(field_source);
|
||||
assert_eq!(fast_field_reader.get(0), 13u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14u64);
|
||||
assert_eq!(fast_field_reader.get(2), 2u64);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,72 +192,187 @@ mod tests {
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 4u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 14_082_001u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 3_052u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 9002u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 15_001u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 777u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_002u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 1_501u32);
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 215u32);
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>4u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>3_052u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>9_002u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>15_001u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>777u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>1_002u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>1_501u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>215u64));
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 45 as usize);
|
||||
assert_eq!(source.len(), 61 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4u32);
|
||||
assert_eq!(fast_field_reader.get(1), 14_082_001u32);
|
||||
assert_eq!(fast_field_reader.get(2), 3_052u32);
|
||||
assert_eq!(fast_field_reader.get(3), 9002u32);
|
||||
assert_eq!(fast_field_reader.get(4), 15_001u32);
|
||||
assert_eq!(fast_field_reader.get(5), 777u32);
|
||||
assert_eq!(fast_field_reader.get(6), 1_002u32);
|
||||
assert_eq!(fast_field_reader.get(7), 1_501u32);
|
||||
assert_eq!(fast_field_reader.get(8), 215u32);
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
assert_eq!(fast_field_reader.get(0), 4u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
|
||||
assert_eq!(fast_field_reader.get(2), 3_052u64);
|
||||
assert_eq!(fast_field_reader.get(3), 9002u64);
|
||||
assert_eq!(fast_field_reader.get(4), 15_001u64);
|
||||
assert_eq!(fast_field_reader.get(5), 777u64);
|
||||
assert_eq!(fast_field_reader.get(6), 1_002u64);
|
||||
assert_eq!(fast_field_reader.get(7), 1_501u64);
|
||||
assert_eq!(fast_field_reader.get(8), 215u64);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intfastfield_null_amplitude() {
|
||||
|
||||
#[test]
|
||||
fn test_intfastfield_null_amplitude() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for _ in 0..10_000 {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, 100_000u32);
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 18 as usize);
|
||||
assert_eq!(source.len(), 34 as usize);
|
||||
}
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
for doc in 0..10_000 {
|
||||
assert_eq!(fast_field_reader.get(doc), 100_000u32);
|
||||
assert_eq!(fast_field_reader.get(doc), 100_000u64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_permutation() -> Vec<u32> {
|
||||
#[test]
|
||||
fn test_intfastfield_large_numbers() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
// forcing the amplitude to be high
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>0u64));
|
||||
for i in 0u64..10_000u64 {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + i));
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 80042 as usize);
|
||||
}
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
assert_eq!(fast_field_reader.get(0), 0u64);
|
||||
for doc in 1..10_001 {
|
||||
assert_eq!(
|
||||
fast_field_reader.get(doc),
|
||||
5_000_000_000_000_000_000u64 + doc as u64 - 1u64
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_signed_intfastfield() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
|
||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
for i in -100i64..10_000i64 {
|
||||
let mut doc = Document::default();
|
||||
doc.add_i64(i64_field, i);
|
||||
fast_field_writers.add_document(&doc);
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
assert_eq!(source.len(), 17709 as usize);
|
||||
}
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<i64>::open(data);
|
||||
|
||||
assert_eq!(fast_field_reader.min_value(), -100i64);
|
||||
assert_eq!(fast_field_reader.max_value(), 9_999i64);
|
||||
for (doc, i) in (-100i64..10_000i64).enumerate() {
|
||||
assert_eq!(fast_field_reader.get(doc as u32), i);
|
||||
}
|
||||
let mut buffer = vec![0i64; 100];
|
||||
fast_field_reader.get_range(53, &mut buffer[..]);
|
||||
for i in 0..100 {
|
||||
assert_eq!(buffer[i], -100i64 + 53i64 + i as i64);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_signed_intfastfield_default_val() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
let doc = Document::default();
|
||||
fast_field_writers.add_document(&doc);
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<i64>::open(data);
|
||||
assert_eq!(fast_field_reader.get(0u32), 0i64);
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_permutation() -> Vec<u64> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, 4];
|
||||
let mut rng = XorShiftRng::from_seed(*seed);
|
||||
let mut permutation: Vec<u32> = (0u32..1_000_000u32).collect();
|
||||
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
|
||||
rng.shuffle(&mut permutation);
|
||||
permutation
|
||||
}
|
||||
@@ -170,19 +385,23 @@ mod tests {
|
||||
let mut directory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
let mut a = 0u32;
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
|
||||
let mut a = 0u64;
|
||||
for _ in 0..n {
|
||||
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
|
||||
a = fast_field_reader.get(a as u32);
|
||||
@@ -195,8 +414,8 @@ mod tests {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u32;
|
||||
for i in (0u32..n).step_by(7) {
|
||||
let mut a = 0u64;
|
||||
for i in Iterator::step_by(0u32..n, 7) {
|
||||
a ^= permutation[i as usize];
|
||||
}
|
||||
a
|
||||
@@ -208,7 +427,7 @@ mod tests {
|
||||
let permutation = generate_permutation();
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
let mut a = 0u64;
|
||||
for _ in 0u32..n {
|
||||
a = permutation[a as usize];
|
||||
}
|
||||
@@ -223,22 +442,26 @@ mod tests {
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
let mut a = 0u32;
|
||||
for i in (0u32..n).step_by(7) {
|
||||
let mut a = 0u64;
|
||||
for i in Iterator::step_by(0u32..n, 7) {
|
||||
a ^= fast_field_reader.get(i);
|
||||
}
|
||||
a
|
||||
@@ -253,23 +476,27 @@ mod tests {
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for x in &permutation {
|
||||
add_single_field_doc(&mut fast_field_writers, *FIELD, *x);
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
let fast_field_reader = fast_field_readers.get_field(*FIELD).unwrap();
|
||||
let fast_fields_composite = CompositeFile::open(&source).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data);
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
for _ in 0u32..n {
|
||||
a = fast_field_reader.get(a);
|
||||
a = fast_field_reader.get(a) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
|
||||
88
src/fastfield/multivalued/mod.rs
Normal file
88
src/fastfield/multivalued/mod.rs
Normal file
@@ -0,0 +1,88 @@
|
||||
mod writer;
|
||||
mod reader;
|
||||
|
||||
pub use self::writer::MultiValueIntFastFieldWriter;
|
||||
pub use self::reader::MultiValueIntFastFieldReader;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Cardinality;
|
||||
use schema::IntOptions;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_u64() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_u64_field(
|
||||
"multifield",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(field=>1u64, field=>3u64));
|
||||
index_writer.add_document(doc!());
|
||||
index_writer.add_document(doc!(field=>4u64));
|
||||
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = reader.multi_fast_field_reader::<u64>(field).unwrap();
|
||||
{
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[4u64]);
|
||||
}
|
||||
{
|
||||
multi_value_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[1u64, 3u64]);
|
||||
}
|
||||
{
|
||||
multi_value_reader.get_vals(1, &mut vals);
|
||||
assert!(vals.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_i64() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_i64_field(
|
||||
"multifield",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(field=> 1i64, field => 3i64));
|
||||
index_writer.add_document(doc!());
|
||||
index_writer.add_document(doc!(field=> -4i64));
|
||||
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
|
||||
{
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[-4i64]);
|
||||
}
|
||||
{
|
||||
multi_value_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[1i64, 3i64]);
|
||||
}
|
||||
{
|
||||
multi_value_reader.get_vals(1, &mut vals);
|
||||
assert!(vals.is_empty());
|
||||
}
|
||||
{
|
||||
multi_value_reader.get_vals(3, &mut vals);
|
||||
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
|
||||
}
|
||||
}
|
||||
}
|
||||
109
src/fastfield/multivalued/reader.rs
Normal file
109
src/fastfield/multivalued/reader.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
use DocId;
|
||||
use fastfield::{FastFieldReader, FastValue};
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
///
|
||||
/// The reader is implemented as two `u64` fast field.
|
||||
///
|
||||
/// The `vals_reader` will access the concatenated list of all
|
||||
/// values for all reader.
|
||||
/// The `idx_reader` associated, for each document, the index of its first value.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct MultiValueIntFastFieldReader<Item: FastValue> {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
vals_reader: FastFieldReader<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
pub(crate) fn open(
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
vals_reader: FastFieldReader<Item>,
|
||||
) -> MultiValueIntFastFieldReader<Item> {
|
||||
MultiValueIntFastFieldReader {
|
||||
idx_reader,
|
||||
vals_reader,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
|
||||
let start = self.idx_reader.get(doc) as u32;
|
||||
let stop = self.idx_reader.get(doc + 1) as u32;
|
||||
let len = (stop - start) as usize;
|
||||
vals.resize(len, Item::default());
|
||||
self.vals_reader.get_range(start, &mut vals[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use core::Index;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let facet_field = schema_builder.add_facet_field("facets");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index
|
||||
.writer_with_num_threads(1, 30_000_000)
|
||||
.expect("Failed to create index writer.");
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat2");
|
||||
doc.add_facet(facet_field, "/category/cat1");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat2");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat3");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().expect("Commit failed");
|
||||
index.load_searchers().expect("Reloading searchers");
|
||||
let searcher = index.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
|
||||
|
||||
let mut facet = Facet::root();
|
||||
{
|
||||
facet_reader.facet_from_ord(1, &mut facet);
|
||||
assert_eq!(facet, Facet::from("/category"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(2, &mut facet);
|
||||
assert_eq!(facet, Facet::from("/category/cat1"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(3, &mut facet);
|
||||
assert_eq!(format!("{}", facet), "/category/cat2");
|
||||
assert_eq!(facet, Facet::from("/category/cat2"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(4, &mut facet);
|
||||
assert_eq!(facet, Facet::from("/category/cat3"));
|
||||
}
|
||||
|
||||
let mut vals = Vec::new();
|
||||
{
|
||||
facet_reader.facet_ords(0, &mut vals);
|
||||
assert_eq!(&vals[..], &[3, 2]);
|
||||
}
|
||||
{
|
||||
facet_reader.facet_ords(1, &mut vals);
|
||||
assert_eq!(&vals[..], &[3]);
|
||||
}
|
||||
{
|
||||
facet_reader.facet_ords(2, &mut vals);
|
||||
assert_eq!(&vals[..], &[4]);
|
||||
}
|
||||
}
|
||||
}
|
||||
111
src/fastfield/multivalued/writer.rs
Normal file
111
src/fastfield/multivalued/writer.rs
Normal file
@@ -0,0 +1,111 @@
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::serializer::FastSingleFieldSerializer;
|
||||
use fastfield::value_to_u64;
|
||||
use std::collections::HashMap;
|
||||
use postings::UnorderedTermId;
|
||||
use schema::{Document, Field};
|
||||
use std::io;
|
||||
use itertools::Itertools;
|
||||
|
||||
pub struct MultiValueIntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u64>,
|
||||
doc_index: Vec<u64>,
|
||||
is_facet: bool,
|
||||
}
|
||||
|
||||
impl MultiValueIntFastFieldWriter {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field, is_facet: bool) -> Self {
|
||||
MultiValueIntFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
doc_index: Vec::new(),
|
||||
is_facet,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
pub fn next_doc(&mut self) {
|
||||
self.doc_index.push(self.vals.len() as u64);
|
||||
}
|
||||
|
||||
/// Records a new value.
|
||||
///
|
||||
/// The n-th value being recorded is implicitely
|
||||
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: UnorderedTermId) {
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
if !self.is_facet {
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field() == self.field {
|
||||
self.add_val(value_to_u64(field_value.value()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
|
||||
///
|
||||
/// HashMap makes it possible to remap them before serializing.
|
||||
/// Specifically, string terms are first stored in the writer as their
|
||||
/// position in the `IndexWriter`'s `HashMap`. This value is called
|
||||
/// an `UnorderedTermId`.
|
||||
///
|
||||
/// During the serialization of the segment, terms gets sorted and
|
||||
/// `tantivy` builds a mapping to convert this `UnorderedTermId` into
|
||||
/// term ordinals.
|
||||
///
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
|
||||
) -> io::Result<()> {
|
||||
{
|
||||
// writing the offset index
|
||||
let mut doc_index_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
|
||||
for &offset in &self.doc_index {
|
||||
doc_index_serializer.add_val(offset)?;
|
||||
}
|
||||
doc_index_serializer.add_val(self.vals.len() as u64)?;
|
||||
doc_index_serializer.close_field()?;
|
||||
}
|
||||
{
|
||||
// writing the values themselves.
|
||||
let mut value_serializer: FastSingleFieldSerializer<_>;
|
||||
match mapping_opt {
|
||||
Some(mapping) => {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
self.field,
|
||||
0u64,
|
||||
mapping.len() as u64,
|
||||
1,
|
||||
)?;
|
||||
for val in &self.vals {
|
||||
let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
|
||||
value_serializer.add_val(remapped_val)?;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let val_min_max = self.vals.iter().cloned().minmax();
|
||||
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for &val in &self.vals {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
value_serializer.close_field()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,155 +1,137 @@
|
||||
use std::io;
|
||||
use std::collections::HashMap;
|
||||
use std::ops::Deref;
|
||||
|
||||
use directory::ReadOnlySource;
|
||||
use common::BinarySerializable;
|
||||
use DocId;
|
||||
use schema::{Field, SchemaBuilder};
|
||||
use std::path::Path;
|
||||
use schema::FAST;
|
||||
use directory::{WritePtr, RAMDirectory, Directory};
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::U32FastFieldsWriter;
|
||||
use common::bitpacker::compute_num_bits;
|
||||
use common::bitpacker::BitUnpacker;
|
||||
use common::CompositeFile;
|
||||
use common::compute_num_bits;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use owning_ref::OwningRef;
|
||||
use schema::FAST;
|
||||
use schema::SchemaBuilder;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem;
|
||||
use std::path::Path;
|
||||
use super::FastValue;
|
||||
|
||||
|
||||
lazy_static! {
|
||||
static ref U32_FAST_FIELD_EMPTY: ReadOnlySource = {
|
||||
let u32_fast_field = U32FastFieldReader::from(Vec::new());
|
||||
u32_fast_field._data.clone()
|
||||
};
|
||||
/// Trait for accessing a fastfield.
|
||||
///
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReader<Item: FastValue> {
|
||||
bit_unpacker: BitUnpacker<OwningRef<ReadOnlySource, [u8]>>,
|
||||
min_value_u64: u64,
|
||||
max_value_u64: u64,
|
||||
_phantom: PhantomData<Item>,
|
||||
}
|
||||
|
||||
pub struct U32FastFieldReader {
|
||||
_data: ReadOnlySource,
|
||||
bit_unpacker: BitUnpacker,
|
||||
min_val: u32,
|
||||
max_val: u32,
|
||||
}
|
||||
|
||||
impl U32FastFieldReader {
|
||||
|
||||
pub fn empty() -> U32FastFieldReader {
|
||||
U32FastFieldReader::open(U32_FAST_FIELD_EMPTY.clone())
|
||||
}
|
||||
|
||||
pub fn min_val(&self,) -> u32 {
|
||||
self.min_val
|
||||
}
|
||||
|
||||
pub fn max_val(&self,) -> u32 {
|
||||
self.max_val
|
||||
}
|
||||
|
||||
/// Opens a new fast field reader given a read only source.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if the data is corrupted.
|
||||
pub fn open(data: ReadOnlySource) -> U32FastFieldReader {
|
||||
let min_val;
|
||||
let amplitude;
|
||||
let max_val;
|
||||
impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// Opens a fast field given a source.
|
||||
pub fn open(data: ReadOnlySource) -> Self {
|
||||
let min_value: u64;
|
||||
let amplitude: u64;
|
||||
{
|
||||
let mut cursor = data.as_slice();
|
||||
min_val = u32::deserialize(&mut cursor).unwrap();
|
||||
amplitude = u32::deserialize(&mut cursor).unwrap();
|
||||
max_val = min_val + amplitude;
|
||||
min_value =
|
||||
u64::deserialize(&mut cursor).expect("Failed to read the min_value of fast field.");
|
||||
amplitude =
|
||||
u64::deserialize(&mut cursor).expect("Failed to read the amplitude of fast field.");
|
||||
}
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = {
|
||||
let data_arr = &(data.deref()[8..]);
|
||||
BitUnpacker::new(data_arr, num_bits as usize)
|
||||
};
|
||||
U32FastFieldReader {
|
||||
_data: data,
|
||||
bit_unpacker: bit_unpacker,
|
||||
min_val: min_val,
|
||||
max_val: max_val,
|
||||
let owning_ref = OwningRef::new(data).map(|data| &data[16..]);
|
||||
let bit_unpacker = BitUnpacker::new(owning_ref, num_bits);
|
||||
FastFieldReader {
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, doc: DocId) -> u32 {
|
||||
self.min_val + self.bit_unpacker.get(doc as usize)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl From<Vec<u32>> for U32FastFieldReader {
|
||||
fn from(vals: Vec<u32>) -> U32FastFieldReader {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_u32_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::new(write).unwrap();
|
||||
let mut fast_field_writers = U32FastFieldsWriter::from_schema(&schema);
|
||||
for val in vals {
|
||||
let mut fast_field_writer = fast_field_writers.get_field_writer(field).unwrap();
|
||||
fast_field_writer.add_val(val);
|
||||
}
|
||||
fast_field_writers.serialize(&mut serializer).unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let source = directory.open_read(&path).unwrap();
|
||||
let fast_field_readers = U32FastFieldsReader::open(source).unwrap();
|
||||
fast_field_readers.get_field(field).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct U32FastFieldsReader {
|
||||
source: ReadOnlySource,
|
||||
field_offsets: HashMap<Field, (u32, u32)>,
|
||||
}
|
||||
|
||||
impl U32FastFieldsReader {
|
||||
pub fn open(source: ReadOnlySource) -> io::Result<U32FastFieldsReader> {
|
||||
let header_offset;
|
||||
let field_offsets: Vec<(Field, u32)>;
|
||||
{
|
||||
let buffer = source.as_slice();
|
||||
{
|
||||
let mut cursor = buffer;
|
||||
header_offset = try!(u32::deserialize(&mut cursor));
|
||||
}
|
||||
{
|
||||
let mut cursor = &buffer[header_offset as usize..];
|
||||
field_offsets = try!(Vec::deserialize(&mut cursor));
|
||||
}
|
||||
}
|
||||
let mut end_offsets: Vec<u32> = field_offsets
|
||||
.iter()
|
||||
.map(|&(_, offset)| offset)
|
||||
.collect();
|
||||
end_offsets.push(header_offset);
|
||||
let mut field_offsets_map: HashMap<Field, (u32, u32)> = HashMap::new();
|
||||
for (field_start_offsets, stop_offset) in field_offsets.iter().zip(end_offsets.iter().skip(1)) {
|
||||
let (field, start_offset) = *field_start_offsets;
|
||||
field_offsets_map.insert(field, (start_offset, *stop_offset));
|
||||
}
|
||||
Ok(U32FastFieldsReader {
|
||||
field_offsets: field_offsets_map,
|
||||
source: source,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the u32 fast value reader if the field
|
||||
/// is a u32 field indexed as "fast".
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// Return None if the field is not a u32 field
|
||||
/// indexed with the fast option.
|
||||
/// This accessor should return as fast as possible.
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn get_field(&self, field: Field) -> Option<U32FastFieldReader> {
|
||||
self.field_offsets
|
||||
.get(&field)
|
||||
.map(|&(start, stop)| {
|
||||
let field_source = self.source.slice(start as usize, stop as usize);
|
||||
U32FastFieldReader::open(field_source)
|
||||
})
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment
|
||||
// `maxdoc`.
|
||||
pub fn get(&self, doc: DocId) -> Item {
|
||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize))
|
||||
}
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
pub fn get_range(&self, start: u32, output: &mut [Item]) {
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
|
||||
self.bit_unpacker.get_range(start, output_u64);
|
||||
for out in output_u64.iter_mut() {
|
||||
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn min_value(&self) -> Item {
|
||||
Item::from_u64(self.min_value_u64)
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn max_value(&self) -> Item {
|
||||
Item::from_u64(self.max_value_u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||
fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("__dummy__");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory
|
||||
.open_write(path)
|
||||
.expect("With a RAMDirectory, this should never fail.");
|
||||
let mut serializer = FastFieldSerializer::from_write(write)
|
||||
.expect("With a RAMDirectory, this should never fail.");
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
{
|
||||
let fast_field_writer = fast_field_writers
|
||||
.get_field_writer(field)
|
||||
.expect("With a RAMDirectory, this should never fail.");
|
||||
for val in vals {
|
||||
fast_field_writer.add_val(val.to_u64());
|
||||
}
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new())
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
|
||||
let source = directory.open_read(path).expect("Failed to open the file");
|
||||
let composite_file =
|
||||
CompositeFile::open(&source).expect("Failed to read the composite file");
|
||||
let field_source = composite_file
|
||||
.open_read(field)
|
||||
.expect("File component not found");
|
||||
FastFieldReader::open(field_source)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,106 +1,109 @@
|
||||
use common::BinarySerializable;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use common::bitpacker::{compute_num_bits, BitPacker};
|
||||
use std::io::{self, Write, Seek, SeekFrom};
|
||||
|
||||
use common::bitpacker::BitPacker;
|
||||
use common::compute_num_bits;
|
||||
use common::CountingWriter;
|
||||
use common::CompositeWrite;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
///
|
||||
///
|
||||
/// Fast fields are encoded using bit-packing.
|
||||
///
|
||||
///
|
||||
/// `FastFieldWriter`s are in charge of pushing the data to
|
||||
/// the serializer.
|
||||
/// The serializer expects to receive the following calls.
|
||||
///
|
||||
/// * `new_u32_fast_field(...)`
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `new_u32_fast_field(...)`
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `close()`
|
||||
pub struct FastFieldSerializer {
|
||||
write: WritePtr,
|
||||
written_size: usize,
|
||||
fields: Vec<(Field, u32)>,
|
||||
min_value: u32,
|
||||
field_open: bool,
|
||||
bit_packer: BitPacker,
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
}
|
||||
|
||||
|
||||
impl FastFieldSerializer {
|
||||
/// Constructor
|
||||
pub fn new(mut write: WritePtr) -> io::Result<FastFieldSerializer> {
|
||||
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let written_size: usize = try!(0u32.serialize(&mut write));
|
||||
Ok(FastFieldSerializer {
|
||||
write: write,
|
||||
written_size: written_size,
|
||||
fields: Vec::new(),
|
||||
min_value: 0,
|
||||
field_open: false,
|
||||
bit_packer: BitPacker::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
/// Start serializing a new u32 fast field
|
||||
pub fn new_u32_fast_field(&mut self, field: Field, min_value: u32, max_value: u32) -> io::Result<()> {
|
||||
if self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Previous field not closed"));
|
||||
}
|
||||
self.min_value = min_value;
|
||||
self.field_open = true;
|
||||
self.fields.push((field, self.written_size as u32));
|
||||
let write: &mut Write = &mut self.write;
|
||||
self.written_size += try!(min_value.serialize(write));
|
||||
let amplitude = max_value - min_value;
|
||||
self.written_size += try!(amplitude.serialize(write));
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
self.bit_packer = BitPacker::new(num_bits as usize);
|
||||
Ok(())
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(FastFieldSerializer { composite_write })
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
|
||||
}
|
||||
|
||||
/// Pushes a new value to the currently open u32 fast field.
|
||||
pub fn add_val(&mut self, val: u32) -> io::Result<()> {
|
||||
let val_to_write: u32 = val - self.min_value;
|
||||
self.bit_packer.write(val_to_write, &mut self.write)?;
|
||||
Ok(())
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
idx: usize,
|
||||
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastSingleFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
/// Close the u32 fast field.
|
||||
pub fn close_field(&mut self,) -> io::Result<()> {
|
||||
if !self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Current field is already closed"));
|
||||
}
|
||||
self.field_open = false;
|
||||
// adding some padding to make sure we
|
||||
// can read the last elements with our u64
|
||||
// cursor
|
||||
self.written_size += self.bit_packer.close(&mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
///
|
||||
/// After this call the data must be persistently save on disk.
|
||||
pub fn close(mut self,) -> io::Result<usize> {
|
||||
if self.field_open {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "Last field not closed"));
|
||||
}
|
||||
let header_offset: usize = self.written_size;
|
||||
self.written_size += try!(self.fields.serialize(&mut self.write));
|
||||
try!(self.write.seek(SeekFrom::Start(0)));
|
||||
try!((header_offset as u32).serialize(&mut self.write));
|
||||
try!(self.write.flush());
|
||||
Ok(self.written_size)
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(FastSingleFieldSerializer {
|
||||
write,
|
||||
bit_packer,
|
||||
min_value,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,121 +1,255 @@
|
||||
use schema::{Schema, Field, Document};
|
||||
use schema::{Cardinality, Document, Field, Schema};
|
||||
use fastfield::FastFieldSerializer;
|
||||
use std::io;
|
||||
use schema::Value;
|
||||
use DocId;
|
||||
use schema::FieldType;
|
||||
use common;
|
||||
use common::VInt;
|
||||
use std::collections::HashMap;
|
||||
use postings::UnorderedTermId;
|
||||
use super::multivalued::MultiValueIntFastFieldWriter;
|
||||
use common::BinarySerializable;
|
||||
|
||||
pub struct U32FastFieldsWriter {
|
||||
field_writers: Vec<U32FastFieldWriter>,
|
||||
/// The fastfieldswriter regroup all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
single_value_writers: Vec<IntFastFieldWriter>,
|
||||
multi_values_writers: Vec<MultiValueIntFastFieldWriter>,
|
||||
}
|
||||
|
||||
impl U32FastFieldsWriter {
|
||||
impl FastFieldsWriter {
|
||||
/// Create all `FastFieldWriter` required by the schema.
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let mut single_value_writers = Vec::new();
|
||||
let mut multi_values_writers = Vec::new();
|
||||
|
||||
pub fn from_schema(schema: &Schema) -> U32FastFieldsWriter {
|
||||
let u32_fields: Vec<Field> = schema.fields()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|&(_, field_entry)| field_entry.is_u32_fast())
|
||||
.map(|(field_id, _)| Field(field_id as u8))
|
||||
.collect();
|
||||
U32FastFieldsWriter::new(u32_fields)
|
||||
}
|
||||
|
||||
pub fn new(fields: Vec<Field>) -> U32FastFieldsWriter {
|
||||
U32FastFieldsWriter {
|
||||
field_writers: fields
|
||||
.into_iter()
|
||||
.map(U32FastFieldWriter::new)
|
||||
.collect(),
|
||||
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||
let field = Field(field_id as u32);
|
||||
let default_value = if let FieldType::I64(_) = *field_entry.field_type() {
|
||||
common::i64_to_u64(0i64)
|
||||
} else {
|
||||
0u64
|
||||
};
|
||||
match *field_entry.field_type() {
|
||||
FieldType::I64(ref int_options) | FieldType::U64(ref int_options) => {
|
||||
match int_options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let mut fast_field_writer = IntFastFieldWriter::new(field);
|
||||
fast_field_writer.set_val_if_missing(default_value);
|
||||
single_value_writers.push(fast_field_writer);
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, false);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
FieldType::HierarchicalFacet => {
|
||||
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
FastFieldsWriter {
|
||||
single_value_writers,
|
||||
multi_values_writers,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut U32FastFieldWriter> {
|
||||
self.field_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field == field)
|
||||
|
||||
/// Returns a `FastFieldsWriter with a `u64` `IntFastFieldWriter` for each
|
||||
/// of the field given in argument.
|
||||
pub(crate) fn new(fields: Vec<Field>) -> FastFieldsWriter {
|
||||
FastFieldsWriter {
|
||||
single_value_writers: fields.into_iter().map(IntFastFieldWriter::new).collect(),
|
||||
multi_values_writers: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_field_writer(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.single_value_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Returns the fast field multi-value writer for the given field.
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
/// configured as a multivalued fastfield in the schema.
|
||||
pub(crate) fn get_multivalue_writer(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValueIntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
// TODO expose for users
|
||||
self.multi_values_writers
|
||||
.iter_mut()
|
||||
.find(|multivalue_writer| multivalue_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
for field_writer in &mut self.field_writers {
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.multi_values_writers {
|
||||
field_writer.next_doc();
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
|
||||
for field_writer in &self.field_writers {
|
||||
try!(field_writer.serialize(serializer));
|
||||
/// Serializes all of the `FastFieldWriter`s by pushing them in
|
||||
/// order to the fast field serializer.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
|
||||
) -> io::Result<()> {
|
||||
for field_writer in &self.single_value_writers {
|
||||
field_writer.serialize(serializer)?;
|
||||
}
|
||||
for field_writer in &self.multi_values_writers {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Ensures all of the fast field writers have
|
||||
/// reached `doc`. (included)
|
||||
///
|
||||
///
|
||||
/// The missing values will be filled with 0.
|
||||
pub fn fill_val_up_to(&mut self, doc: DocId) {
|
||||
for field_writer in &mut self.field_writers {
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
field_writer.fill_val_up_to(doc);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
pub struct U32FastFieldWriter {
|
||||
/// Fast field writer for ints.
|
||||
/// The fast field writer just keeps the values in memory.
|
||||
///
|
||||
/// Only when the segment writer can be closed and
|
||||
/// persisted on disc, the fast field writer is
|
||||
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
|
||||
/// method.
|
||||
///
|
||||
/// We cannot serialize earlier as the values are
|
||||
/// bitpacked and the number of bits required for bitpacking
|
||||
/// can only been known once we have seen all of the values.
|
||||
///
|
||||
/// Both u64, and i64 use the same writer.
|
||||
/// i64 are just remapped to the `0..2^64 - 1`
|
||||
/// using `common::i64_to_u64`.
|
||||
pub struct IntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u32>,
|
||||
vals: Vec<u8>,
|
||||
val_count: usize,
|
||||
val_if_missing: u64,
|
||||
val_min: u64,
|
||||
val_max: u64,
|
||||
}
|
||||
|
||||
impl U32FastFieldWriter {
|
||||
pub fn new(field: Field) -> U32FastFieldWriter {
|
||||
U32FastFieldWriter {
|
||||
field: field,
|
||||
impl IntFastFieldWriter {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field) -> IntFastFieldWriter {
|
||||
IntFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
val_count: 0,
|
||||
val_if_missing: 0u64,
|
||||
val_min: u64::max_value(),
|
||||
val_max: 0,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Returns the field that this writer is targetting.
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
|
||||
/// Sets the default value.
|
||||
///
|
||||
/// This default value is recorded for documents if
|
||||
/// a document does not have any value.
|
||||
fn set_val_if_missing(&mut self, val_if_missing: u64) {
|
||||
self.val_if_missing = val_if_missing;
|
||||
}
|
||||
|
||||
/// Ensures all of the fast field writer have
|
||||
/// reached `doc`. (included)
|
||||
///
|
||||
///
|
||||
/// The missing values will be filled with 0.
|
||||
fn fill_val_up_to(&mut self, doc: DocId) {
|
||||
let target = doc as usize + 1;
|
||||
debug_assert!(self.vals.len() <= target);
|
||||
while self.vals.len() < target {
|
||||
self.add_val(0u32)
|
||||
debug_assert!(self.val_count <= target);
|
||||
let val_if_missing = self.val_if_missing;
|
||||
while self.val_count < target {
|
||||
self.add_val(val_if_missing);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_val(&mut self, val: u32) {
|
||||
self.vals.push(val);
|
||||
|
||||
/// Records a new value.
|
||||
///
|
||||
/// The n-th value being recorded is implicitely
|
||||
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: u64) {
|
||||
VInt(val)
|
||||
.serialize(&mut self.vals)
|
||||
.expect("unable to serialize VInt to Vec");
|
||||
|
||||
if val > self.val_max {
|
||||
self.val_max = val;
|
||||
}
|
||||
if val < self.val_min {
|
||||
self.val_min = val;
|
||||
}
|
||||
|
||||
self.val_count += 1;
|
||||
}
|
||||
|
||||
fn extract_val(&self, doc: &Document) -> u32 {
|
||||
|
||||
/// Extract the value associated to the fast field for
|
||||
/// this document.
|
||||
///
|
||||
/// i64 are remapped to u64 using the logic
|
||||
/// in `common::i64_to_u64`.
|
||||
///
|
||||
/// If the value is missing, then the default value is used
|
||||
/// instead.
|
||||
/// If the document has more than one value for the given field,
|
||||
/// only the first one is taken in account.
|
||||
fn extract_val(&self, doc: &Document) -> u64 {
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => {
|
||||
match *v {
|
||||
Value::U32(ref val) => { *val }
|
||||
_ => { panic!("Expected a u32field, got {:?} ", v) }
|
||||
}
|
||||
},
|
||||
None => {
|
||||
0u32
|
||||
}
|
||||
Some(v) => super::value_to_u64(v),
|
||||
None => self.val_if_missing,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Extract the fast field value from the document
|
||||
/// (or use the default value) and records it.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
let val = self.extract_val(doc);
|
||||
self.add_val(val);
|
||||
}
|
||||
|
||||
/// Push the fast fields value to the `FastFieldWriter`.
|
||||
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
|
||||
let zero = 0;
|
||||
let min = *self.vals.iter().min().unwrap_or(&zero);
|
||||
let max = *self.vals.iter().max().unwrap_or(&min);
|
||||
try!(serializer.new_u32_fast_field(self.field, min, max));
|
||||
for &val in &self.vals {
|
||||
try!(serializer.add_val(val));
|
||||
let (min, max) = if self.val_min > self.val_max {
|
||||
(0, 0)
|
||||
} else {
|
||||
(self.val_min, self.val_max)
|
||||
};
|
||||
|
||||
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
|
||||
|
||||
let mut cursor = self.vals.as_slice();
|
||||
while let Ok(VInt(val)) = VInt::deserialize(&mut cursor) {
|
||||
single_field_serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()
|
||||
|
||||
single_field_serializer.close_field()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use Index;
|
||||
use Searcher;
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
|
||||
fn check_index_content(searcher: &Searcher, vals: &HashSet<u32>) {
|
||||
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
assert!(searcher.segment_readers().len() < 20);
|
||||
assert_eq!(searcher.num_docs() as usize, vals.len());
|
||||
}
|
||||
@@ -14,22 +14,21 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u32>) {
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_indexing() {
|
||||
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
|
||||
let id_field = schema_builder.add_u32_field("id", U32_INDEXED);
|
||||
let multiples_field = schema_builder.add_u32_field("multiples", U32_INDEXED);
|
||||
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
|
||||
let universe = Range::new(0u32, 20u32);
|
||||
let universe = Range::new(0u64, 20u64);
|
||||
let mut rng = thread_rng();
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
|
||||
|
||||
let mut committed_docs: HashSet<u32> = HashSet::new();
|
||||
let mut uncommitted_docs: HashSet<u32> = HashSet::new();
|
||||
let mut committed_docs: HashSet<u64> = HashSet::new();
|
||||
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
|
||||
|
||||
for _ in 0..200 {
|
||||
let random_val = universe.ind_sample(&mut rng);
|
||||
@@ -41,19 +40,16 @@ fn test_indexing() {
|
||||
let searcher = index.searcher();
|
||||
// check that everything is correct.
|
||||
check_index_content(&searcher, &committed_docs);
|
||||
}
|
||||
else {
|
||||
if committed_docs.remove(&random_val) ||
|
||||
uncommitted_docs.remove(&random_val) {
|
||||
let doc_id_term = Term::from_field_u32(id_field, random_val);
|
||||
} else {
|
||||
if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
|
||||
let doc_id_term = Term::from_field_u64(id_field, random_val);
|
||||
index_writer.delete_term(doc_id_term);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
uncommitted_docs.insert(random_val);
|
||||
let mut doc = Document::new();
|
||||
doc.add_u32(id_field, random_val);
|
||||
for i in 1u32..10u32 {
|
||||
doc.add_u32(multiples_field, random_val * i);
|
||||
doc.add_u64(id_field, random_val);
|
||||
for i in 1u64..10u64 {
|
||||
doc.add_u64(multiples_field, random_val * i);
|
||||
}
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
|
||||
@@ -3,20 +3,19 @@ use std::sync::{Arc, RwLock};
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
|
||||
|
||||
// The DeleteQueue is similar in conceptually to a multiple
|
||||
// consumer single producer broadcast channel.
|
||||
//
|
||||
//
|
||||
// All consumer will receive all messages.
|
||||
//
|
||||
//
|
||||
// Consumer of the delete queue are holding a `DeleteCursor`,
|
||||
// which points to a specific place of the `DeleteQueue`.
|
||||
//
|
||||
//
|
||||
// New consumer can be created in two ways
|
||||
// - calling `delete_queue.cursor()` returns a cursor, that
|
||||
// - calling `delete_queue.cursor()` returns a cursor, that
|
||||
// will include all future delete operation (and no past operations).
|
||||
// - cloning an existing cursor returns a new cursor, that
|
||||
// is at the exact same position, and can now advance independantly
|
||||
// is at the exact same position, and can now advance independently
|
||||
// from the original cursor.
|
||||
#[derive(Default)]
|
||||
struct InnerDeleteQueue {
|
||||
@@ -29,34 +28,28 @@ pub struct DeleteQueue {
|
||||
inner: Arc<RwLock<InnerDeleteQueue>>,
|
||||
}
|
||||
|
||||
|
||||
impl DeleteQueue {
|
||||
|
||||
// Creates a new delete queue.
|
||||
pub fn new() -> DeleteQueue {
|
||||
|
||||
let delete_queue = DeleteQueue {
|
||||
inner: Arc::default(),
|
||||
};
|
||||
|
||||
|
||||
let next_block = NextBlock::from(delete_queue.clone());
|
||||
{
|
||||
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
|
||||
delete_queue_wlock.last_block = Some(
|
||||
Arc::new(Block {
|
||||
operations: Arc::default(),
|
||||
next: next_block,
|
||||
})
|
||||
);
|
||||
delete_queue_wlock.last_block = Some(Arc::new(Block {
|
||||
operations: Arc::default(),
|
||||
next: next_block,
|
||||
}));
|
||||
}
|
||||
|
||||
delete_queue
|
||||
}
|
||||
|
||||
|
||||
// Creates a new cursor that makes it possible to
|
||||
// Creates a new cursor that makes it possible to
|
||||
// consume future delete operations.
|
||||
//
|
||||
//
|
||||
// Past delete operations are not accessible.
|
||||
pub fn cursor(&self) -> DeleteCursor {
|
||||
let last_block = self.inner
|
||||
@@ -64,9 +57,11 @@ impl DeleteQueue {
|
||||
.expect("Read lock poisoned when opening delete queue cursor")
|
||||
.last_block
|
||||
.clone()
|
||||
.expect("Failed to unwrap last_block. This should never happen
|
||||
.expect(
|
||||
"Failed to unwrap last_block. This should never happen
|
||||
as the Option<> is only here to make
|
||||
initialization possible");
|
||||
initialization possible",
|
||||
);
|
||||
let operations_len = last_block.operations.len();
|
||||
DeleteCursor {
|
||||
block: last_block,
|
||||
@@ -85,40 +80,37 @@ impl DeleteQueue {
|
||||
|
||||
// DeleteQueue is a linked list of blocks of
|
||||
// delete operations.
|
||||
//
|
||||
//
|
||||
// Writing happens by simply appending to a vec.
|
||||
// `.flush()` takes this pending delete operations vec
|
||||
// creates a new read-only block from it,
|
||||
// creates a new read-only block from it,
|
||||
// and appends it to the linked list.
|
||||
//
|
||||
// `.flush()` happens when, for instance,
|
||||
//
|
||||
// `.flush()` happens when, for instance,
|
||||
// a consumer reaches the last read-only operations.
|
||||
// It then ask the delete queue if there happen to
|
||||
// It then ask the delete queue if there happen to
|
||||
// be some unflushed operations.
|
||||
//
|
||||
fn flush(&self) -> Option<Arc<Block>> {
|
||||
let mut self_wlock = self
|
||||
.inner
|
||||
let mut self_wlock = self.inner
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on delete queue writer");
|
||||
|
||||
|
||||
let delete_operations;
|
||||
{
|
||||
let writer: &mut Vec<DeleteOperation> = &mut self_wlock.writer;
|
||||
if writer.is_empty() {
|
||||
return None;
|
||||
}
|
||||
delete_operations = mem::replace(writer, vec!());
|
||||
delete_operations = mem::replace(writer, vec![]);
|
||||
}
|
||||
|
||||
let next_block = NextBlock::from(self.clone());
|
||||
{
|
||||
self_wlock.last_block = Some(
|
||||
Arc::new(Block {
|
||||
operations: Arc::new(delete_operations),
|
||||
next: next_block,
|
||||
})
|
||||
);
|
||||
self_wlock.last_block = Some(Arc::new(Block {
|
||||
operations: Arc::new(delete_operations),
|
||||
next: next_block,
|
||||
}));
|
||||
}
|
||||
self_wlock.last_block.clone()
|
||||
}
|
||||
@@ -137,17 +129,14 @@ impl From<DeleteQueue> for NextBlock {
|
||||
}
|
||||
}
|
||||
|
||||
impl NextBlock {
|
||||
impl NextBlock {
|
||||
fn next_block(&self) -> Option<Arc<Block>> {
|
||||
{
|
||||
let next_read_lock = self.0
|
||||
.read()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
match *next_read_lock {
|
||||
InnerNextBlock::Closed(ref block) => {
|
||||
return Some(block.clone());
|
||||
}
|
||||
_ => {}
|
||||
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
|
||||
return Some(Arc::clone(block));
|
||||
}
|
||||
}
|
||||
let next_block;
|
||||
@@ -157,21 +146,19 @@ impl NextBlock {
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
match *next_write_lock {
|
||||
InnerNextBlock::Closed(ref block) => {
|
||||
return Some(block.clone());
|
||||
return Some(Arc::clone(block));
|
||||
}
|
||||
InnerNextBlock::Writer(ref writer) => {
|
||||
match writer.flush() {
|
||||
Some(flushed_next_block) => {
|
||||
next_block = flushed_next_block;
|
||||
}
|
||||
None => {
|
||||
return None;
|
||||
}
|
||||
InnerNextBlock::Writer(ref writer) => match writer.flush() {
|
||||
Some(flushed_next_block) => {
|
||||
next_block = flushed_next_block;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
return None;
|
||||
}
|
||||
},
|
||||
}
|
||||
*next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone()); // TODO fix
|
||||
return Some(next_block)
|
||||
*next_write_lock.deref_mut() = InnerNextBlock::Closed(Arc::clone(&next_block));
|
||||
Some(next_block)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -181,40 +168,37 @@ struct Block {
|
||||
next: NextBlock,
|
||||
}
|
||||
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DeleteCursor {
|
||||
block: Arc<Block>,
|
||||
pos: usize,
|
||||
}
|
||||
|
||||
|
||||
impl DeleteCursor {
|
||||
|
||||
impl DeleteCursor {
|
||||
/// Skips operations and position it so that
|
||||
/// - either all of the delete operation currently in the
|
||||
/// - either all of the delete operation currently in the
|
||||
/// queue are consume and the next get will return None.
|
||||
/// - the next get will return the first operation with an
|
||||
/// `opstamp >= target_opstamp`.
|
||||
pub fn skip_to(&mut self, target_opstamp: u64) {
|
||||
// TODO Can be optimize as we work with block.
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))]
|
||||
loop {
|
||||
if let Some(operation) = self.get() {
|
||||
if operation.opstamp >= target_opstamp {
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
self.advance();
|
||||
}
|
||||
}
|
||||
|
||||
/// If the current block has been entirely
|
||||
/// If the current block has been entirely
|
||||
/// consumed, try to load the next one.
|
||||
///
|
||||
/// Return `true`, if after this attempt,
|
||||
///
|
||||
/// Return `true`, if after this attempt,
|
||||
/// the cursor is on a block that has not
|
||||
/// been entirely consumed.
|
||||
/// Return `false`, if we have reached the end of the queue.
|
||||
@@ -229,24 +213,20 @@ impl DeleteCursor {
|
||||
self.pos = 0;
|
||||
true
|
||||
}
|
||||
None => {
|
||||
false
|
||||
}
|
||||
None => false,
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Advance to the next delete operation.
|
||||
/// Returns true iff there is such an operation.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
if self.load_block_if_required() {
|
||||
self.pos += 1;
|
||||
true
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
@@ -256,34 +236,27 @@ impl DeleteCursor {
|
||||
pub fn get(&mut self) -> Option<&DeleteOperation> {
|
||||
if self.load_block_if_required() {
|
||||
Some(&self.block.operations[self.pos])
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::{DeleteQueue, DeleteOperation};
|
||||
use schema::{Term, Field};
|
||||
use super::{DeleteOperation, DeleteQueue};
|
||||
use schema::{Field, Term};
|
||||
|
||||
#[test]
|
||||
fn test_deletequeue() {
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
|
||||
let make_op = |i: usize| {
|
||||
let field = Field(1u8);
|
||||
let field = Field(1u32);
|
||||
DeleteOperation {
|
||||
opstamp: i as u64,
|
||||
term: Term::from_field_u32(field, i as u32)
|
||||
term: Term::from_field_u64(field, i as u64),
|
||||
}
|
||||
};
|
||||
|
||||
@@ -299,7 +272,7 @@ mod tests {
|
||||
operations_it.advance();
|
||||
assert!(operations_it.get().is_none());
|
||||
operations_it.advance();
|
||||
|
||||
|
||||
let mut snapshot2 = delete_queue.cursor();
|
||||
assert!(snapshot2.get().is_none());
|
||||
delete_queue.push(make_op(3));
|
||||
@@ -310,7 +283,7 @@ mod tests {
|
||||
assert!(operations_it.get().is_none());
|
||||
operations_it.advance();
|
||||
}
|
||||
{
|
||||
{
|
||||
let mut operations_it = snapshot.clone();
|
||||
assert_eq!(operations_it.get().unwrap().opstamp, 1);
|
||||
operations_it.advance();
|
||||
@@ -320,6 +293,5 @@ mod tests {
|
||||
operations_it.advance();
|
||||
assert!(operations_it.get().is_none());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,8 +2,6 @@ use Directory;
|
||||
use directory::error::OpenWriteError;
|
||||
use core::LOCKFILE_FILEPATH;
|
||||
|
||||
|
||||
|
||||
/// The directory lock is a mechanism used to
|
||||
/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
|
||||
///
|
||||
@@ -15,8 +13,8 @@ pub struct DirectoryLock {
|
||||
|
||||
impl DirectoryLock {
|
||||
pub fn lock(mut directory: Box<Directory>) -> Result<DirectoryLock, OpenWriteError> {
|
||||
try!(directory.open_write(&*LOCKFILE_FILEPATH));
|
||||
Ok(DirectoryLock { directory: directory })
|
||||
directory.open_write(&*LOCKFILE_FILEPATH)?;
|
||||
Ok(DirectoryLock { directory })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,4 +24,4 @@ impl Drop for DirectoryLock {
|
||||
error!("Failed to remove the lock file. {:?}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
use std::sync::Arc;
|
||||
use DocId;
|
||||
|
||||
|
||||
// Doc to opstamp is used to identify which
|
||||
// document should be deleted.
|
||||
//
|
||||
//
|
||||
// Since the docset matching the query of a delete operation
|
||||
// is not computed right when the delete operation is received,
|
||||
// we need to find a way to evaluate, for each document,
|
||||
@@ -14,13 +13,13 @@ use DocId;
|
||||
//
|
||||
// The doc to opstamp mapping stores precisely an array
|
||||
// indexed by doc id and storing the opstamp of the document.
|
||||
//
|
||||
//
|
||||
// This mapping is (for the moment) stricly increasing
|
||||
// because of the way document id are allocated.
|
||||
#[derive(Clone)]
|
||||
pub enum DocToOpstampMapping {
|
||||
WithMap(Arc<Vec<u64>>),
|
||||
None
|
||||
None,
|
||||
}
|
||||
|
||||
impl From<Vec<u64>> for DocToOpstampMapping {
|
||||
@@ -30,9 +29,8 @@ impl From<Vec<u64>> for DocToOpstampMapping {
|
||||
}
|
||||
|
||||
impl DocToOpstampMapping {
|
||||
|
||||
/// Given an opstamp return the limit doc id L
|
||||
/// such that all doc id D such that
|
||||
/// such that all doc id D such that
|
||||
// D >= L iff opstamp(D) >= than `target_opstamp`.
|
||||
//
|
||||
// The edge case opstamp = some doc opstamp is in practise
|
||||
@@ -41,8 +39,7 @@ impl DocToOpstampMapping {
|
||||
match *self {
|
||||
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
|
||||
match doc_opstamps.binary_search(&target_opstamp) {
|
||||
Ok(doc_id) => doc_id as DocId,
|
||||
Err(doc_id) => doc_id as DocId,
|
||||
Ok(doc_id) | Err(doc_id) => doc_id as DocId,
|
||||
}
|
||||
}
|
||||
DocToOpstampMapping::None => DocId::max_value(),
|
||||
@@ -58,23 +55,26 @@ mod tests {
|
||||
#[test]
|
||||
fn test_doc_to_opstamp_mapping_none() {
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::None;
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1), u32::max_value());
|
||||
assert_eq!(
|
||||
doc_to_opstamp_mapping.compute_doc_limit(1),
|
||||
u32::max_value()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_to_opstamp_mapping_complex() {
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!());
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0);
|
||||
}
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64));
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1);
|
||||
}
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64, 12u64, 17u64, 23u64));
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64, 12u64, 17u64, 23u64]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
for i in 2u64..13u64 {
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1);
|
||||
@@ -90,4 +90,4 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,39 +9,39 @@ use core::SegmentReader;
|
||||
use indexer::stamper::Stamper;
|
||||
use datastruct::stacker::Heap;
|
||||
use directory::FileProtection;
|
||||
use Error;
|
||||
use Directory;
|
||||
use fastfield::delete::write_delete_bitset;
|
||||
use error::{Error, ErrorKind, Result, ResultExt};
|
||||
use fastfield::write_delete_bitset;
|
||||
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
use futures::Canceled;
|
||||
use datastruct::stacker::hashmap::split_memory;
|
||||
use futures::Future;
|
||||
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||
use indexer::MergePolicy;
|
||||
use indexer::operation::DeleteOperation;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentWriter;
|
||||
use postings::DocSet;
|
||||
use postings::SegmentPostingsOption;
|
||||
use Result;
|
||||
use docset::DocSet;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Document;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use std::mem;
|
||||
use std::mem::swap;
|
||||
use std::mem::swap;
|
||||
use std::thread::JoinHandle;
|
||||
use super::directory_lock::DirectoryLock;
|
||||
use indexer::DirectoryLock;
|
||||
use super::operation::AddOperation;
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use std::thread;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
// in the heap goes below MARGIN_IN_BYTES.
|
||||
pub const MARGIN_IN_BYTES: u32 = 10_000_000u32;
|
||||
pub const MARGIN_IN_BYTES: u32 = 1_000_000u32;
|
||||
|
||||
// We impose the memory per thread to be at least 30 MB.
|
||||
// We impose the memory per thread to be at least 3 MB.
|
||||
pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32;
|
||||
|
||||
// Add document will block if the number of docs waiting in the queue to be indexed reaches PIPELINE_MAX_SIZE_IN_DOCS
|
||||
// Add document will block if the number of docs waiting in the queue to be indexed
|
||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
|
||||
type DocumentSender = chan::Sender<AddOperation>;
|
||||
@@ -51,14 +51,13 @@ type DocumentReceiver = chan::Receiver<AddOperation>;
|
||||
///
|
||||
/// It manages a small number of indexing thread, as well as a shared
|
||||
/// indexing queue.
|
||||
/// Each indexing thread builds its own independant `Segment`, via
|
||||
/// Each indexing thread builds its own independent `Segment`, via
|
||||
/// a `SegmentWriter` object.
|
||||
pub struct IndexWriter {
|
||||
|
||||
// the lock is just used to bind the
|
||||
// the lock is just used to bind the
|
||||
// lifetime of the lock with that of the IndexWriter.
|
||||
_directory_lock: DirectoryLock,
|
||||
|
||||
_directory_lock: Option<DirectoryLock>,
|
||||
|
||||
index: Index,
|
||||
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
@@ -86,8 +85,6 @@ pub struct IndexWriter {
|
||||
impl !Send for IndexWriter {}
|
||||
impl !Sync for IndexWriter {}
|
||||
|
||||
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
///
|
||||
/// The lockfile should be deleted on drop, but it is possible
|
||||
@@ -96,7 +93,7 @@ impl !Sync for IndexWriter {}
|
||||
/// `IndexWriter` on the system is accessing the index directory,
|
||||
/// it is safe to manually delete the lockfile.
|
||||
///
|
||||
/// num_threads specifies the number of indexing workers that
|
||||
/// `num_threads` specifies the number of indexing workers that
|
||||
/// should work at the same time.
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
@@ -105,79 +102,79 @@ impl !Sync for IndexWriter {}
|
||||
pub fn open_index_writer(
|
||||
index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize) -> Result<IndexWriter> {
|
||||
|
||||
if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize {
|
||||
panic!(format!("The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_LIMIT));
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> Result<IndexWriter> {
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_LIMIT as usize {
|
||||
panic!(format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_LIMIT
|
||||
));
|
||||
}
|
||||
|
||||
let directory_lock = try!(DirectoryLock::lock(index.directory().box_clone()));
|
||||
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let stamper = Stamper::new(index.opstamp());
|
||||
|
||||
let segment_updater = SegmentUpdater::new(index.clone(),
|
||||
stamper.clone(),
|
||||
delete_queue.cursor())?;
|
||||
|
||||
let current_opstamp = index.load_metas()?.opstamp;
|
||||
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
|
||||
let segment_updater =
|
||||
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||
|
||||
let mut index_writer = IndexWriter {
|
||||
|
||||
_directory_lock: directory_lock,
|
||||
|
||||
heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread,
|
||||
_directory_lock: Some(directory_lock),
|
||||
|
||||
heap_size_in_bytes_per_thread,
|
||||
index: index.clone(),
|
||||
|
||||
document_receiver: document_receiver,
|
||||
document_sender: document_sender,
|
||||
document_receiver,
|
||||
document_sender,
|
||||
|
||||
segment_updater: segment_updater,
|
||||
segment_updater,
|
||||
|
||||
workers_join_handle: vec!(),
|
||||
num_threads: num_threads,
|
||||
workers_join_handle: vec![],
|
||||
num_threads,
|
||||
|
||||
delete_queue: delete_queue,
|
||||
delete_queue,
|
||||
|
||||
committed_opstamp: index.opstamp(),
|
||||
stamper: stamper,
|
||||
committed_opstamp: current_opstamp,
|
||||
stamper,
|
||||
|
||||
generation: 0,
|
||||
|
||||
worker_id: 0,
|
||||
};
|
||||
try!(index_writer.start_workers());
|
||||
index_writer.start_workers()?;
|
||||
Ok(index_writer)
|
||||
}
|
||||
|
||||
|
||||
|
||||
pub fn compute_deleted_bitset(
|
||||
delete_bitset: &mut BitSet,
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: DocToOpstampMapping,
|
||||
target_opstamp: u64) -> Result<bool> {
|
||||
|
||||
doc_opstamps: &DocToOpstampMapping,
|
||||
target_opstamp: u64,
|
||||
) -> Result<bool> {
|
||||
let mut might_have_changed = false;
|
||||
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))]
|
||||
loop {
|
||||
if let Some(delete_op) = delete_cursor.get() {
|
||||
if delete_op.opstamp > target_opstamp {
|
||||
break;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
// A delete operation should only affect
|
||||
// document that were inserted after it.
|
||||
//
|
||||
//
|
||||
// Limit doc helps identify the first document
|
||||
// that may be affected by the delete operation.
|
||||
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
|
||||
if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
|
||||
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
|
||||
if let Some(mut docset) =
|
||||
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)
|
||||
{
|
||||
while docset.advance() {
|
||||
let deleted_doc = docset.doc();
|
||||
if deleted_doc < limit_doc {
|
||||
@@ -187,8 +184,7 @@ pub fn compute_deleted_bitset(
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
delete_cursor.advance();
|
||||
@@ -196,17 +192,13 @@ pub fn compute_deleted_bitset(
|
||||
Ok(might_have_changed)
|
||||
}
|
||||
|
||||
|
||||
// TODO skip delete operation before teh
|
||||
// last delete opstamp
|
||||
|
||||
/// Advance delete for the given segment up
|
||||
/// to the target opstamp.
|
||||
pub fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64) -> Result<Option<FileProtection>> {
|
||||
|
||||
target_opstamp: u64,
|
||||
) -> Result<Option<FileProtection>> {
|
||||
let mut file_protect: Option<FileProtection> = None;
|
||||
|
||||
{
|
||||
@@ -216,26 +208,24 @@ pub fn advance_deletes(
|
||||
return Ok(file_protect);
|
||||
}
|
||||
}
|
||||
let segment_reader = SegmentReader::open(segment.clone())?;
|
||||
let segment_reader = SegmentReader::open(&segment)?;
|
||||
let max_doc = segment_reader.max_doc();
|
||||
|
||||
let mut delete_bitset: BitSet =
|
||||
match segment_entry.delete_bitset() {
|
||||
Some(ref previous_delete_bitset) =>
|
||||
(*previous_delete_bitset).clone(),
|
||||
None =>
|
||||
BitSet::with_capacity(max_doc as usize)
|
||||
};
|
||||
|
||||
|
||||
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
|
||||
Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
|
||||
None => BitSet::with_capacity(max_doc as usize),
|
||||
};
|
||||
|
||||
let delete_cursor = segment_entry.delete_cursor();
|
||||
|
||||
compute_deleted_bitset(
|
||||
&mut delete_bitset,
|
||||
&segment_reader,
|
||||
delete_cursor,
|
||||
DocToOpstampMapping::None,
|
||||
target_opstamp)?;
|
||||
|
||||
&DocToOpstampMapping::None,
|
||||
target_opstamp,
|
||||
)?;
|
||||
|
||||
for doc in 0u32..max_doc {
|
||||
if segment_reader.is_deleted(doc) {
|
||||
delete_bitset.insert(doc as usize);
|
||||
@@ -255,30 +245,55 @@ pub fn advance_deletes(
|
||||
Ok(file_protect)
|
||||
}
|
||||
|
||||
fn index_documents(heap: &mut Heap,
|
||||
segment: Segment,
|
||||
schema: &Schema,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item=AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor)
|
||||
-> Result<bool> {
|
||||
fn index_documents(
|
||||
heap: &mut Heap,
|
||||
table_size: usize,
|
||||
segment: &Segment,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
heap.clear();
|
||||
let schema = segment.schema();
|
||||
let segment_id = segment.id();
|
||||
let mut segment_writer = SegmentWriter::for_segment(heap, segment.clone(), &schema)?;
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(heap, table_size, segment.clone(), &schema)?;
|
||||
for doc in document_iterator {
|
||||
try!(segment_writer.add_document(&doc, &schema));
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
// There is two possible conditions to close the segment.
|
||||
// One is the memory arena dedicated to the segment is
|
||||
// getting full.
|
||||
if segment_writer.is_buffer_full() {
|
||||
info!("Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc());
|
||||
info!(
|
||||
"Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
// The second is the term dictionary hash table
|
||||
// is reaching saturation.
|
||||
//
|
||||
// Tantivy does not resize its hashtable. When it reaches
|
||||
// capacity, we just stop indexing new document.
|
||||
if segment_writer.is_term_saturated() {
|
||||
info!(
|
||||
"Term dic saturated, flushing segment with maxdoc={}.",
|
||||
segment_writer.max_doc()
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !segment_updater.is_alive() {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let num_docs = segment_writer.max_doc();
|
||||
|
||||
|
||||
// this is ensured by the call to peek before starting
|
||||
// the worker thread.
|
||||
assert!(num_docs > 0);
|
||||
assert!(num_docs > 0);
|
||||
|
||||
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
|
||||
|
||||
@@ -286,7 +301,7 @@ fn index_documents(heap: &mut Heap,
|
||||
segment_meta.set_max_doc(num_docs);
|
||||
|
||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
||||
|
||||
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
@@ -294,106 +309,114 @@ fn index_documents(heap: &mut Heap,
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
doc_to_opstamps,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
|
||||
let segment_entry = SegmentEntry::new(
|
||||
segment_meta,
|
||||
delete_cursor,
|
||||
{ if may_have_deletes { Some(deleted_bitset) }
|
||||
else { None } }
|
||||
);
|
||||
|
||||
Ok(
|
||||
segment_updater
|
||||
.add_segment(generation, segment_entry)
|
||||
)
|
||||
|
||||
}
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
|
||||
if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
Ok(segment_updater.add_segment(generation, segment_entry))
|
||||
}
|
||||
|
||||
impl IndexWriter {
|
||||
/// The index writer
|
||||
pub fn wait_merging_threads(mut self) -> Result<()> {
|
||||
|
||||
// this will stop the indexing thread,
|
||||
// dropping the last reference to the segment_updater.
|
||||
drop(self.document_sender);
|
||||
|
||||
|
||||
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec!());
|
||||
|
||||
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
|
||||
for join_handle in former_workers_handles {
|
||||
try!(join_handle.join()
|
||||
join_handle
|
||||
.join()
|
||||
.expect("Indexing Worker thread panicked")
|
||||
.map_err(|e| {
|
||||
Error::ErrorInThread(format!("Error in indexing worker thread. {:?}", e))
|
||||
}));
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?;
|
||||
}
|
||||
drop(self.workers_join_handle);
|
||||
|
||||
let result = self.segment_updater
|
||||
.wait_merging_thread()
|
||||
.map_err(|_|
|
||||
Error::ErrorInThread("Failed to join merging thread.".to_string())
|
||||
);
|
||||
|
||||
if let &Err(ref e) = &result {
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
|
||||
|
||||
if let Err(ref e) = result {
|
||||
error!("Some merging thread failed {:?}", e);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
|
||||
let delete_cursor = self.delete_queue.cursor();
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
|
||||
self.segment_updater
|
||||
.add_segment(self.generation, segment_entry);
|
||||
}
|
||||
|
||||
/// *Experimental & Advanced API* Creates a new segment.
|
||||
/// and marks it as currently in write.
|
||||
///
|
||||
/// This method is useful only for users trying to do complex
|
||||
/// operations, like converting an index format to another.
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
self.segment_updater.new_segment()
|
||||
}
|
||||
|
||||
/// Spawns a new worker thread for indexing.
|
||||
/// The thread consumes documents from the pipeline.
|
||||
///
|
||||
fn add_indexing_worker(&mut self) -> Result<()> {
|
||||
let schema = self.index.schema();
|
||||
let document_receiver_clone = self.document_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread);
|
||||
|
||||
let (heap_size, table_size) = split_memory(self.heap_size_in_bytes_per_thread);
|
||||
info!("heap size {}, table_size {}", heap_size, table_size);
|
||||
let mut heap = Heap::with_capacity(heap_size);
|
||||
|
||||
let generation = self.generation;
|
||||
|
||||
|
||||
let mut delete_cursor = self.delete_queue.cursor();
|
||||
|
||||
let join_handle: JoinHandle<Result<()>> =
|
||||
thread::Builder::new()
|
||||
.name(format!("indexing thread {} for gen {}", self.worker_id, generation))
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!(
|
||||
"indexing thread {} for gen {}",
|
||||
self.worker_id, generation
|
||||
))
|
||||
.spawn(move || {
|
||||
|
||||
loop {
|
||||
let mut document_iterator =
|
||||
document_receiver_clone.clone().into_iter().peekable();
|
||||
|
||||
let mut document_iterator = document_receiver_clone.clone()
|
||||
.into_iter()
|
||||
.peekable();
|
||||
|
||||
// the peeking here is to avoid
|
||||
// creating a new segment's files
|
||||
// if no document are available.
|
||||
//
|
||||
// this is a valid guarantee as the
|
||||
// this is a valid guarantee as the
|
||||
// peeked document now belongs to
|
||||
// our local iterator.
|
||||
if let Some(operation) = document_iterator.peek() {
|
||||
delete_cursor.skip_to(operation.opstamp);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
// No more documents.
|
||||
// Happens when there is a commit, or if the `IndexWriter`
|
||||
// was dropped.
|
||||
return Ok(())
|
||||
return Ok(());
|
||||
}
|
||||
let segment = segment_updater.new_segment();
|
||||
index_documents(&mut heap,
|
||||
segment,
|
||||
&schema,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
delete_cursor.clone())?;
|
||||
|
||||
index_documents(
|
||||
&mut heap,
|
||||
table_size,
|
||||
&segment,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
delete_cursor.clone(),
|
||||
)?;
|
||||
}
|
||||
})?;
|
||||
self.worker_id += 1;
|
||||
@@ -410,16 +433,25 @@ impl IndexWriter {
|
||||
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
|
||||
self.segment_updater.set_merge_policy(merge_policy);
|
||||
}
|
||||
|
||||
|
||||
fn start_workers(&mut self) -> Result<()> {
|
||||
for _ in 0..self.num_threads {
|
||||
try!(self.add_indexing_worker());
|
||||
self.add_indexing_worker()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Detects and removes the files that
|
||||
/// are not used by the index anymore.
|
||||
pub fn garbage_collect_files(&mut self) -> Result<()> {
|
||||
self.segment_updater.garbage_collect_files()
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> impl Future<Item=SegmentMeta, Error=Canceled> {
|
||||
pub fn merge(
|
||||
&mut self,
|
||||
segment_ids: &[SegmentId],
|
||||
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
|
||||
self.segment_updater.start_merge(segment_ids)
|
||||
}
|
||||
|
||||
@@ -432,8 +464,10 @@ impl IndexWriter {
|
||||
///
|
||||
/// Returns the former segment_ready channel.
|
||||
fn recreate_document_channel(&mut self) -> DocumentReceiver {
|
||||
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
let (mut document_sender, mut document_receiver): (
|
||||
DocumentSender,
|
||||
DocumentReceiver,
|
||||
) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
swap(&mut self.document_sender, &mut document_sender);
|
||||
swap(&mut self.document_receiver, &mut document_receiver);
|
||||
document_receiver
|
||||
@@ -447,29 +481,102 @@ impl IndexWriter {
|
||||
/// state as it was after the last commit.
|
||||
///
|
||||
/// The opstamp at the last commit is returned.
|
||||
pub fn rollback(mut self) -> Result<IndexWriter> {
|
||||
pub fn rollback(&mut self) -> Result<()> {
|
||||
info!("Rolling back to opstamp {}", self.committed_opstamp);
|
||||
|
||||
// marks the segment updater as killed. From now on, all
|
||||
// segment updates will be ignored.
|
||||
self.segment_updater.kill();
|
||||
|
||||
let document_receiver = self.document_receiver.clone();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
let directory_lock = self._directory_lock
|
||||
.take()
|
||||
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
|
||||
|
||||
let new_index_writer: IndexWriter = open_index_writer(
|
||||
&self.index,
|
||||
self.num_threads,
|
||||
self.heap_size_in_bytes_per_thread,
|
||||
directory_lock,
|
||||
)?;
|
||||
|
||||
// the current `self` is dropped right away because of this call.
|
||||
//
|
||||
// This will drop the document queue, and the thread
|
||||
// should terminate.
|
||||
mem::replace(self, new_index_writer);
|
||||
|
||||
// Drains the document receiver pipeline :
|
||||
// Workers don't need to index the pending documents.
|
||||
let receiver_clone = self.document_receiver.clone();
|
||||
let index = self.index.clone();
|
||||
let num_threads = self.num_threads;
|
||||
let heap_size_in_bytes_per_thread = self.heap_size_in_bytes_per_thread;
|
||||
drop(self);
|
||||
for _ in receiver_clone {}
|
||||
|
||||
let index_writer = open_index_writer(
|
||||
&index,
|
||||
num_threads,
|
||||
heap_size_in_bytes_per_thread)?;
|
||||
|
||||
Ok(index_writer)
|
||||
//
|
||||
// This will reach an end as the only document_sender
|
||||
// was dropped with the index_writer.
|
||||
for _ in document_receiver.clone() {}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prepares a commit.
|
||||
///
|
||||
/// Calling `prepare_commit()` will cut the indexing
|
||||
/// queue. All pending documents will be sent to the
|
||||
/// indexing workers. They will then terminate, regardless
|
||||
/// of the size of their current segment and flush their
|
||||
/// work on disk.
|
||||
///
|
||||
/// Once a commit is "prepared", you can either
|
||||
/// call
|
||||
/// * `.commit()`: to accept this commit
|
||||
/// * `.abort()`: to cancel this commit.
|
||||
///
|
||||
/// In the current implementation, `PreparedCommit` borrows
|
||||
/// the `IndexWriter` mutably so we are guaranteed that no new
|
||||
/// document can be added as long as it is committed or is
|
||||
/// dropped.
|
||||
///
|
||||
/// It is also possible to add a payload to the `commit`
|
||||
/// using this API.
|
||||
/// See [`PreparedCommit::set_payload()`](PreparedCommit.html)
|
||||
pub fn prepare_commit(&mut self) -> Result<PreparedCommit> {
|
||||
// Here, because we join all of the worker threads,
|
||||
// all of the segment update for this commit have been
|
||||
// sent.
|
||||
//
|
||||
// No document belonging to the next generation have been
|
||||
// pushed too, because add_document can only happen
|
||||
// on this thread.
|
||||
|
||||
// This will move uncommitted segments to the state of
|
||||
// committed segments.
|
||||
info!("Preparing commit");
|
||||
|
||||
// this will drop the current document channel
|
||||
// and recreate a new one channels.
|
||||
self.recreate_document_channel();
|
||||
|
||||
let mut former_workers_join_handle = Vec::new();
|
||||
swap(
|
||||
&mut former_workers_join_handle,
|
||||
&mut self.workers_join_handle,
|
||||
);
|
||||
|
||||
for worker_handle in former_workers_join_handle {
|
||||
let indexing_worker_result = worker_handle
|
||||
.join()
|
||||
.map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?;
|
||||
|
||||
indexing_worker_result?;
|
||||
// add a new worker for the next generation.
|
||||
self.add_indexing_worker()?;
|
||||
}
|
||||
|
||||
let commit_opstamp = self.stamper.stamp();
|
||||
let prepared_commit = PreparedCommit::new(self, commit_opstamp);
|
||||
info!("Prepared commit {}", commit_opstamp);
|
||||
Ok(prepared_commit)
|
||||
}
|
||||
|
||||
/// Commits all of the pending changes
|
||||
///
|
||||
@@ -486,44 +593,12 @@ impl IndexWriter {
|
||||
/// that made it in the commit.
|
||||
///
|
||||
pub fn commit(&mut self) -> Result<u64> {
|
||||
self.prepare_commit()?.commit()
|
||||
}
|
||||
|
||||
// here, because we join all of the worker threads,
|
||||
// all of the segment update for this commit have been
|
||||
// sent.
|
||||
//
|
||||
// No document belonging to the next generation have been
|
||||
// pushed too, because add_document can only happen
|
||||
// on this thread.
|
||||
|
||||
// This will move uncommitted segments to the state of
|
||||
// committed segments.
|
||||
self.committed_opstamp = self.stamper.stamp();
|
||||
info!("committing {}", self.committed_opstamp);
|
||||
|
||||
// this will drop the current document channel
|
||||
// and recreate a new one channels.
|
||||
self.recreate_document_channel();
|
||||
|
||||
let mut former_workers_join_handle = Vec::new();
|
||||
swap(&mut former_workers_join_handle,
|
||||
&mut self.workers_join_handle);
|
||||
|
||||
for worker_handle in former_workers_join_handle {
|
||||
let indexing_worker_result = try!(worker_handle.join()
|
||||
.map_err(|e| Error::ErrorInThread(format!("{:?}", e))));
|
||||
try!(indexing_worker_result);
|
||||
// add a new worker for the next generation.
|
||||
try!(self.add_indexing_worker());
|
||||
}
|
||||
|
||||
|
||||
|
||||
// wait for the segment update thread to have processed the info
|
||||
self.segment_updater
|
||||
.commit(self.committed_opstamp)?;
|
||||
|
||||
Ok(self.committed_opstamp)
|
||||
}
|
||||
pub(crate) fn segment_updater(&self) -> &SegmentUpdater {
|
||||
&self.segment_updater
|
||||
}
|
||||
|
||||
/// Delete all documents containing a given term.
|
||||
///
|
||||
@@ -531,20 +606,17 @@ impl IndexWriter {
|
||||
/// were added in previous commits, and documents
|
||||
/// that were added previously in the same commit.
|
||||
///
|
||||
/// Like adds, the deletion itself will be visible
|
||||
/// Like adds, the deletion itself will be visible
|
||||
/// only after calling `commit()`.
|
||||
pub fn delete_term(&mut self, term: Term) -> u64 {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let delete_operation = DeleteOperation {
|
||||
opstamp: opstamp,
|
||||
term: term,
|
||||
};
|
||||
let delete_operation = DeleteOperation { opstamp, term };
|
||||
self.delete_queue.push(delete_operation);
|
||||
opstamp
|
||||
}
|
||||
|
||||
/// Returns the opstamp of the last successful commit.
|
||||
///
|
||||
///
|
||||
/// This is, for instance, the opstamp the index will
|
||||
/// rollback to if there is a failure like a power surge.
|
||||
///
|
||||
@@ -566,18 +638,12 @@ impl IndexWriter {
|
||||
/// have been added since the creation of the index.
|
||||
pub fn add_document(&mut self, document: Document) -> u64 {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let add_operation = AddOperation {
|
||||
opstamp: opstamp,
|
||||
document: document,
|
||||
};
|
||||
let add_operation = AddOperation { opstamp, document };
|
||||
self.document_sender.send(add_operation);
|
||||
opstamp
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -585,7 +651,7 @@ mod tests {
|
||||
use schema::{self, Document};
|
||||
use Index;
|
||||
use Term;
|
||||
use Error;
|
||||
use error::*;
|
||||
use env_logger;
|
||||
|
||||
#[test]
|
||||
@@ -594,20 +660,27 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let _index_writer = index.writer(40_000_000).unwrap();
|
||||
match index.writer(40_000_000) {
|
||||
Err(Error::FileAlreadyExists(_)) => {}
|
||||
Err(Error(ErrorKind::FileAlreadyExists(_), _)) => {}
|
||||
_ => panic!("Expected FileAlreadyExists error"),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_set_merge_policy() {
|
||||
let schema_builder = schema::SchemaBuilder::default();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let index_writer = index.writer(40_000_000).unwrap();
|
||||
assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }");
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }"
|
||||
);
|
||||
let merge_policy = box NoMergePolicy::default();
|
||||
index_writer.set_merge_policy(merge_policy);
|
||||
assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "NoMergePolicy");
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
"NoMergePolicy"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -616,12 +689,12 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let _index_writer = index.writer(40_000_000).unwrap();
|
||||
// the lock should be released when the
|
||||
// the lock should be released when the
|
||||
// index_writer leaves the scope.
|
||||
}
|
||||
let _index_writer_two = index.writer(40_000_000).unwrap();
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_commit_and_rollback() {
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
@@ -630,33 +703,21 @@ mod tests {
|
||||
|
||||
let num_docs_containing = |s: &str| {
|
||||
let searcher = index.searcher();
|
||||
let term_a = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term_a)
|
||||
let term = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term)
|
||||
};
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(3, 40_000_000).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.rollback().unwrap();
|
||||
|
||||
index_writer = index_writer.rollback().unwrap();
|
||||
|
||||
assert_eq!(index_writer.commit_opstamp(), 0u64);
|
||||
assert_eq!(num_docs_containing("a"), 0);
|
||||
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "b");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "c");
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc!(text_field=>"b"));
|
||||
index_writer.add_document(doc!(text_field=>"c"));
|
||||
}
|
||||
assert_eq!(index_writer.commit().unwrap(), 2u64);
|
||||
index.load_searchers().unwrap();
|
||||
@@ -668,7 +729,6 @@ mod tests {
|
||||
index.searcher();
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_with_merges() {
|
||||
let _ = env_logger::init();
|
||||
@@ -697,14 +757,88 @@ mod tests {
|
||||
}
|
||||
// this should create 8 segments and trigger a merge.
|
||||
index_writer.commit().expect("commit failed");
|
||||
index_writer.wait_merging_threads().expect("waiting merging thread failed");
|
||||
index_writer
|
||||
.wait_merging_threads()
|
||||
.expect("waiting merging thread failed");
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
|
||||
assert_eq!(num_docs_containing("a"), 200);
|
||||
assert!(index.searchable_segments().unwrap().len() < 8);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prepare_with_commit_message() {
|
||||
let _ = env_logger::init();
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
{
|
||||
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
||||
prepared_commit.set_payload("first commit");
|
||||
assert_eq!(prepared_commit.opstamp(), 100);
|
||||
prepared_commit.commit().expect("commit failed");
|
||||
}
|
||||
{
|
||||
let metas = index.load_metas().unwrap();
|
||||
assert_eq!(metas.payload.unwrap(), "first commit");
|
||||
}
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
{
|
||||
let metas = index.load_metas().unwrap();
|
||||
assert!(metas.payload.is_none());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prepare_but_rollback() {
|
||||
let _ = env_logger::init();
|
||||
let mut schema_builder = schema::SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
{
|
||||
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
||||
prepared_commit.set_payload("first commit");
|
||||
assert_eq!(prepared_commit.opstamp(), 100);
|
||||
prepared_commit.abort().expect("commit failed");
|
||||
}
|
||||
{
|
||||
let metas = index.load_metas().unwrap();
|
||||
assert!(metas.payload.is_none());
|
||||
}
|
||||
for _doc in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "b"));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let num_docs_containing = |s: &str| {
|
||||
let searcher = index.searcher();
|
||||
let term_a = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term_a)
|
||||
};
|
||||
assert_eq!(num_docs_containing("a"), 0);
|
||||
assert_eq!(num_docs_containing("b"), 100);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
extern crate itertools;
|
||||
use super::merge_policy::{MergePolicy, MergeCandidate};
|
||||
use super::merge_policy::{MergeCandidate, MergePolicy};
|
||||
use core::SegmentMeta;
|
||||
use std::cmp;
|
||||
use std::f64;
|
||||
@@ -8,8 +7,7 @@ const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
|
||||
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
|
||||
const DEFAULT_MIN_MERGE_SIZE: usize = 8;
|
||||
|
||||
|
||||
/// LogMergePolicy tries tries to merge segments that have a similar number of
|
||||
/// `LogMergePolicy` tries tries to merge segments that have a similar number of
|
||||
/// documents.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LogMergePolicy {
|
||||
@@ -24,7 +22,7 @@ impl LogMergePolicy {
|
||||
}
|
||||
|
||||
/// Set the minimum number of segment that may be merge together.
|
||||
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
|
||||
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
|
||||
self.min_merge_size = min_merge_size;
|
||||
}
|
||||
|
||||
@@ -52,15 +50,17 @@ impl MergePolicy for LogMergePolicy {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let mut size_sorted_tuples = segments.iter()
|
||||
let mut size_sorted_tuples = segments
|
||||
.iter()
|
||||
.map(|x| x.num_docs())
|
||||
.enumerate()
|
||||
.collect::<Vec<(usize, u32)>>();
|
||||
|
||||
size_sorted_tuples.sort_by(|x, y| y.cmp(x));
|
||||
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
|
||||
|
||||
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples.into_iter()
|
||||
.map(|(ind, num_docs)| (ind, (self.clip_min_size(num_docs) as f64).log2()))
|
||||
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
|
||||
.into_iter()
|
||||
.map(|(ind, num_docs)| (ind, f64::from(self.clip_min_size(num_docs)).log2()))
|
||||
.collect();
|
||||
|
||||
let (first_ind, first_score) = size_sorted_log_tuples[0];
|
||||
@@ -77,14 +77,10 @@ impl MergePolicy for LogMergePolicy {
|
||||
levels
|
||||
.iter()
|
||||
.filter(|level| level.len() >= self.min_merge_size)
|
||||
.map(|ind_vec| {
|
||||
MergeCandidate(ind_vec.iter()
|
||||
.map(|&ind| segments[ind].id())
|
||||
.collect())
|
||||
})
|
||||
.map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box self.clone()
|
||||
}
|
||||
@@ -104,7 +100,7 @@ impl Default for LogMergePolicy {
|
||||
mod tests {
|
||||
use super::*;
|
||||
use indexer::merge_policy::MergePolicy;
|
||||
use core::{SegmentMeta, SegmentId};
|
||||
use core::{SegmentId, SegmentMeta};
|
||||
|
||||
fn test_merge_policy() -> LogMergePolicy {
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
@@ -128,9 +124,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_log_merge_policy_pair() {
|
||||
let test_input = vec![seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10)];
|
||||
let test_input = vec![seg_meta(10), seg_meta(10), seg_meta(10)];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 1);
|
||||
}
|
||||
@@ -138,12 +132,23 @@ mod tests {
|
||||
#[test]
|
||||
fn test_log_merge_policy_levels() {
|
||||
// multiple levels all get merged correctly
|
||||
let test_input = vec![seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000)];
|
||||
// 2 MergeCandidates expected:
|
||||
// * one with the 6 * 10-docs segments
|
||||
// * one with the 3 * 1000-docs segments
|
||||
// no MergeCandidate expected for the 2 * 10_000-docs segments as min_merge_size=3
|
||||
let test_input = vec![
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(10000),
|
||||
seg_meta(10000),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
seg_meta(10),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 2);
|
||||
}
|
||||
@@ -151,24 +156,28 @@ mod tests {
|
||||
#[test]
|
||||
fn test_log_merge_policy_within_levels() {
|
||||
// multiple levels all get merged correctly
|
||||
let test_input = vec![seg_meta(10),
|
||||
seg_meta(11),
|
||||
seg_meta(12),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000),
|
||||
seg_meta(1000)];
|
||||
let test_input = vec![
|
||||
seg_meta(10), // log2(10) = ~3.32 (> 3.58 - 0.75)
|
||||
seg_meta(11), // log2(11) = ~3.46
|
||||
seg_meta(12), // log2(12) = ~3.58
|
||||
seg_meta(800), // log2(800) = ~9.64 (> 9.97 - 0.75)
|
||||
seg_meta(1000), // log2(1000) = ~9.97
|
||||
seg_meta(1000),
|
||||
]; // log2(1000) = ~9.97
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 2);
|
||||
}
|
||||
#[test]
|
||||
fn test_log_merge_policy_small_segments() {
|
||||
// multiple levels all get merged correctly
|
||||
let test_input = vec![seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
seg_meta(2)];
|
||||
// segments under min_layer_size are merged together
|
||||
let test_input = vec![
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(1),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
seg_meta(2),
|
||||
];
|
||||
let result_list = test_merge_policy().compute_merge_candidates(&test_input);
|
||||
assert_eq!(result_list.len(), 1);
|
||||
}
|
||||
|
||||
@@ -3,27 +3,25 @@ use core::SegmentMeta;
|
||||
use std::marker;
|
||||
use std::fmt::Debug;
|
||||
|
||||
|
||||
/// Set of segment suggested for a merge.
|
||||
/// Set of segment suggested for a merge.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MergeCandidate(pub Vec<SegmentId>);
|
||||
|
||||
|
||||
/// The Merge policy defines which segments should be merged.
|
||||
///
|
||||
/// The `MergePolicy` defines which segments should be merged.
|
||||
///
|
||||
/// Every time a the list of segments changes, the segment updater
|
||||
/// asks the merge policy if some segments should be merged.
|
||||
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
|
||||
/// Given the list of segment metas, returns the list of merge candidates.
|
||||
/// Given the list of segment metas, returns the list of merge candidates.
|
||||
///
|
||||
/// This call happens on the segment updater thread, and will block
|
||||
/// other segment updates, so all implementations should happen rapidly.
|
||||
/// This call happens on the segment updater thread, and will block
|
||||
/// other segment updates, so all implementations should happen rapidly.
|
||||
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
|
||||
/// Returns a boxed clone of the MergePolicy.
|
||||
fn box_clone(&self) -> Box<MergePolicy>;
|
||||
}
|
||||
|
||||
/// Never merge segments.
|
||||
/// Never merge segments.
|
||||
#[derive(Debug)]
|
||||
pub struct NoMergePolicy;
|
||||
|
||||
@@ -37,13 +35,12 @@ impl MergePolicy for NoMergePolicy {
|
||||
fn compute_merge_candidates(&self, _segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
|
||||
Vec::new()
|
||||
}
|
||||
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box NoMergePolicy
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
@@ -51,8 +48,7 @@ pub mod tests {
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
|
||||
|
||||
/// Merge policy useful for test purposes.
|
||||
/// `MergePolicy` useful for test purposes.
|
||||
///
|
||||
/// Everytime there is more than one segment,
|
||||
/// it will suggest to merge them.
|
||||
@@ -66,15 +62,14 @@ pub mod tests {
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect::<Vec<SegmentId>>();
|
||||
if segment_ids.len() > 1 {
|
||||
vec!(MergeCandidate(segment_ids))
|
||||
}
|
||||
else {
|
||||
vec!()
|
||||
vec![MergeCandidate(segment_ids)]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn box_clone(&self) -> Box<MergePolicy> {
|
||||
box MergeWheneverPossible
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -13,14 +13,17 @@ mod segment_entry;
|
||||
mod doc_opstamp_mapping;
|
||||
pub mod operation;
|
||||
mod stamper;
|
||||
mod prepared_commit;
|
||||
|
||||
pub use self::prepared_commit::PreparedCommit;
|
||||
pub use self::segment_entry::{SegmentEntry, SegmentState};
|
||||
pub use self::segment_serializer::SegmentSerializer;
|
||||
pub use self::segment_writer::SegmentWriter;
|
||||
pub use self::index_writer::IndexWriter;
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_policy::{NoMergePolicy, MergeCandidate, MergePolicy};
|
||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
pub use self::segment_manager::SegmentManager;
|
||||
pub(crate) use self::directory_lock::DirectoryLock;
|
||||
|
||||
/// Alias for the default merge policy, which is the LogMergePolicy.
|
||||
/// Alias for the default merge policy, which is the `LogMergePolicy`.
|
||||
pub type DefaultMergePolicy = LogMergePolicy;
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use schema::Document;
|
||||
use schema::Term;
|
||||
|
||||
|
||||
/// Timestamped Delete operation.
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
pub struct DeleteOperation {
|
||||
|
||||
39
src/indexer/prepared_commit.rs
Normal file
39
src/indexer/prepared_commit.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
use Result;
|
||||
use super::IndexWriter;
|
||||
|
||||
/// A prepared commit
|
||||
pub struct PreparedCommit<'a> {
|
||||
index_writer: &'a mut IndexWriter,
|
||||
payload: Option<String>,
|
||||
opstamp: u64,
|
||||
}
|
||||
|
||||
impl<'a> PreparedCommit<'a> {
|
||||
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
|
||||
PreparedCommit {
|
||||
index_writer,
|
||||
payload: None,
|
||||
opstamp
|
||||
}
|
||||
}
|
||||
|
||||
pub fn opstamp(&self) -> u64 {
|
||||
self.opstamp
|
||||
}
|
||||
|
||||
pub fn set_payload(&mut self, payload: &str) {
|
||||
self.payload = Some(payload.to_string())
|
||||
}
|
||||
|
||||
pub fn abort(self) -> Result<()> {
|
||||
self.index_writer.rollback()
|
||||
}
|
||||
|
||||
pub fn commit(self) -> Result<u64> {
|
||||
info!("committing {}", self.opstamp);
|
||||
self.index_writer
|
||||
.segment_updater()
|
||||
.commit(self.opstamp, self.payload)?;
|
||||
Ok(self.opstamp)
|
||||
}
|
||||
}
|
||||
@@ -4,15 +4,14 @@ use indexer::delete_queue::DeleteCursor;
|
||||
use core::SegmentId;
|
||||
use std::fmt;
|
||||
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
pub enum SegmentState {
|
||||
Ready,
|
||||
InMerge,
|
||||
InMerge,
|
||||
}
|
||||
|
||||
impl SegmentState {
|
||||
pub fn letter_code(&self,) -> char {
|
||||
pub fn letter_code(&self) -> char {
|
||||
match *self {
|
||||
SegmentState::InMerge => 'M',
|
||||
SegmentState::Ready => 'R',
|
||||
@@ -20,49 +19,46 @@ impl SegmentState {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// A segment entry describes the state of
|
||||
/// A segment entry describes the state of
|
||||
/// a given segment, at a given instant.
|
||||
///
|
||||
/// In addition to segment meta,
|
||||
/// In addition to segment `meta`,
|
||||
/// it contains a few transient states
|
||||
/// - state expresses whether the segment is already in the
|
||||
/// - `state` expresses whether the segment is already in the
|
||||
/// middle of a merge
|
||||
/// - delete_bitset is a bitset describing
|
||||
/// - `delete_bitset` is a bitset describing
|
||||
/// documents that were deleted during the commit
|
||||
/// itself.
|
||||
/// - Delete cursor, is the position in the delete queue.
|
||||
/// - `delete_cursor` is the position in the delete queue.
|
||||
/// Deletes happening before the cursor are reflected either
|
||||
/// in the .del file or in the delete_bitset.
|
||||
/// in the .del file or in the `delete_bitset`.
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentEntry {
|
||||
meta: SegmentMeta,
|
||||
state: SegmentState,
|
||||
delete_bitset: Option<BitSet>,
|
||||
delete_cursor: DeleteCursor,
|
||||
|
||||
}
|
||||
|
||||
impl SegmentEntry {
|
||||
|
||||
|
||||
/// Create a new `SegmentEntry`
|
||||
pub fn new(segment_meta: SegmentMeta,
|
||||
delete_cursor: DeleteCursor,
|
||||
delete_bitset: Option<BitSet>) -> SegmentEntry {
|
||||
pub fn new(
|
||||
segment_meta: SegmentMeta,
|
||||
delete_cursor: DeleteCursor,
|
||||
delete_bitset: Option<BitSet>,
|
||||
) -> SegmentEntry {
|
||||
SegmentEntry {
|
||||
meta: segment_meta,
|
||||
state: SegmentState::Ready,
|
||||
delete_bitset: delete_bitset,
|
||||
delete_cursor: delete_cursor,
|
||||
delete_bitset,
|
||||
delete_cursor,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Return a reference to the segment entry deleted bitset.
|
||||
///
|
||||
/// `DocId` in this bitset are flagged as deleted.
|
||||
pub fn delete_bitset(&self,) -> Option<&BitSet> {
|
||||
pub fn delete_bitset(&self) -> Option<&BitSet> {
|
||||
self.delete_bitset.as_ref()
|
||||
}
|
||||
|
||||
@@ -71,13 +67,12 @@ impl SegmentEntry {
|
||||
self.meta = segment_meta;
|
||||
}
|
||||
|
||||
|
||||
/// Return a reference to the segment_entry's delete cursor
|
||||
pub fn delete_cursor(&mut self) -> &mut DeleteCursor {
|
||||
&mut self.delete_cursor
|
||||
}
|
||||
|
||||
/// Return the `SegmentEntry`.
|
||||
/// Return the `SegmentEntry`.
|
||||
///
|
||||
/// The state describes whether the segment is available for
|
||||
/// a merge or not.
|
||||
@@ -89,19 +84,17 @@ impl SegmentEntry {
|
||||
pub fn segment_id(&self) -> SegmentId {
|
||||
self.meta.id()
|
||||
}
|
||||
|
||||
|
||||
/// Accessor to the `SegmentMeta`
|
||||
pub fn meta(&self) -> &SegmentMeta {
|
||||
&self.meta
|
||||
}
|
||||
|
||||
|
||||
/// Mark the `SegmentEntry` as in merge.
|
||||
///
|
||||
/// Only segments that are not already
|
||||
/// Only segments that are not already
|
||||
/// in a merge are elligible for future merge.
|
||||
pub fn start_merge(&mut self,) {
|
||||
pub fn start_merge(&mut self) {
|
||||
self.state = SegmentState::InMerge;
|
||||
}
|
||||
|
||||
@@ -110,14 +103,13 @@ impl SegmentEntry {
|
||||
/// If a merge fails, it is important to switch
|
||||
/// the segment back to a idle state, so that it
|
||||
/// may be elligible for future merges.
|
||||
pub fn cancel_merge(&mut self,) {
|
||||
pub fn cancel_merge(&mut self) {
|
||||
self.state = SegmentState::Ready;
|
||||
}
|
||||
|
||||
|
||||
/// Returns true iff a segment should
|
||||
/// be considered for a merge.
|
||||
pub fn is_ready(&self,) -> bool {
|
||||
pub fn is_ready(&self) -> bool {
|
||||
self.state == SegmentState::Ready
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::segment_register::SegmentRegister;
|
||||
use std::sync::RwLock;
|
||||
use core::SegmentMeta;
|
||||
use core::{META_FILEPATH, LOCKFILE_FILEPATH};
|
||||
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
|
||||
use core::SegmentId;
|
||||
use indexer::SegmentEntry;
|
||||
use std::path::PathBuf;
|
||||
@@ -14,15 +14,13 @@ use indexer::delete_queue::DeleteCursor;
|
||||
struct SegmentRegisters {
|
||||
uncommitted: SegmentRegister,
|
||||
committed: SegmentRegister,
|
||||
writing: HashSet<SegmentId>,
|
||||
writing: HashSet<SegmentId>,
|
||||
}
|
||||
|
||||
|
||||
|
||||
/// The segment manager stores the list of segments
|
||||
/// as well as their state.
|
||||
///
|
||||
/// It guarantees the atomicity of the
|
||||
/// It guarantees the atomicity of the
|
||||
/// changes (merges especially)
|
||||
#[derive(Default)]
|
||||
pub struct SegmentManager {
|
||||
@@ -32,19 +30,29 @@ pub struct SegmentManager {
|
||||
impl Debug for SegmentManager {
|
||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
let lock = self.read();
|
||||
write!(f, "{{ uncommitted: {:?}, committed: {:?} }}", lock.uncommitted, lock.committed)
|
||||
write!(
|
||||
f,
|
||||
"{{ uncommitted: {:?}, committed: {:?} }}",
|
||||
lock.uncommitted, lock.committed
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_mergeable_segments(segment_manager: &SegmentManager,) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||
pub fn get_mergeable_segments(
|
||||
segment_manager: &SegmentManager,
|
||||
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||
let registers_lock = segment_manager.read();
|
||||
(registers_lock.committed.get_mergeable_segments(),
|
||||
registers_lock.uncommitted.get_mergeable_segments())
|
||||
(
|
||||
registers_lock.committed.get_mergeable_segments(),
|
||||
registers_lock.uncommitted.get_mergeable_segments(),
|
||||
)
|
||||
}
|
||||
|
||||
impl SegmentManager {
|
||||
|
||||
pub fn from_segments(segment_metas: Vec<SegmentMeta>, delete_cursor: DeleteCursor) -> SegmentManager {
|
||||
pub fn from_segments(
|
||||
segment_metas: Vec<SegmentMeta>,
|
||||
delete_cursor: &DeleteCursor,
|
||||
) -> SegmentManager {
|
||||
SegmentManager {
|
||||
registers: RwLock::new(SegmentRegisters {
|
||||
uncommitted: SegmentRegister::default(),
|
||||
@@ -55,20 +63,14 @@ impl SegmentManager {
|
||||
}
|
||||
|
||||
/// Returns all of the segment entries (committed or uncommitted)
|
||||
pub fn segment_entries(&self,) -> Vec<SegmentEntry> {
|
||||
let mut segment_entries = self.read()
|
||||
.uncommitted
|
||||
.segment_entries();
|
||||
segment_entries.extend(
|
||||
self.read()
|
||||
.committed
|
||||
.segment_entries()
|
||||
);
|
||||
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
|
||||
let mut segment_entries = self.read().uncommitted.segment_entries();
|
||||
segment_entries.extend(self.read().committed.segment_entries());
|
||||
segment_entries
|
||||
}
|
||||
|
||||
/// Returns the overall number of segments in the `SegmentManager`
|
||||
pub fn num_segments(&self,) -> usize {
|
||||
pub fn num_segments(&self) -> usize {
|
||||
let registers_lock = self.read();
|
||||
registers_lock.committed.len() + registers_lock.uncommitted.len()
|
||||
}
|
||||
@@ -78,19 +80,14 @@ impl SegmentManager {
|
||||
let mut files = HashSet::new();
|
||||
files.insert(META_FILEPATH.clone());
|
||||
files.insert(LOCKFILE_FILEPATH.clone());
|
||||
|
||||
let segment_metas: Vec<SegmentMeta> =
|
||||
registers_lock.committed
|
||||
.get_all_segments()
|
||||
.into_iter()
|
||||
.chain(registers_lock.uncommitted
|
||||
.get_all_segments()
|
||||
.into_iter())
|
||||
.chain(registers_lock.writing
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(SegmentMeta::new))
|
||||
.collect();
|
||||
|
||||
let segment_metas: Vec<SegmentMeta> = registers_lock
|
||||
.committed
|
||||
.get_all_segments()
|
||||
.into_iter()
|
||||
.chain(registers_lock.uncommitted.get_all_segments().into_iter())
|
||||
.chain(registers_lock.writing.iter().cloned().map(SegmentMeta::new))
|
||||
.collect();
|
||||
for segment_meta in segment_metas {
|
||||
files.extend(segment_meta.list_files());
|
||||
}
|
||||
@@ -102,18 +99,22 @@ impl SegmentManager {
|
||||
registers
|
||||
.committed
|
||||
.segment_entry(segment_id)
|
||||
.or_else(|| registers.uncommitted.segment_entry(segment_id))
|
||||
.or_else(|| registers.uncommitted.segment_entry(segment_id))
|
||||
}
|
||||
|
||||
// Lock poisoning should never happen :
|
||||
// The lock is acquired and released within this class,
|
||||
// and the operations cannot panic.
|
||||
fn read(&self,) -> RwLockReadGuard<SegmentRegisters> {
|
||||
self.registers.read().expect("Failed to acquire read lock on SegmentManager.")
|
||||
// and the operations cannot panic.
|
||||
fn read(&self) -> RwLockReadGuard<SegmentRegisters> {
|
||||
self.registers
|
||||
.read()
|
||||
.expect("Failed to acquire read lock on SegmentManager.")
|
||||
}
|
||||
|
||||
fn write(&self,) -> RwLockWriteGuard<SegmentRegisters> {
|
||||
self.registers.write().expect("Failed to acquire write lock on SegmentManager.")
|
||||
fn write(&self) -> RwLockWriteGuard<SegmentRegisters> {
|
||||
self.registers
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on SegmentManager.")
|
||||
}
|
||||
|
||||
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
|
||||
@@ -124,42 +125,44 @@ impl SegmentManager {
|
||||
registers_lock.committed.add_segment_entry(segment_entry);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) {
|
||||
let mut registers_lock = self.write();
|
||||
if registers_lock.uncommitted.contains_all(segment_ids) {
|
||||
for segment_id in segment_ids {
|
||||
registers_lock.uncommitted.start_merge(segment_id);
|
||||
}
|
||||
}
|
||||
else if registers_lock.committed.contains_all(segment_ids) {
|
||||
} else if registers_lock.committed.contains_all(segment_ids) {
|
||||
for segment_id in segment_ids {
|
||||
registers_lock.committed.start_merge(segment_id);
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
error!("Merge operation sent for segments that are not all uncommited or commited.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn cancel_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_id: SegmentId) {
|
||||
|
||||
pub fn cancel_merge(
|
||||
&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_id: SegmentId,
|
||||
) {
|
||||
let mut registers_lock = self.write();
|
||||
|
||||
|
||||
// we mark all segments are ready for merge.
|
||||
{
|
||||
let target_segment_register: &mut SegmentRegister;
|
||||
target_segment_register = {
|
||||
if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) {
|
||||
if registers_lock
|
||||
.uncommitted
|
||||
.contains_all(before_merge_segment_ids)
|
||||
{
|
||||
&mut registers_lock.uncommitted
|
||||
}
|
||||
else if registers_lock.committed.contains_all(&before_merge_segment_ids) {
|
||||
} else if registers_lock
|
||||
.committed
|
||||
.contains_all(before_merge_segment_ids)
|
||||
{
|
||||
&mut registers_lock.committed
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
warn!("couldn't find segment in SegmentManager");
|
||||
return;
|
||||
}
|
||||
@@ -172,10 +175,8 @@ impl SegmentManager {
|
||||
// ... and we make sure the target segment entry
|
||||
// can be garbage collected.
|
||||
registers_lock.writing.remove(&after_merge_segment_id);
|
||||
|
||||
}
|
||||
|
||||
|
||||
pub fn write_segment(&self, segment_id: SegmentId) {
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock.writing.insert(segment_id);
|
||||
@@ -186,19 +187,27 @@ impl SegmentManager {
|
||||
registers_lock.writing.remove(&segment_entry.segment_id());
|
||||
registers_lock.uncommitted.add_segment_entry(segment_entry);
|
||||
}
|
||||
|
||||
pub fn end_merge(&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentEntry) {
|
||||
|
||||
|
||||
pub fn end_merge(
|
||||
&self,
|
||||
before_merge_segment_ids: &[SegmentId],
|
||||
after_merge_segment_entry: SegmentEntry,
|
||||
) {
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock.writing.remove(&after_merge_segment_entry.segment_id());
|
||||
|
||||
let mut target_register: &mut SegmentRegister = {
|
||||
if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) {
|
||||
registers_lock
|
||||
.writing
|
||||
.remove(&after_merge_segment_entry.segment_id());
|
||||
|
||||
let target_register: &mut SegmentRegister = {
|
||||
if registers_lock
|
||||
.uncommitted
|
||||
.contains_all(before_merge_segment_ids)
|
||||
{
|
||||
&mut registers_lock.uncommitted
|
||||
}
|
||||
else if registers_lock.committed.contains_all(&before_merge_segment_ids) {
|
||||
} else if registers_lock
|
||||
.committed
|
||||
.contains_all(before_merge_segment_ids)
|
||||
{
|
||||
&mut registers_lock.committed
|
||||
} else {
|
||||
warn!("couldn't find segment in SegmentManager");
|
||||
@@ -209,13 +218,9 @@ impl SegmentManager {
|
||||
target_register.remove_segment(segment_id);
|
||||
}
|
||||
target_register.add_segment_entry(after_merge_segment_entry);
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
pub fn committed_segment_metas(&self,) -> Vec<SegmentMeta> {
|
||||
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
let registers_lock = self.read();
|
||||
registers_lock.committed.segment_metas()
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user