Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2025-12-28 04:52:55 +00:00.

Comparing commits: `hotfix-108...warming` — 221 commits.
Commits in the compared range (SHA1 only; the author and date columns were empty):

d4e2d2e40e, 732f6847c0, 1c6d9bdc6a, 3ea6800ac5, 395303b644, 2c200b46cb, 17e00df112, 3129d86743, e5e252cbc0, b2da82f151,
c81b3030fa, 9e66c75fc6, ebdbb6bd2e, c980b19dd9, 098eea843a, 466dc8233c, 03c2f6ece2, 1d4e9a29db, f378d9a57b, dde49ac8e2,
c3cc93406d, bd0f9211da, c503c6e4fa, 02174d26af, cf92be3bd6, 72cef12db1, bbc0a2e233, 4fd1a6c84b, c83d99c414, eacf510175,
8802d125f8, 33301a3eb4, 7234bef0eb, fcff91559b, b75d4e59d1, c6b5ab1dbe, c12e07f0ce, 8b877a4c26, 7dc0dc1c9b, 0462754673,
5916ceda73, 70283dc6c8, dbaf4f3623, 4808648322, 54afb9b34a, d336c8b938, 980d1b2796, 6317982876, e2fbbc08ca, 99cd25beae,
737ecc7015, 09668459c8, e5fd30f438, c412a46105, 3a78402496, d18ac136c0, b5b1244857, 27acfa4dea, 02cffa4dea, b52abbc771,
894c61867f, 352e0cc58d, ffe4446d90, 4d05b26e7a, 0855649986, d828e58903, aa0396fe27, 8d8315f8d0, 078c0a2e2e, f21e8dd875,
74e36c7e97, f27ae04282, 0ce49c9dd4, fe8e58e078, efc0d8341b, 22bcc83d10, 5ee5037934, c217bfed1e, c27ccd3e24, 367f5da782,
b256df6599, d7a6a409a1, a1f5cead96, 37c5fe3c86, 4583fa270b, beb3a5bd73, 93cbd52bf0, c22177a005, 4da71273e1, 2c78b31aab,
4ae1d87632, 46b86a7976, 3bc177e69d, 319609e9c1, 9d87b89718, dd81e38e53, 9f32b22602, 096ce7488e, a1782dd172, 000d76b11a,
abd29f6646, b4ecf0ab2f, 798f7dbf67, 06a2e47c8d, e0b83eb291, 13401f46ea, 1a45b030dc, 62052bcc2d, 3265f7bec3, ee0881712a,
483e0336b6, 3e8f267e33, 3b247fd968, 750f6e6479, 5b475e6603, 0ca7f73dc5, 47ed18845e, dc141cdb29, f6cf6e889b, f379a80233,
4a320fd1ff, 85d23e8e3b, 022ab9d298, 605e8603dc, 70f160b329, 6d265e6bed, fdc512391b, 108714c934, 44e8cf98a5, f0ee69d9e9,
b8a10c8406, ff4813529e, 470bc18e9b, 0b1add0ec6, 1db76dd9cf, 467a9517db, b361315a67, 4e3771bffc, 8176b0335a, 811ac98f36,
f4b2e71800, c431cfcf12, 92f20bc5a2, 57f931da3c, 9b662e6d03, 18377d949c, e6427b2588, 0062fe705d, 9b3e508753, a1ac63ee1c,
e496ae0470, 1e4df54ab3, 2de249af74, 10f056fbb4, 074b09d0c0, 86d0727659, be3e1b8718, 8fdf59bdac, ebebce2102, 8044ec38da,
7413f87265, aea2e77665, a15845f9fd, 94ac44df4f, f80d804a57, 3b5c1d7817, 24274edf81, d58497529b, 130495abab, 9b743d60fb,
5c9e2ef036, 8526434b63, 6ba302c481, de92f094aa, c82cee66de, 6eed05b1ce, bb488305c9, f05e84f964, 65546ed22b, 57ae5b27dc,
f9531ec3c9, 5b54a32563, cd049e28bc, 646e41bec4, 36528c5e83, cd169dee23, b5cc60f80b, 060b83159a, a40ff35453, 268e6bfe6e,
f902440b8b, 77a0902605, c889ae10e4, 0a534c6ee0, 167d88b449, 1071ed84f2, abb5624af2, 1d41b96d32, ef4665945f, 294cd5fd0b,
f4d271177c, 451538fecf, e78e0fec59, 2e639cebf8, e296da7ade, 3b3e26c4b8, 6a4883ac69, 0ba05df545, aa3c4d4029, 60df629725,
2570b005ac, d5212cd19d, 2193d85622, dfdbfe9eff, b999e836b2, be2dd41e69, 483fdb79cc, aefd0fc907, 3298d6cb71, c02c78ea73,
6bf4fee1ba
**`.github/dependabot.yml`** (vendored, 7 changed lines)

```diff
@@ -6,3 +6,10 @@ updates:
       interval: daily
       time: "20:00"
     open-pull-requests-limit: 10
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: daily
+      time: "20:00"
+    open-pull-requests-limit: 10
```
**`.github/workflows/coverage.yml`** (vendored, new file, 25 lines)

```yaml
name: Coverage

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  coverage:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Install Rust
        run: rustup toolchain install nightly --component llvm-tools-preview
      - name: Install cargo-llvm-cov
        run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
      - name: Generate code coverage
        run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v2
        with:
          token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
          files: lcov.info
          fail_ci_if_error: true
```
**`.github/workflows/long_running.yml`** (vendored, new file, 24 lines)

```yaml
name: Rust

on:
  push:
    branches: [ main ]

env:
  CARGO_TERM_COLOR: always
  NUM_FUNCTIONAL_TEST_ITERATIONS: 20000

jobs:
  functional_test_unsorted:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Run indexing_unsorted
        run: cargo test indexing_unsorted -- --ignored
  functional_test_sorted:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Run indexing_sorted
        run: cargo test indexing_sorted -- --ignored
```
**`.github/workflows/test.yml`** (vendored, 10 changed lines)

```diff
@@ -10,7 +10,7 @@ env:
   CARGO_TERM_COLOR: always

 jobs:
-  build:
+  test:

     runs-on: ubuntu-latest

@@ -18,7 +18,13 @@ jobs:
     - uses: actions/checkout@v2
     - name: Build
       run: cargo build --verbose --workspace
+    - name: Install latest nightly to test also against unstable feature flag
+      uses: actions-rs/toolchain@v1
+      with:
+        toolchain: stable
+        override: true
+        components: rustfmt
     - name: Run tests
-      run: cargo test --verbose --workspace
+      run: cargo test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace
     - name: Check Formatting
       run: cargo fmt --all -- --check
```
**`.gitignore`** (vendored, 1 changed line)

```diff
@@ -1,4 +1,5 @@
 tantivy.iml
+.cargo
 proptest-regressions
 *.swp
 target
```
**`.travis.yml`** (deleted, 92 lines)

```diff
@@ -1,92 +0,0 @@
-# Based on the "trust" template v0.1.2
-# https://github.com/japaric/trust/tree/v0.1.2
-
-dist: trusty
-language: rust
-services: docker
-sudo: required
-
-env:
-  global:
-    - CRATE_NAME=tantivy
-    - TRAVIS_CARGO_NIGHTLY_FEATURE=""
-    # - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
-
-addons:
-  apt:
-    sources:
-      - ubuntu-toolchain-r-test
-      - kalakris-cmake
-    packages:
-      - gcc-4.8
-      - g++-4.8
-      - libcurl4-openssl-dev
-      - libelf-dev
-      - libdw-dev
-      - binutils-dev
-      - cmake
-
-matrix:
-  include:
-    # Android
-    - env: TARGET=aarch64-linux-android DISABLE_TESTS=1
-    #- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
-    #- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
-    #- env: TARGET=i686-linux-android DISABLE_TESTS=1
-    #- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
-
-    # Linux
-    #- env: TARGET=aarch64-unknown-linux-gnu
-    #- env: TARGET=i686-unknown-linux-gnu
-    - env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 #UPLOAD_DOCS=1
-    # - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
-    # OSX
-    #- env: TARGET=x86_64-apple-darwin
-    #  os: osx
-
-before_install:
-  - set -e
-  - rustup self update
-  - rustup component add rustfmt
-
-install:
-  - sh ci/install.sh
-  - source ~/.cargo/env || true
-  - env | grep "TRAVIS"
-
-before_script:
-  - export PATH=$HOME/.cargo/bin:$PATH
-  - cargo install cargo-update || echo "cargo-update already installed"
-  - cargo install cargo-travis || echo "cargo-travis already installed"
-
-script:
-  - bash ci/script.sh
-  - cargo fmt --all -- --check
-
-before_deploy:
-  - sh ci/before_deploy.sh
-
-after_success:
-  # Needs GH_TOKEN env var to be set in travis settings
-  - if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
-  - if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
-
-#cache: cargo
-#before_cache:
-# # Travis can't cache files that are not readable by "others"
-# - chmod -R a+r $HOME/.cargo
-# - find ./target/debug -type f -maxdepth 1 -delete
-# - rm -f ./target/.rustc_info.json
-# - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
-# - rm -r target/debug/examples/
-# - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
-
-#branches:
-# only:
-# # release tags
-# - /^v\d+\.\d+\.\d+.*$/
-# - master
-
-notifications:
-  email:
-    on_success: never
```
**`CHANGELOG.md`** (35 changed lines)

```diff
@@ -1,3 +1,36 @@
+Tantivy 0.17
+================================
+- LogMergePolicy now triggers merges if the ratio of deleted documents reaches a threshold (@shikhar) [#115](https://github.com/quickwit-inc/tantivy/issues/115)
+- Adds a searcher Warmer API (@shikhar)
+- Change to non-strict schema. Ignore fields in data which are not defined in the schema. Previously this returned an error. #1211
+- Facets are necessarily indexed. An existing index with indexed facets should work out of the box. An index with facets marked `index: false` will be broken (but it was already broken in a sense). (@fulmicoton) #1195
+- Bugfix that could in theory impact durability on some filesystems [#1224](https://github.com/quickwit-inc/tantivy/issues/1224)
+- Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-inc/tantivy/issues/922)
+- Reduce the number of fsync calls [#1225](https://github.com/quickwit-inc/tantivy/issues/1225)
+
+Tantivy 0.16.2
+================================
+- Bugfix in FuzzyTermQuery. (transposition_cost_one was not doing anything)
+
+Tantivy 0.16.1
+========================
+- Major bugfix on multivalued fastfield. #1151
+- Demux operation (@PSeitz)
+
+Tantivy 0.16.0
+=========================
+- Bugfix in the filesum check. (@evanxg852000) #1127
+- Bugfix in positions when the index is sorted by a field. (@appaquet) #1125
+
+Tantivy 0.15.3
+=========================
+- Major bugfix. Deleting documents was broken when the index was sorted by a field. (@appaquet, @fulmicoton) #1101
+
+
+Tantivy 0.15.2
+========================
+- Major bugfix. DocStore still panics when a deleted doc is at the beginning of a block. (@appaquet) #1088
+
 Tantivy 0.15.1
 =========================
 - Major bugfix. DocStore panics when first block is deleted. (@appaquet) #1077
@@ -95,7 +128,7 @@ Tantivy 0.12.0
 ## How to update?

 Crates relying on custom tokenizer, or registering tokenizer in the manager will require some
-minor changes. Check https://github.com/tantivy-search/tantivy/blob/main/examples/custom_tokenizer.rs
+minor changes. Check https://github.com/quickwit-inc/tantivy/blob/main/examples/custom_tokenizer.rs
 to check for some code sample.

 Tantivy 0.11.3
```
**`Cargo.toml`** (31 changed lines)

```diff
@@ -1,13 +1,13 @@
 [package]
 name = "tantivy"
-version = "0.15.1"
+version = "0.17.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
 description = """Search engine library"""
 documentation = "https://docs.rs/tantivy/"
-homepage = "https://github.com/tantivy-search/tantivy"
-repository = "https://github.com/tantivy-search/tantivy"
+homepage = "https://github.com/quickwit-inc/tantivy"
+repository = "https://github.com/quickwit-inc/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2018"
@@ -19,8 +19,8 @@ crc32fast = "1.2.1"
 once_cell = "1.7.2"
 regex ={ version = "1.5.4", default-features = false, features = ["std"] }
 tantivy-fst = "0.3"
-memmap = {version = "0.7", optional=true}
-lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
+memmap2 = {version = "0.5", optional=true}
+lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true }
 brotli = { version = "3.3", optional = true }
 snap = { version = "1.0.5", optional = true }
 tempfile = { version = "3.2", optional = true }
@@ -31,10 +31,13 @@ num_cpus = "1.13"
 fs2={ version = "0.4.3", optional = true }
 levenshtein_automata = "0.2"
 uuid = { version = "0.8.2", features = ["v4", "serde"] }
-crossbeam = "0.8"
+crossbeam = "0.8.1"
 futures = { version = "0.3.15", features = ["thread-pool"] }
 tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
 tantivy-bitpacker = { version="0.1", path="./bitpacker" }
+common = { version = "0.1", path = "./common/", package = "tantivy-common" }
+fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
+ownedbytes = { version="0.2", path="./ownedbytes" }
 stable_deref_trait = "1.2"
 rust-stemmers = "1.2"
 downcast-rs = "1.2"
@@ -43,14 +46,15 @@ census = "0.4"
 fnv = "1.0.7"
 thiserror = "1.0.24"
 htmlescape = "0.3.1"
-fail = "0.4"
+fail = "0.5"
 murmurhash32 = "0.2"
 chrono = "0.4.19"
 smallvec = "1.6.1"
 rayon = "1.5"
-lru = "0.6.5"
+lru = "0.7.0"
 fastdivide = "0.3"
 itertools = "0.10.0"
+measure_time = "0.8.0"

 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"
@@ -60,10 +64,12 @@ rand = "0.8.3"
 maplit = "1.0.2"
 matches = "0.1.8"
 proptest = "1.0"
-criterion = "0.3.4"
+criterion = "0.3.5"
+test-log = "0.2.8"
+env_logger = "0.9.0"

 [dev-dependencies.fail]
-version = "0.4"
+version = "0.5"
 features = ["failpoints"]

 [profile.release]
@@ -77,7 +83,7 @@ overflow-checks = true

 [features]
 default = ["mmap", "lz4-compression" ]
-mmap = ["fs2", "tempfile", "memmap"]
+mmap = ["fs2", "tempfile", "memmap2"]

 brotli-compression = ["brotli"]
 lz4-compression = ["lz4_flex"]
@@ -85,10 +91,9 @@ snappy-compression = ["snap"]

 failpoints = ["fail/failpoints"]
 unstable = [] # useful for benches.
-wasm-bindgen = ["uuid/wasm-bindgen"]

 [workspace]
-members = ["query-grammar", "bitpacker"]
+members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]

 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }
```
**`README.md`** (18 changed lines; the badge image markup was stripped in extraction, leaving only the link targets)

```diff
@@ -1,9 +1,9 @@

-[](https://travis-ci.org/tantivy-search/tantivy)
-[](https://codecov.io/gh/tantivy-search/tantivy)
-[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
-[](https://docs.rs/crate/tantivy/)
+[](https://github.com/quickwit-inc/tantivy/actions/workflows/test.yml)
+[](https://codecov.io/gh/quickwit-inc/tantivy)
+[](https://discord.gg/MT27AG5EVE)
 [](https://opensource.org/licenses/MIT)
 [](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/main)
 [](https://crates.io/crates/tantivy)

@@ -17,9 +17,6 @@
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)

-[](https://www.patreon.com/fulmicoton)
-
-
 **Tantivy** is a **full text search engine library** written in Rust.

 It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
@@ -78,13 +75,12 @@ It walks you through getting a wikipedia search engine up and running in a few m

 There are many ways to support this project.

-- Use Tantivy and tell us about your experience on [Gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
+- Use Tantivy and tell us about your experience on [Discord](https://discord.gg/MT27AG5EVE) or by email (paul.masurel@gmail.com)
 - Report bugs
 - Write a blog post
 - Help with documentation by asking questions or submitting PRs
-- Contribute code (you can join [our Gitter](https://gitter.im/tantivy-search/tantivy))
+- Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE))
 - Talk about Tantivy around you
-- [](https://www.patreon.com/fulmicoton)

 # Contributing code
```

````diff
@@ -96,7 +92,7 @@ Tantivy compiles on stable Rust but requires `Rust >= 1.27`.
 To check out and run tests, you can simply run:

 ```bash
-git clone https://github.com/tantivy-search/tantivy.git
+git clone https://github.com/quickwit-inc/tantivy.git
 cd tantivy
 cargo build
 ```
````
In the tokenizer benchmark:

```diff
@@ -1,7 +1,7 @@
 use criterion::{criterion_group, criterion_main, Criterion};
 use tantivy::tokenizer::TokenizerManager;

-const ALICE_TXT: &'static str = include_str!("alice.txt");
+const ALICE_TXT: &str = include_str!("alice.txt");

 pub fn criterion_benchmark(c: &mut Criterion) {
     let tokenizer_manager = TokenizerManager::default();
```
In the `Cargo.toml` of the `tantivy-bitpacker` subcrate:

```diff
@@ -1,12 +1,12 @@
 [package]
 name = "tantivy-bitpacker"
-version = "0.1.0"
+version = "0.1.1"
 edition = "2018"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = []
 description = """Tantivy-sub crate: bitpacking"""
-repository = "https://github.com/tantivy-search/tantivy"
+repository = "https://github.com/quickwit-inc/tantivy"
 keywords = []
```
In the bitpacker sources, the flush path now also resets `mini_buffer` (not just the written counter), and `BitUnpacker` derives more traits:

```diff
@@ -49,6 +49,7 @@ impl BitPacker {
             let bytes = self.mini_buffer.to_le_bytes();
             output.write_all(&bytes[..num_bytes])?;
             self.mini_buffer_written = 0;
+            self.mini_buffer = 0;
         }
         Ok(())
     }
@@ -61,7 +62,7 @@ impl BitPacker {
     }
 }

-#[derive(Clone)]
+#[derive(Clone, Debug, Default)]
 pub struct BitUnpacker {
     num_bits: u64,
     mask: u64,
```
A second hunk in the bitpacker sources adds tests for `compute_num_bits` and `minmax` (moved here from the main crate, as shown in the `lib.rs` diff further down):

```diff
@@ -50,3 +50,32 @@
     }
     None
 }
+
+#[test]
+fn test_compute_num_bits() {
+    assert_eq!(compute_num_bits(1), 1u8);
+    assert_eq!(compute_num_bits(0), 0u8);
+    assert_eq!(compute_num_bits(2), 2u8);
+    assert_eq!(compute_num_bits(3), 2u8);
+    assert_eq!(compute_num_bits(4), 3u8);
+    assert_eq!(compute_num_bits(255), 8u8);
+    assert_eq!(compute_num_bits(256), 9u8);
+    assert_eq!(compute_num_bits(5_000_000_000), 33u8);
+}
+
+#[test]
+fn test_minmax_empty() {
+    let vals: Vec<u32> = vec![];
+    assert_eq!(minmax(vals.into_iter()), None);
+}
+
+#[test]
+fn test_minmax_one() {
+    assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
+}
+
+#[test]
+fn test_minmax_two() {
+    assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
+    assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
+}
```
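The assertions above fully pin down the contract of `compute_num_bits`: it returns the bit width needed to represent `val`, i.e. the position of the highest set bit. A minimal sketch consistent with those assertions — the crate's actual implementation may differ:

```rust
/// Bit width needed to represent `val`; consistent with the assertions in
/// `test_compute_num_bits` above (e.g. 255 -> 8, 256 -> 9, 0 -> 0).
fn compute_num_bits(val: u64) -> u8 {
    (64 - val.leading_zeros()) as u8
}

fn main() {
    // 2^32 <= 5e9 < 2^33, so 33 bits are needed.
    assert_eq!(compute_num_bits(5_000_000_000), 33);
}
```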
**`common/Cargo.toml`** (new file, 17 lines)

```toml
[package]
name = "tantivy-common"
version = "0.1.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
description = "common traits and utility functions used by multiple tantivy subcrates"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
byteorder = "1.4.3"
ownedbytes = { version="0.2", path="../ownedbytes" }

[dev-dependencies]
proptest = "1.0.0"
rand = "0.8.4"
```
**`common/src/bitset.rs`** (new file, 748 lines)

```rust
use ownedbytes::OwnedBytes;
use std::convert::TryInto;
use std::io::Write;
use std::u64;
use std::{fmt, io};

#[derive(Clone, Copy, Eq, PartialEq)]
pub struct TinySet(u64);

impl fmt::Debug for TinySet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.into_iter().collect::<Vec<u32>>().fmt(f)
    }
}

pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator {
    type Item = u32;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.0.pop_lowest()
    }
}

impl IntoIterator for TinySet {
    type Item = u32;
    type IntoIter = TinySetIterator;
    fn into_iter(self) -> Self::IntoIter {
        TinySetIterator(self)
    }
}

impl TinySet {
    pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
        writer.write_all(self.0.to_le_bytes().as_ref())
    }

    pub fn into_bytes(self) -> [u8; 8] {
        self.0.to_le_bytes()
    }

    #[inline]
    pub fn deserialize(data: [u8; 8]) -> Self {
        let val: u64 = u64::from_le_bytes(data);
        TinySet(val)
    }

    /// Returns an empty `TinySet`.
    #[inline]
    pub fn empty() -> TinySet {
        TinySet(0u64)
    }

    /// Returns a full `TinySet`.
    #[inline]
    pub fn full() -> TinySet {
        TinySet::empty().complement()
    }

    pub fn clear(&mut self) {
        self.0 = 0u64;
    }

    /// Returns the complement of the set in `[0, 64[`.
    ///
    /// Careful on making this function public, as it will break the padding handling in the last
    /// bucket.
    #[inline]
    fn complement(self) -> TinySet {
        TinySet(!self.0)
    }

    /// Returns true iff the `TinySet` contains the element `el`.
    #[inline]
    pub fn contains(self, el: u32) -> bool {
        !self.intersect(TinySet::singleton(el)).is_empty()
    }

    /// Returns the number of elements in the TinySet.
    #[inline]
    pub fn len(self) -> u32 {
        self.0.count_ones()
    }

    /// Returns the intersection of `self` and `other`
    #[inline]
    #[must_use]
    pub fn intersect(self, other: TinySet) -> TinySet {
        TinySet(self.0 & other.0)
    }

    /// Creates a new `TinySet` containing only one element
    /// within `[0; 64[`
    #[inline]
    pub fn singleton(el: u32) -> TinySet {
        TinySet(1u64 << u64::from(el))
    }

    /// Insert a new element within [0..64)
    #[inline]
    #[must_use]
    pub fn insert(self, el: u32) -> TinySet {
        self.union(TinySet::singleton(el))
    }

    /// Removes an element within [0..64)
    #[inline]
    #[must_use]
    pub fn remove(self, el: u32) -> TinySet {
        self.intersect(TinySet::singleton(el).complement())
    }

    /// Insert a new element within [0..64)
    ///
    /// returns true if the set changed
    #[inline]
    pub fn insert_mut(&mut self, el: u32) -> bool {
        let old = *self;
        *self = old.insert(el);
        old != *self
    }

    /// Remove an element within [0..64)
    ///
    /// returns true if the set changed
    #[inline]
    pub fn remove_mut(&mut self, el: u32) -> bool {
        let old = *self;
        *self = old.remove(el);
        old != *self
    }

    /// Returns the union of two tinysets
    #[inline]
    #[must_use]
    pub fn union(self, other: TinySet) -> TinySet {
        TinySet(self.0 | other.0)
    }

    /// Returns true iff the `TinySet` is empty.
    #[inline]
    pub fn is_empty(self) -> bool {
        self.0 == 0u64
    }

    /// Returns the lowest element in the `TinySet`
    /// and removes it.
    #[inline]
    pub fn pop_lowest(&mut self) -> Option<u32> {
        if self.is_empty() {
            None
        } else {
            let lowest = self.0.trailing_zeros() as u32;
            self.0 ^= TinySet::singleton(lowest).0;
            Some(lowest)
        }
    }

    /// Returns a `TinySet` that contains all values up
    /// to limit excluded.
    ///
    /// The limit is assumed to be strictly lower than 64.
    pub fn range_lower(upper_bound: u32) -> TinySet {
        TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
    }

    /// Returns a `TinySet` that contains all values greater
    /// or equal to the given limit, included. (and up to 63)
    ///
    /// The limit is assumed to be strictly lower than 64.
    pub fn range_greater_or_equal(from_included: u32) -> TinySet {
        TinySet::range_lower(from_included).complement()
    }
}

#[derive(Clone)]
pub struct BitSet {
    tinysets: Box<[TinySet]>,
    len: u64,
    max_value: u32,
}

fn num_buckets(max_val: u32) -> u32 {
    (max_val + 63u32) / 64u32
}

impl BitSet {
    /// Serialize a `BitSet`.
    pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
        writer.write_all(self.max_value.to_le_bytes().as_ref())?;
        for tinyset in self.tinysets.iter().cloned() {
            writer.write_all(&tinyset.into_bytes())?;
        }
        writer.flush()?;
        Ok(())
    }

    /// Create a new `BitSet` that may contain elements
    /// within `[0, max_val)`.
    pub fn with_max_value(max_value: u32) -> BitSet {
        let num_buckets = num_buckets(max_value);
        let tinybitsets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
        BitSet {
            tinysets: tinybitsets,
            len: 0,
            max_value,
        }
    }

    /// Create a new `BitSet` that may contain elements within `[0, max_val)`.
    /// Initially all values will be set.
    pub fn with_max_value_and_full(max_value: u32) -> BitSet {
        let num_buckets = num_buckets(max_value);
        let mut tinybitsets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();

        // Fix padding
        let lower = max_value % 64u32;
        if lower != 0 {
            tinybitsets[tinybitsets.len() - 1] = TinySet::range_lower(lower);
        }
        BitSet {
            tinysets: tinybitsets,
            len: max_value as u64,
            max_value,
        }
    }

    /// Removes all elements from the `BitSet`.
    pub fn clear(&mut self) {
        for tinyset in self.tinysets.iter_mut() {
            *tinyset = TinySet::empty();
        }
    }

    /// Intersect with serialized bitset
    pub fn intersect_update(&mut self, other: &ReadOnlyBitSet) {
        self.intersect_update_with_iter(other.iter_tinysets());
    }

    /// Intersect with tinysets
    fn intersect_update_with_iter(&mut self, other: impl Iterator<Item = TinySet>) {
        self.len = 0;
        for (left, right) in self.tinysets.iter_mut().zip(other) {
            *left = left.intersect(right);
            self.len += left.len() as u64;
        }
    }

    /// Returns the number of elements in the `BitSet`.
    #[inline]
    pub fn len(&self) -> usize {
        self.len as usize
    }

    /// Inserts an element in the `BitSet`
    #[inline]
    pub fn insert(&mut self, el: u32) {
        // we do not check saturated els.
        let higher = el / 64u32;
        let lower = el % 64u32;
        self.len += if self.tinysets[higher as usize].insert_mut(lower) {
            1
        } else {
            0
        };
    }

    /// Removes an element from the `BitSet`
    #[inline]
    pub fn remove(&mut self, el: u32) {
        // we do not check saturated els.
        let higher = el / 64u32;
        let lower = el % 64u32;
        self.len -= if self.tinysets[higher as usize].remove_mut(lower) {
            1
        } else {
            0
        };
    }

    /// Returns true iff the element is in the `BitSet`.
    #[inline]
    pub fn contains(&self, el: u32) -> bool {
        self.tinyset(el / 64u32).contains(el % 64)
    }

    /// Returns the first non-empty `TinySet` associated to a bucket lower
    /// or greater than bucket.
    ///
    /// Reminder: the tiny set with the bucket `bucket`, represents the
    /// elements from `bucket * 64` to `(bucket+1) * 64`.
    pub fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
        self.tinysets[bucket as usize..]
            .iter()
            .cloned()
            .position(|tinyset| !tinyset.is_empty())
            .map(|delta_bucket| bucket + delta_bucket as u32)
    }

    #[inline]
    pub fn max_value(&self) -> u32 {
        self.max_value
    }

    /// Returns the tiny bitset representing the
    /// set restricted to the number range from
    /// `bucket * 64` to `(bucket + 1) * 64`.
    pub fn tinyset(&self, bucket: u32) -> TinySet {
        self.tinysets[bucket as usize]
    }
}

/// Serialized BitSet.
#[derive(Clone)]
pub struct ReadOnlyBitSet {
    data: OwnedBytes,
    max_value: u32,
}

pub fn intersect_bitsets(left: &ReadOnlyBitSet, other: &ReadOnlyBitSet) -> ReadOnlyBitSet {
    assert_eq!(left.max_value(), other.max_value());
    assert_eq!(left.data.len(), other.data.len());
    let union_tinyset_it = left
        .iter_tinysets()
        .zip(other.iter_tinysets())
        .map(|(left_tinyset, right_tinyset)| left_tinyset.intersect(right_tinyset));
    let mut output_dataset: Vec<u8> = Vec::with_capacity(left.data.len());
    for tinyset in union_tinyset_it {
        output_dataset.extend_from_slice(&tinyset.into_bytes());
    }
    ReadOnlyBitSet {
        data: OwnedBytes::new(output_dataset),
        max_value: left.max_value(),
    }
}

impl ReadOnlyBitSet {
    pub fn open(data: OwnedBytes) -> Self {
        let (max_value_data, data) = data.split(4);
        assert_eq!(data.len() % 8, 0);
        let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap());
        ReadOnlyBitSet { data, max_value }
    }

    /// Number of elements in the bitset.
    #[inline]
    pub fn len(&self) -> usize {
        self.iter_tinysets()
            .map(|tinyset| tinyset.len() as usize)
            .sum()
    }

    /// Iterate the tinysets on the fly from serialized data.
    #[inline]
    fn iter_tinysets(&self) -> impl Iterator<Item = TinySet> + '_ {
        self.data.chunks_exact(8).map(move |chunk| {
            let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap());
            tinyset
        })
    }

    /// Iterate over the positions of the elements.
    #[inline]
    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
        self.iter_tinysets()
            .enumerate()
            .flat_map(move |(chunk_num, tinyset)| {
                let chunk_base_val = chunk_num as u32 * 64;
                tinyset
                    .into_iter()
                    .map(move |val| val + chunk_base_val)
                    .take_while(move |doc| *doc < self.max_value)
            })
    }

    /// Returns true iff the element is in the `BitSet`.
    #[inline]
    pub fn contains(&self, el: u32) -> bool {
        let byte_offset = el / 8u32;
        let b: u8 = self.data[byte_offset as usize];
        let shift = (el % 8) as u8;
        b & (1u8 << shift) != 0
    }

    /// Maximum value the bitset may contain.
    /// (Note this is not the maximum value contained in the set.)
    ///
    /// A bitset has an intrinsic capacity.
    /// It only stores elements within [0..max_value).
    #[inline]
    pub fn max_value(&self) -> u32 {
        self.max_value
    }

    /// Number of bytes used in the bitset representation.
    pub fn num_bytes(&self) -> usize {
        self.data.len()
    }
}

impl<'a> From<&'a BitSet> for ReadOnlyBitSet {
    fn from(bitset: &'a BitSet) -> ReadOnlyBitSet {
        let mut buffer = Vec::with_capacity(bitset.tinysets.len() * 8 + 4);
        bitset
            .serialize(&mut buffer)
            .expect("serializing into a buffer should never fail");
        ReadOnlyBitSet::open(OwnedBytes::new(buffer))
    }
}

#[cfg(test)]
mod tests {

    use super::BitSet;
    use super::ReadOnlyBitSet;
    use super::TinySet;
    use ownedbytes::OwnedBytes;
    use rand::distributions::Bernoulli;
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};
    use std::collections::HashSet;

    #[test]
    fn test_read_serialized_bitset_full_multi() {
        for i in 0..1000 {
            let bitset = BitSet::with_max_value_and_full(i);
            let mut out = vec![];
            bitset.serialize(&mut out).unwrap();

            let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
            assert_eq!(bitset.len() as usize, i as usize);
        }
    }

    #[test]
    fn test_read_serialized_bitset_full_block() {
        let bitset = BitSet::with_max_value_and_full(64);
        let mut out = vec![];
        bitset.serialize(&mut out).unwrap();

        let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
        assert_eq!(bitset.len() as usize, 64 as usize);
    }

    #[test]
    fn test_read_serialized_bitset_full() {
        let mut bitset = BitSet::with_max_value_and_full(5);
        bitset.remove(3);
        let mut out = vec![];
        bitset.serialize(&mut out).unwrap();

        let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
        assert_eq!(bitset.len(), 4);
    }

    #[test]
    fn test_bitset_intersect() {
        let bitset_serialized = {
            let mut bitset = BitSet::with_max_value_and_full(5);
            bitset.remove(1);
            bitset.remove(3);
            let mut out = vec![];
            bitset.serialize(&mut out).unwrap();

            ReadOnlyBitSet::open(OwnedBytes::new(out))
        };

        let mut bitset = BitSet::with_max_value_and_full(5);
        bitset.remove(1);
        bitset.intersect_update(&bitset_serialized);

        assert!(bitset.contains(0));
        assert!(!bitset.contains(1));
        assert!(bitset.contains(2));
        assert!(!bitset.contains(3));
        assert!(bitset.contains(4));

        bitset.intersect_update_with_iter(vec![TinySet::singleton(0)].into_iter());

        assert!(bitset.contains(0));
        assert!(!bitset.contains(1));
        assert!(!bitset.contains(2));
        assert!(!bitset.contains(3));
        assert!(!bitset.contains(4));
        assert_eq!(bitset.len(), 1);

        bitset.intersect_update_with_iter(vec![TinySet::singleton(1)].into_iter());
        assert!(!bitset.contains(0));
        assert!(!bitset.contains(1));
        assert!(!bitset.contains(2));
        assert!(!bitset.contains(3));
        assert!(!bitset.contains(4));
        assert_eq!(bitset.len(), 0);
    }

    #[test]
    fn test_read_serialized_bitset_empty() {
        let mut bitset = BitSet::with_max_value(5);
        bitset.insert(3);
        let mut out = vec![];
        bitset.serialize(&mut out).unwrap();

        let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
        assert_eq!(bitset.len(), 1);

        {
            let bitset = BitSet::with_max_value(5);
            let mut out = vec![];
            bitset.serialize(&mut out).unwrap();
            let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
            assert_eq!(bitset.len(), 0);
        }
    }

    #[test]
    fn test_tiny_set_remove() {
        {
            let mut u = TinySet::empty().insert(63u32).insert(5).remove(63u32);
            assert_eq!(u.pop_lowest(), Some(5u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty()
                .insert(63u32)
                .insert(1)
                .insert(5)
                .remove(63u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert_eq!(u.pop_lowest(), Some(5u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty().insert(1).remove(63u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty().insert(1).remove(1u32);
            assert!(u.pop_lowest().is_none());
        }
    }

    #[test]
    fn test_tiny_set() {
        assert!(TinySet::empty().is_empty());
        {
            let mut u = TinySet::empty().insert(1u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none())
        }
        {
            let mut u = TinySet::empty().insert(1u32).insert(1u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none())
        }
        {
            let mut u = TinySet::empty().insert(2u32);
            assert_eq!(u.pop_lowest(), Some(2u32));
            u.insert_mut(1u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty().insert(63u32);
            assert_eq!(u.pop_lowest(), Some(63u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty().insert(63u32).insert(5);
            assert_eq!(u.pop_lowest(), Some(5u32));
            assert_eq!(u.pop_lowest(), Some(63u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let original = TinySet::empty().insert(63u32).insert(5);
            let after_serialize_deserialize = TinySet::deserialize(original.into_bytes());
            assert_eq!(original, after_serialize_deserialize);
        }
    }

    #[test]
    fn test_bitset() {
        let test_against_hashset = |els: &[u32], max_value: u32| {
            let mut hashset: HashSet<u32> = HashSet::new();
            let mut bitset = BitSet::with_max_value(max_value);
            for &el in els {
                assert!(el < max_value);
                hashset.insert(el);
                bitset.insert(el);
            }
            for el in 0..max_value {
                assert_eq!(hashset.contains(&el), bitset.contains(el));
            }
            assert_eq!(bitset.max_value(), max_value);

            // test deser
            let mut data = vec![];
            bitset.serialize(&mut data).unwrap();
            let ro_bitset = ReadOnlyBitSet::open(OwnedBytes::new(data));
            for el in 0..max_value {
                assert_eq!(hashset.contains(&el), ro_bitset.contains(el));
            }
            assert_eq!(ro_bitset.max_value(), max_value);
            assert_eq!(ro_bitset.len(), els.len());
        };

        test_against_hashset(&[], 0);
        test_against_hashset(&[], 1);
        test_against_hashset(&[0u32], 1);
        test_against_hashset(&[0u32], 100);
        test_against_hashset(&[1u32, 2u32], 4);
        test_against_hashset(&[99u32], 100);
        test_against_hashset(&[63u32], 64);
        test_against_hashset(&[62u32, 63u32], 64);
    }

    #[test]
    fn test_bitset_num_buckets() {
        use super::num_buckets;
        assert_eq!(num_buckets(0u32), 0);
        assert_eq!(num_buckets(1u32), 1);
        assert_eq!(num_buckets(64u32), 1);
        assert_eq!(num_buckets(65u32), 2);
        assert_eq!(num_buckets(128u32), 2);
        assert_eq!(num_buckets(129u32), 3);
    }

    #[test]
    fn test_tinyset_range() {
        assert_eq!(
            TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
            [0, 1, 2]
        );
        assert!(TinySet::range_lower(0).is_empty());
        assert_eq!(
            TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
            (0u32..63u32).collect::<Vec<_>>()
        );
        assert_eq!(
            TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
            [0]
        );
        assert_eq!(
            TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
            [0, 1]
        );
        assert_eq!(
            TinySet::range_greater_or_equal(3)
                .into_iter()
                .collect::<Vec<u32>>(),
            (3u32..64u32).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_bitset_len() {
        let mut bitset = BitSet::with_max_value(1_000);
        assert_eq!(bitset.len(), 0);
        bitset.insert(3u32);
        assert_eq!(bitset.len(), 1);
        bitset.insert(103u32);
        assert_eq!(bitset.len(), 2);
        bitset.insert(3u32);
        assert_eq!(bitset.len(), 2);
        bitset.insert(103u32);
        assert_eq!(bitset.len(), 2);
        bitset.insert(104u32);
        assert_eq!(bitset.len(), 3);
        bitset.remove(105u32);
        assert_eq!(bitset.len(), 3);
        bitset.remove(104u32);
        assert_eq!(bitset.len(), 2);
        bitset.remove(3u32);
        assert_eq!(bitset.len(), 1);
        bitset.remove(103u32);
        assert_eq!(bitset.len(), 0);
    }

    pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
        StdRng::from_seed([seed_val; 32])
            .sample_iter(&Bernoulli::new(ratio).unwrap())
            .take(n as usize)
            .enumerate()
            .filter_map(|(val, keep)| if keep { Some(val as u32) } else { None })
            .collect()
    }

    pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
        sample_with_seed(n, ratio, 4)
    }

    #[test]
    fn test_bitset_clear() {
        let mut bitset = BitSet::with_max_value(1_000);
        let els = sample(1_000, 0.01f64);
        for &el in &els {
            bitset.insert(el);
        }
        assert!(els.iter().all(|el| bitset.contains(*el)));
        bitset.clear();
        for el in 0u32..1000u32 {
            assert!(!bitset.contains(el));
        }
    }
}

#[cfg(all(test, feature = "unstable"))]
mod bench {

    use super::BitSet;
    use super::TinySet;
    use test;

    #[bench]
    fn bench_tinyset_pop(b: &mut test::Bencher) {
        b.iter(|| {
            let mut tinyset = TinySet::singleton(test::black_box(31u32));
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
        });
    }

    #[bench]
    fn bench_tinyset_sum(b: &mut test::Bencher) {
        let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
        b.iter(|| {
            assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
        });
    }

    #[bench]
    fn bench_tinyarr_sum(b: &mut test::Bencher) {
        let v = [10u32, 14u32, 21u32];
        b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
    }

    #[bench]
    fn bench_bitset_initialize(b: &mut test::Bencher) {
        b.iter(|| BitSet::with_max_value(1_000_000));
    }
}
```
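Taken together, the new types split mutation from read-only querying: a `BitSet` is built in memory, serialized as a 4-byte little-endian `max_value` header followed by one 8-byte `TinySet` per 64-value bucket, and reopened without copying as a `ReadOnlyBitSet`. A round-trip sketch using only the API shown above (assumes the types are in scope):

```rust
use ownedbytes::OwnedBytes;

fn roundtrip() -> std::io::Result<()> {
    // Mutable bitset over values [0, 1000).
    let mut alive = BitSet::with_max_value(1000);
    alive.insert(3);
    alive.insert(64);

    // Persist: 4-byte max_value header + 8 bytes per 64-value bucket.
    let mut buf: Vec<u8> = Vec::new();
    alive.serialize(&mut buf)?;

    // Reopen and query directly against the serialized bytes.
    let ro = ReadOnlyBitSet::open(OwnedBytes::new(buf));
    assert!(ro.contains(64));
    assert_eq!(ro.len(), 2);
    assert_eq!(ro.iter().collect::<Vec<u32>>(), vec![3, 64]);
    Ok(())
}
```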
In the `tantivy-common` crate root (`lib.rs`), the `composite_file` and `counting_writer` modules give way to a new `writer` module, and `MAX_DOC_LIMIT` (together with its test and the `minmax`/`compute_num_bits` tests, now in the bitpacker crate) moves out:

```diff
@@ -1,23 +1,18 @@
-mod bitset;
-mod composite_file;
-mod counting_writer;
-mod serialize;
-mod vint;
+#![allow(clippy::len_without_is_empty)]

 use std::ops::Deref;

-pub use self::bitset::BitSet;
-pub(crate) use self::bitset::TinySet;
-pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
-pub use self::counting_writer::CountingWriter;
-pub use self::serialize::{BinarySerializable, DeserializeFrom, FixedSize};
-pub use self::vint::{
-    read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
-};
-pub use byteorder::LittleEndian as Endianness;
-
-/// Segment's max doc must be `< MAX_DOC_LIMIT`.
-///
-/// We do not allow segments with more than
-pub const MAX_DOC_LIMIT: u32 = 1 << 31;
+mod bitset;
+mod serialize;
+mod vint;
+mod writer;
+
+pub use bitset::*;
+pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
+pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
+pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
+pub use byteorder::LittleEndian as Endianness;

 /// Has length trait
 pub trait HasLen {
@@ -30,6 +25,12 @@ pub trait HasLen {
     }
 }

+impl<T: Deref<Target = [u8]>> HasLen for T {
+    fn len(&self) -> usize {
+        self.deref().len()
+    }
+}
+
 const HIGHEST_BIT: u64 = 1 << 63;

 /// Maps a `i64` to `u64`
@@ -101,14 +102,12 @@ pub fn u64_to_f64(val: u64) -> f64 {
 }

 #[cfg(test)]
-pub(crate) mod test {
+pub mod test {

-    pub use super::serialize::test::fixed_size_test;
     use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
+    use super::{BinarySerializable, FixedSize};
     use proptest::prelude::*;
     use std::f64;
-    use tantivy_bitpacker::compute_num_bits;
-    pub use tantivy_bitpacker::minmax;

     fn test_i64_converter_helper(val: i64) {
         assert_eq!(u64_to_i64(i64_to_u64(val)), val);
@@ -118,6 +117,12 @@ pub mod test {
         assert_eq!(u64_to_f64(f64_to_u64(val)), val);
     }

+    pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
+        let mut buffer = Vec::new();
+        O::default().serialize(&mut buffer).unwrap();
+        assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
+    }
+
     proptest! {
         #[test]
         fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
@@ -161,40 +166,4 @@ pub mod test {
         assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
         assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
     }
-
-    #[test]
-    fn test_compute_num_bits() {
-        assert_eq!(compute_num_bits(1), 1u8);
-        assert_eq!(compute_num_bits(0), 0u8);
-        assert_eq!(compute_num_bits(2), 2u8);
-        assert_eq!(compute_num_bits(3), 2u8);
-        assert_eq!(compute_num_bits(4), 3u8);
-        assert_eq!(compute_num_bits(255), 8u8);
-        assert_eq!(compute_num_bits(256), 9u8);
-        assert_eq!(compute_num_bits(5_000_000_000), 33u8);
-    }
-
-    #[test]
-    fn test_max_doc() {
-        // this is the first time I write a unit test for a constant.
-        assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
-        assert!((super::MAX_DOC_LIMIT as i32) < 0);
-    }
-
-    #[test]
-    fn test_minmax_empty() {
-        let vals: Vec<u32> = vec![];
-        assert_eq!(minmax(vals.into_iter()), None);
-    }
-
-    #[test]
-    fn test_minmax_one() {
-        assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
-    }
-
-    #[test]
-    fn test_minmax_two() {
-        assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
-        assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
-    }
 }
```
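The `i64_to_u64` helper whose context appears in this hunk relies on `HIGHEST_BIT`: flipping the sign bit maps `i64::MIN..=i64::MAX` monotonically onto `0..=u64::MAX`, which is what lets fast fields store signed values as ordered unsigned integers. A self-contained sketch of the idea (the crate's own definitions may differ in detail):

```rust
const HIGHEST_BIT: u64 = 1 << 63;

/// Order-preserving map from i64 to u64: flip the sign bit.
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ HIGHEST_BIT
}

/// Inverse of `i64_to_u64`.
fn u64_to_i64(val: u64) -> i64 {
    (val ^ HIGHEST_BIT) as i64
}

fn main() {
    assert_eq!(i64_to_u64(i64::MIN), 0);
    assert_eq!(i64_to_u64(-1), HIGHEST_BIT - 1);
    assert_eq!(i64_to_u64(0), HIGHEST_BIT);
    assert!(i64_to_u64(-5) < i64_to_u64(3)); // order is preserved
    assert_eq!(u64_to_i64(i64_to_u64(-42)), -42); // round-trip
}
```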
In `serialize.rs` (moved into the common crate, so `crate::common::*` paths become `crate::*`), `bool` gains `BinarySerializable` and `FixedSize` implementations:

```diff
@@ -1,5 +1,5 @@
-use crate::common::Endianness;
-use crate::common::VInt;
+use crate::Endianness;
+use crate::VInt;
 use byteorder::{ReadBytesExt, WriteBytesExt};
 use std::fmt;
 use std::io;
@@ -160,6 +160,28 @@ impl FixedSize for u8 {
     const SIZE_IN_BYTES: usize = 1;
 }

+impl BinarySerializable for bool {
+    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
+        let val = if *self { 1 } else { 0 };
+        writer.write_u8(val)
+    }
+    fn deserialize<R: Read>(reader: &mut R) -> io::Result<bool> {
+        let val = reader.read_u8()?;
+        match val {
+            0 => Ok(false),
+            1 => Ok(true),
+            _ => Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "invalid bool value on deserialization, data corrupted",
+            )),
+        }
+    }
+}
+
+impl FixedSize for bool {
+    const SIZE_IN_BYTES: usize = 1;
+}
+
 impl BinarySerializable for String {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
         let data: &[u8] = self.as_bytes();
@@ -180,9 +202,9 @@ impl BinarySerializable for String {
 #[cfg(test)]
 pub mod test {

+    use super::VInt;
     use super::*;
-    use crate::common::VInt;
-
-    use crate::serialize::BinarySerializable;
+
     pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
         let mut buffer = Vec::new();
         O::default().serialize(&mut buffer).unwrap();
```
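The new `bool` implementation makes the `BinarySerializable` round-trip easy to demonstrate; a short usage sketch against the impl shown above (assumes the trait is in scope):

```rust
use std::io::Cursor;

fn roundtrip_bool() -> std::io::Result<()> {
    let mut buf: Vec<u8> = Vec::new();
    true.serialize(&mut buf)?;  // writes a single byte: 1
    false.serialize(&mut buf)?; // writes a single byte: 0
    assert_eq!(buf, vec![1u8, 0u8]);

    let mut reader = Cursor::new(buf);
    assert!(bool::deserialize(&mut reader)?);
    assert!(!bool::deserialize(&mut reader)?);
    // Any byte other than 0 or 1 yields an InvalidData error.
    Ok(())
}
```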
In `vint.rs`:

```diff
@@ -106,7 +106,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
 pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
     let mut buf = [0u8; 8];
     let data = serialize_vint_u32(val, &mut buf);
-    writer.write_all(&data)
+    writer.write_all(data)
 }

 impl VInt {
@@ -175,14 +175,14 @@ impl BinarySerializable for VInt {
 mod tests {

     use super::serialize_vint_u32;
+    use super::BinarySerializable;
     use super::VInt;
-    use crate::common::BinarySerializable;

     fn aux_test_vint(val: u64) {
         let mut v = [14u8; 10];
         let num_bytes = VInt(val).serialize_into(&mut v);
-        for i in num_bytes..10 {
-            assert_eq!(v[i], 14u8);
+        for el in &v[num_bytes..10] {
+            assert_eq!(el, &14u8);
         }
         assert!(num_bytes > 0);
         if num_bytes < 10 {
```
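For context, the vints touched here use a 7-bits-per-byte encoding with a continuation bit, and `serialize_vint_u32` hands back the written prefix of an 8-byte scratch buffer — which is why the extra `&data` borrow could be dropped. A standalone sketch of that style of encoding, not the crate's exact code:

```rust
/// Minimal LEB128-style varint encoder, sketching what a function like
/// `serialize_vint_u32` does; the crate's implementation may differ.
fn vint_encode(mut val: u32, buf: &mut [u8; 8]) -> &[u8] {
    let mut i = 0;
    loop {
        let byte = (val & 0x7F) as u8;
        val >>= 7;
        if val == 0 {
            buf[i] = byte; // last byte: continuation bit clear
            i += 1;
            return &buf[..i]; // written prefix of the scratch buffer
        }
        buf[i] = byte | 0x80; // continuation bit set
        i += 1;
    }
}

fn main() {
    let mut buf = [0u8; 8];
    assert_eq!(vint_encode(127, &mut buf), &[0x7F]);
    assert_eq!(vint_encode(128, &mut buf), &[0x80, 0x01]);
}
```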
In `writer.rs` (formerly `counting_writer.rs`; `AntiCallToken` and `TerminatingWrite` now live here instead of in `crate::directory`):

```diff
@@ -1,7 +1,4 @@
-use crate::directory::AntiCallToken;
-use crate::directory::TerminatingWrite;
-use std::io;
-use std::io::Write;
+use std::io::{self, BufWriter, Write};

 pub struct CountingWriter<W> {
     underlying: W,
```

The second hunk (`@@ -16,41 +13,87 @@`) grows the file from 41 to 87 lines; after the change it reads:

```rust
    #[inline]
    pub fn written_bytes(&self) -> u64 {
        self.written_bytes
    }

    /// Returns the underlying write object.
    /// Note that this method does not trigger any flushing.
    #[inline]
    pub fn finish(self) -> W {
        self.underlying
    }
}

impl<W: Write> Write for CountingWriter<W> {
    #[inline]
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let written_size = self.underlying.write(buf)?;
        self.written_bytes += written_size as u64;
        Ok(written_size)
    }

    #[inline]
    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
        self.underlying.write_all(buf)?;
        self.written_bytes += buf.len() as u64;
        Ok(())
    }

    #[inline]
    fn flush(&mut self) -> io::Result<()> {
        self.underlying.flush()
    }
}

impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
    #[inline]
    fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
        self.underlying.terminate_ref(token)
    }
}

/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly
///
/// The point is that while the type is public, it cannot be built by anyone
/// outside of this module.
pub struct AntiCallToken(());

/// Trait used to indicate when no more write need to be done on a writer
pub trait TerminatingWrite: Write {
    /// Indicate that the writer will no longer be used. Internally call terminate_ref.
    fn terminate(mut self) -> io::Result<()>
    where
        Self: Sized,
    {
        self.terminate_ref(AntiCallToken(()))
    }

    /// You should implement this function to define custom behavior.
    /// This function should flush any buffer it may hold.
    fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>;
}

impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> {
    fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
        self.as_mut().terminate_ref(token)
    }
}

impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
    fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> {
        self.flush()?;
        self.get_mut().terminate_ref(a)
    }
}

impl<'a> TerminatingWrite for &'a mut Vec<u8> {
    fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
        self.flush()
    }
}

#[cfg(test)]
mod test {
```
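A usage note on the pattern above: since `AntiCallToken` can only be constructed inside this module, downstream code can terminate a writer only through `terminate()`, which guarantees `terminate_ref` runs exactly once at the end of the writer's life. A hedged sketch of a custom writer implementing the trait as defined above (the `ChecksummingWriter` type is hypothetical):

```rust
use std::io::{self, Write};

/// Hypothetical writer that appends a running checksum as a trailer,
/// written exactly once when the caller declares it is done writing.
struct ChecksummingWriter<W: Write> {
    inner: W,
    crc: u32,
}

impl<W: Write> Write for ChecksummingWriter<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        // Update the (toy) checksum over every byte written.
        self.crc = buf.iter().fold(self.crc, |acc, &b| acc.wrapping_add(b as u32));
        self.inner.write(buf)
    }
    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}

impl<W: Write> TerminatingWrite for ChecksummingWriter<W> {
    fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
        // Reached only via `terminate()`, so the trailer is written once.
        self.inner.write_all(&self.crc.to_le_bytes())?;
        self.inner.flush()
    }
}
```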
In `doc/src/SUMMARY.md`, the new chapter is linked:

```diff
@@ -7,6 +7,7 @@
 - [Segments](./basis.md)
 - [Defining your schema](./schema.md)
 - [Facetting](./facetting.md)
+- [Index Sorting](./index_sorting.md)
 - [Innerworkings](./innerworkings.md)
 - [Inverted index](./inverted_index.md)
 - [Best practise](./inverted_index.md)
```
61
doc/src/index_sorting.md
Normal file
61
doc/src/index_sorting.md
Normal file
@@ -0,0 +1,61 @@
|
||||
|
||||
- [Index Sorting](#index-sorting)
|
||||
+ [Why Sorting](#why-sorting)
|
||||
* [Compression](#compression)
|
||||
* [Top-N Optimization](#top-n-optimization)
|
||||
* [Pruning](#pruning)
|
||||
* [Other](#other)
|
||||
+ [Usage](#usage)
|
||||
|
||||
# Index Sorting
|
||||
|
||||
Tantivy allows you to sort the index according to a property.
|
||||
|
||||
## Why Sorting
|
||||
|
||||
Presorting an index has several advantages:
|
||||
|
||||
###### Compression
|
||||
|
||||
When data is sorted it is easier to compress the data. E.g. the numbers sequence [5, 2, 3, 1, 4] would be sorted to [1, 2, 3, 4, 5].
|
||||
If we apply delta encoding this list would be unsorted [5, -3, 1, -2, 3] vs. [1, 1, 1, 1, 1].
|
||||
Compression ratio is mainly affected on the fast field of the sorted property, every thing else is likely unaffected.
|
###### Top-N Optimization

When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
E.g. if the data is sorted by timestamp and we want the top-n newest docs containing a term, we can simply leverage the order of the doc ids.

Note: Tantivy 0.16 does not do this optimization yet.

###### Pruning

Let's say we want all documents matching the filter `>= 2010-08-11`. When the data is sorted, we could do a lookup in the fast field to find the docid range and use this range as the filter (see the sketch below).

Note: Tantivy 0.16 does not do this optimization yet.
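A rough sketch of the idea (hypothetical, since tantivy 0.16 does not implement it): with values sorted ascending, a binary search over the fast field yields the first matching docid, and everything after it matches too. `get_val` stands in for a fast field read.

```
// Returns the first doc whose value is >= threshold; the matching
// docid range is then `first..max_doc`.
fn first_doc_geq(get_val: impl Fn(u32) -> u64, max_doc: u32, threshold: u64) -> u32 {
    let (mut lo, mut hi) = (0u32, max_doc);
    while lo < hi {
        let mid = lo + (hi - lo) / 2;
        if get_val(mid) < threshold {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }
    lo
}
```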

###### Other?

In principle, many algorithms could exploit the monotonically increasing nature of the data (aggregations maybe?).

## Usage

Index sorting can be configured by setting [`sort_by_field`](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to an `IndexBuilder`. As of tantivy 0.16, only fast fields are allowed to be used.

```
let settings = IndexSettings {
    sort_by_field: Some(IndexSortByField {
        field: "intval".to_string(),
        order: Order::Desc,
    }),
    ..Default::default()
};
let mut index_builder = Index::builder().schema(schema);
index_builder = index_builder.settings(settings);
let index = index_builder.create_in_ram().unwrap();
```

## Implementation details

Sorting an index is applied in the serialization step. In general there are two serialization steps: [finishing a single segment](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).

In both cases we generate a docid mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fast fields, posting lists, norm field, facets).

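As an illustration only (the names below are not tantivy's internal API), applying such a docid mapping to a single fast field column boils down to a gather:

```
// new_to_old[new_doc_id] = old_doc_id
fn remap_column(values: &[u64], new_to_old: &[u32]) -> Vec<u64> {
    new_to_old
        .iter()
        .map(|&old_doc| values[old_doc as usize])
        .collect()
}
```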
@@ -96,7 +96,7 @@ fn main() -> tantivy::Result<()> {
     );

     // ... and add it to the `IndexWriter`.
-    index_writer.add_document(old_man_doc);
+    index_writer.add_document(old_man_doc)?;

     // For convenience, tantivy also comes with a macro to
     // reduce the boilerplate above.
@@ -110,7 +110,7 @@ fn main() -> tantivy::Result<()> {
         fresh and green with every spring, carrying in their lower leaf junctures the \
         debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
         limbs and branches that arch over the pool"
-    ));
+    ))?;

     // Multivalued fields just need to be repeated.
     index_writer.add_document(doc!(
@@ -120,7 +120,7 @@ fn main() -> tantivy::Result<()> {
         enterprise which you have regarded with such evil forebodings. I arrived here \
         yesterday, and my first task is to assure my dear sister of my welfare and \
         increasing confidence in the success of my undertaking."
-    ));
+    ))?;

     // This is an example, so we will only index 3 documents
     // here. You can check out tantivy's tutorial to index

@@ -86,12 +86,10 @@ impl Collector for StatsCollector {

     fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
         let mut stats = Stats::default();
-        for segment_stats_opt in segment_stats {
-            if let Some(segment_stats) = segment_stats_opt {
-                stats.count += segment_stats.count;
-                stats.sum += segment_stats.sum;
-                stats.squared_sum += segment_stats.squared_sum;
-            }
+        for segment_stats in segment_stats.into_iter().flatten() {
+            stats.count += segment_stats.count;
+            stats.sum += segment_stats.sum;
+            stats.squared_sum += segment_stats.squared_sum;
         }
         Ok(stats.non_zero_count())
     }
@@ -139,7 +137,7 @@ fn main() -> tantivy::Result<()> {
     //
     // Let's index a bunch of fake documents for the sake of
     // this example.
-    let index = Index::create_in_ram(schema.clone());
+    let index = Index::create_in_ram(schema);

     let mut index_writer = index.writer(50_000_000)?;
     index_writer.add_document(doc!(
@@ -147,23 +145,23 @@ fn main() -> tantivy::Result<()> {
         product_description => "While it is ok for short distance travel, this broom \
         was designed for quidditch. It will up your game.",
         price => 30_200u64
-    ));
+    ))?;
     index_writer.add_document(doc!(
         product_name => "Turbulobroom",
         product_description => "You might have heard of this broom before: it is the sponsor of the Wales team.\
         You'll enjoy its sharp turns, and rapid acceleration",
         price => 29_240u64
-    ));
+    ))?;
     index_writer.add_document(doc!(
         product_name => "Broomio",
         product_description => "Great value for the price. This broom is a market favorite",
         price => 21_240u64
-    ));
+    ))?;
     index_writer.add_document(doc!(
         product_name => "Whack a Mole",
         product_description => "Prime quality bat.",
         price => 5_200u64
-    ));
+    ))?;
     index_writer.commit()?;

     let reader = index.reader()?;

@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
         title => "The Old Man and the Sea",
         body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
         he had gone eighty-four days now without taking a fish."
-    ));
+    ))?;
     index_writer.add_document(doc!(
         title => "Of Mice and Men",
         body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
@@ -79,14 +79,14 @@ fn main() -> tantivy::Result<()> {
         fresh and green with every spring, carrying in their lower leaf junctures the
         debris of the winter’s flooding; and sycamores with mottled, white, recumbent
         limbs and branches that arch over the pool"#
-    ));
+    ))?;
     index_writer.add_document(doc!(
         title => "Frankenstein",
         body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
         enterprise which you have regarded with such evil forebodings. I arrived here
         yesterday, and my first task is to assure my dear sister of my welfare and
         increasing confidence in the success of my undertaking."#
-    ));
+    ))?;
     index_writer.commit()?;

     let reader = index.reader()?;

@@ -76,15 +76,15 @@ fn main() -> tantivy::Result<()> {
     index_writer.add_document(doc!(
         isbn => "978-0099908401",
         title => "The old Man and the see"
-    ));
+    ))?;
     index_writer.add_document(doc!(
         isbn => "978-0140177398",
         title => "Of Mice and Men",
-    ));
+    ))?;
     index_writer.add_document(doc!(
         title => "Frankentein", //< Oops there is a typo here.
         isbn => "978-9176370711",
-    ));
+    ))?;
     index_writer.commit()?;
     let reader = index.reader()?;

@@ -122,7 +122,7 @@ fn main() -> tantivy::Result<()> {
     index_writer.add_document(doc!(
         title => "Frankenstein",
         isbn => "978-9176370711",
-    ));
+    ))?;

     // You are guaranteed that your clients will only observe your index in
     // the state it was in after a commit.

@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {

     let name = schema_builder.add_text_field("felin_name", TEXT | STORED);
     // this is our faceted field: its scientific classification
-    let classification = schema_builder.add_facet_field("classification", INDEXED);
+    let classification = schema_builder.add_facet_field("classification", FacetOptions::default());

     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);
@@ -35,35 +35,35 @@ fn main() -> tantivy::Result<()> {
     index_writer.add_document(doc!(
         name => "Cat",
         classification => Facet::from("/Felidae/Felinae/Felis")
-    ));
+    ))?;
     index_writer.add_document(doc!(
         name => "Canada lynx",
         classification => Facet::from("/Felidae/Felinae/Lynx")
-    ));
+    ))?;
     index_writer.add_document(doc!(
         name => "Cheetah",
         classification => Facet::from("/Felidae/Felinae/Acinonyx")
-    ));
+    ))?;
     index_writer.add_document(doc!(
         name => "Tiger",
         classification => Facet::from("/Felidae/Pantherinae/Panthera")
-    ));
+    ))?;
     index_writer.add_document(doc!(
         name => "Lion",
         classification => Facet::from("/Felidae/Pantherinae/Panthera")
-    ));
+    ))?;
     index_writer.add_document(doc!(
         name => "Jaguar",
         classification => Facet::from("/Felidae/Pantherinae/Panthera")
-    ));
+    ))?;
     index_writer.add_document(doc!(
         name => "Sunda clouded leopard",
         classification => Facet::from("/Felidae/Pantherinae/Neofelis")
-    ));
+    ))?;
     index_writer.add_document(doc!(
         name => "Fossa",
         classification => Facet::from("/Eupleridae/Cryptoprocta")
-    ));
+    ))?;
     index_writer.commit()?;

     let reader = index.reader()?;

@@ -9,10 +9,10 @@ fn main() -> tantivy::Result<()> {
     let mut schema_builder = Schema::builder();

     let title = schema_builder.add_text_field("title", STORED);
-    let ingredient = schema_builder.add_facet_field("ingredient", INDEXED);
+    let ingredient = schema_builder.add_facet_field("ingredient", FacetOptions::default());

     let schema = schema_builder.build();
-    let index = Index::create_in_ram(schema.clone());
+    let index = Index::create_in_ram(schema);

     let mut index_writer = index.writer(30_000_000)?;

@@ -20,14 +20,14 @@ fn main() -> tantivy::Result<()> {
         title => "Fried egg",
         ingredient => Facet::from("/ingredient/egg"),
         ingredient => Facet::from("/ingredient/oil"),
-    ));
+    ))?;
     index_writer.add_document(doc!(
         title => "Scrambled egg",
         ingredient => Facet::from("/ingredient/egg"),
         ingredient => Facet::from("/ingredient/butter"),
         ingredient => Facet::from("/ingredient/milk"),
         ingredient => Facet::from("/ingredient/salt"),
-    ));
+    ))?;
     index_writer.add_document(doc!(
         title => "Egg rolls",
         ingredient => Facet::from("/ingredient/egg"),
@@ -36,7 +36,7 @@ fn main() -> tantivy::Result<()> {
         ingredient => Facet::from("/ingredient/oil"),
         ingredient => Facet::from("/ingredient/tortilla-wrap"),
         ingredient => Facet::from("/ingredient/mushroom"),
-    ));
+    ))?;
     index_writer.commit()?;

     let reader = index.reader()?;
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
     let query = BooleanQuery::new_multiterms_query(
         facets
             .iter()
-            .map(|key| Term::from_facet(ingredient, &key))
+            .map(|key| Term::from_facet(ingredient, key))
             .collect(),
     );
     let top_docs_by_custom_score =

@@ -7,7 +7,7 @@ use tantivy::query::RangeQuery;
 use tantivy::schema::{Schema, INDEXED};
 use tantivy::{doc, Index, Result};

-fn run() -> Result<()> {
+fn main() -> Result<()> {
     // For the sake of simplicity, this schema will only have 1 field
     let mut schema_builder = Schema::builder();

@@ -19,7 +19,7 @@ fn run() -> Result<()> {
     {
         let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
         for year in 1950u64..2019u64 {
-            index_writer.add_document(doc!(year_field => year));
+            index_writer.add_document(doc!(year_field => year))?;
         }
         index_writer.commit()?;
         // The index will be a range of years
@@ -33,7 +33,3 @@ fn run() -> Result<()> {
     assert_eq!(num_60s_books, 10);
     Ok(())
 }
-
-fn main() {
-    run().unwrap()
-}

@@ -22,12 +22,12 @@ fn main() -> tantivy::Result<()> {
     let title = schema_builder.add_text_field("title", TEXT | STORED);
     let schema = schema_builder.build();

-    let index = Index::create_in_ram(schema.clone());
+    let index = Index::create_in_ram(schema);

     let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
-    index_writer.add_document(doc!(title => "The Old Man and the Sea"));
-    index_writer.add_document(doc!(title => "Of Mice and Men"));
-    index_writer.add_document(doc!(title => "The modern Promotheus"));
+    index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
+    index_writer.add_document(doc!(title => "Of Mice and Men"))?;
+    index_writer.add_document(doc!(title => "The modern Promotheus"))?;
     index_writer.commit()?;

     let reader = index.reader()?;

@@ -29,7 +29,7 @@ use std::sync::{Arc, RwLock};
 use std::thread;
 use std::time::Duration;
 use tantivy::schema::{Schema, STORED, TEXT};
-use tantivy::{doc, Index, IndexWriter, Opstamp};
+use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError};

 fn main() -> tantivy::Result<()> {
     // # Defining the schema
@@ -59,10 +59,11 @@ fn main() -> tantivy::Result<()> {
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
-            ));
+            ))?;
             println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
             thread::sleep(Duration::from_millis(20));
         }
+        Result::<(), TantivyError>::Ok(())
     });

     // # Second indexing thread.
@@ -78,11 +79,12 @@ fn main() -> tantivy::Result<()> {
                 index_writer_rlock.add_document(doc!(
                     title => "Manufacturing consent",
                     body => "Some great book description..."
-                ))
+                ))?
             };
             println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
             thread::sleep(Duration::from_millis(10));
         }
+        Result::<(), TantivyError>::Ok(())
     });

     // # In the main thread, we commit 10 times, once every 500ms.
@@ -90,7 +92,7 @@ fn main() -> tantivy::Result<()> {
         let opstamp: Opstamp = {
             // Committing or rolling back, on the other hand, requires a write lock. This will block other threads.
             let mut index_writer_wlock = index_writer.write().unwrap();
-            index_writer_wlock.commit().unwrap()
+            index_writer_wlock.commit()?
         };
         println!("committed with opstamp {}", opstamp);
         thread::sleep(Duration::from_millis(500));

@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
     let old_man_doc = doc!(title => title_tok, body => body_tok);

     // ... now let's just add it to the IndexWriter
-    index_writer.add_document(old_man_doc);
+    index_writer.add_document(old_man_doc)?;

     // Pretokenized text can also be fed as JSON
     let short_man_json = r#"{
@@ -82,9 +82,9 @@ fn main() -> tantivy::Result<()> {
         }]
     }"#;

-    let short_man_doc = schema.parse_document(&short_man_json)?;
+    let short_man_doc = schema.parse_document(short_man_json)?;

-    index_writer.add_document(short_man_doc);
+    index_writer.add_document(short_man_doc)?;

     // Let's commit changes
     index_writer.commit()?;
@@ -106,9 +106,7 @@ fn main() -> tantivy::Result<()> {
         IndexRecordOption::Basic,
     );

-    let (top_docs, count) = searcher
-        .search(&query, &(TopDocs::with_limit(2), Count))
-        .unwrap();
+    let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;

     assert_eq!(count, 2);

@@ -129,9 +127,7 @@ fn main() -> tantivy::Result<()> {
         IndexRecordOption::Basic,
     );

-    let (_top_docs, count) = searcher
-        .search(&query, &(TopDocs::with_limit(2), Count))
-        .unwrap();
+    let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;

     assert_eq!(count, 0);

@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
     let schema = schema_builder.build();

     // # Indexing documents
-    let index = Index::create_in_dir(&index_path, schema.clone())?;
+    let index = Index::create_in_dir(&index_path, schema)?;

     let mut index_writer = index.writer(50_000_000)?;

@@ -40,7 +40,7 @@ fn main() -> tantivy::Result<()> {
         fresh and green with every spring, carrying in their lower leaf junctures the \
         debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
         limbs and branches that arch over the pool"
-    ));
+    ))?;
     // ...
     index_writer.commit()?;

@@ -70,13 +70,13 @@ fn highlight(snippet: Snippet) -> String {
     let mut start_from = 0;

     for fragment_range in snippet.highlighted() {
-        result.push_str(&snippet.fragments()[start_from..fragment_range.start]);
+        result.push_str(&snippet.fragment()[start_from..fragment_range.start]);
         result.push_str(" --> ");
-        result.push_str(&snippet.fragments()[fragment_range.clone()]);
+        result.push_str(&snippet.fragment()[fragment_range.clone()]);
         result.push_str(" <-- ");
         start_from = fragment_range.end;
     }

-    result.push_str(&snippet.fragments()[start_from..]);
+    result.push_str(&snippet.fragment()[start_from..]);
     result
 }

@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
         title => "The Old Man and the Sea",
         body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
         he had gone eighty-four days now without taking a fish."
-    ));
+    ))?;

     index_writer.add_document(doc!(
         title => "Of Mice and Men",
@@ -80,7 +80,7 @@ fn main() -> tantivy::Result<()> {
         fresh and green with every spring, carrying in their lower leaf junctures the \
         debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
         limbs and branches that arch over the pool"
-    ));
+    ))?;

     index_writer.add_document(doc!(
         title => "Frankenstein",
@@ -88,7 +88,7 @@ fn main() -> tantivy::Result<()> {
         enterprise which you have regarded with such evil forebodings. I arrived here \
         yesterday, and my first task is to assure my dear sister of my welfare and \
         increasing confidence in the success of my undertaking."
-    ));
+    ))?;

     index_writer.commit()?;

223
examples/warmer.rs
Normal file
@@ -0,0 +1,223 @@
use std::cmp::Reverse;
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock, Weak};

use tantivy::collector::TopDocs;
use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::{Field, Schema, FAST, TEXT};
use tantivy::{doc, DocAddress, DocId, Index, IndexReader, SegmentReader, TrackedObject};
use tantivy::{Opstamp, Searcher, SearcherGeneration, SegmentId, Warmer};

// This example shows how warmers can be used to
// load values from an external source using the Warmer API.
//
// In this example, we assume an e-commerce search engine.

type ProductId = u64;

/// Price
type Price = u32;

pub trait PriceFetcher: Send + Sync + 'static {
    fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}

struct DynamicPriceColumn {
    field: Field,
    price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
    price_fetcher: Box<dyn PriceFetcher>,
}

impl DynamicPriceColumn {
    pub fn with_product_id_field<T: PriceFetcher>(field: Field, price_fetcher: T) -> Self {
        DynamicPriceColumn {
            field,
            price_cache: Default::default(),
            price_fetcher: Box::new(price_fetcher),
        }
    }

    pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
        let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
        self.price_cache.read().unwrap().get(&segment_key).cloned()
    }
}
impl Warmer for DynamicPriceColumn {
    fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
        for segment in searcher.segment_readers() {
            let key = (segment.segment_id(), segment.delete_opstamp());
            let product_id_reader = segment.fast_fields().u64(self.field)?;
            let product_ids: Vec<ProductId> = segment
                .doc_ids_alive()
                .map(|doc| product_id_reader.get(doc))
                .collect();
            let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
            let mut price_vals: Vec<Price> = Vec::new();
            for doc in 0..segment.max_doc() {
                if segment.is_deleted(doc) {
                    price_vals.push(0);
                } else {
                    price_vals.push(prices_it.next().unwrap())
                }
            }
            self.price_cache
                .write()
                .unwrap()
                .insert(key, Arc::new(price_vals));
        }
        Ok(())
    }

    fn garbage_collect(&self, live_generations: &[TrackedObject<SearcherGeneration>]) {
        let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
            live_generations
                .iter()
                .flat_map(|gen| gen.segments())
                .map(|(&segment_id, &opstamp)| (segment_id, opstamp))
                .collect();
        let mut price_cache_wrt = self.price_cache.write().unwrap();
        // Drain would be nicer here.
        // Retain only the entries that still belong to a live searcher generation.
        *price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
            .into_iter()
            .filter(|(seg_id_and_op, _)| live_segment_id_and_delete_ops.contains(seg_id_and_op))
            .collect();
    }
}

/// For the sake of this example, the table is just an editable HashMap behind a RwLock.
/// It represents a mapping (ProductId -> Price).
///
/// In practice, it could be fetching things from an external service, like a SQL table.
#[derive(Default, Clone)]
pub struct ExternalPriceTable {
    prices: Arc<RwLock<HashMap<ProductId, Price>>>,
}

impl ExternalPriceTable {
    pub fn update_price(&self, product_id: ProductId, price: Price) {
        let mut prices_wrt = self.prices.write().unwrap();
        prices_wrt.insert(product_id, price);
    }
}

impl PriceFetcher for ExternalPriceTable {
    fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
        let prices_read = self.prices.read().unwrap();
        product_ids
            .iter()
            .map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
            .collect()
    }
}

fn main() -> tantivy::Result<()> {
    // Declaring our schema.
    let mut schema_builder = Schema::builder();
    // The product id is assumed to be a primary id for our external price source.
    let product_id = schema_builder.add_u64_field("product_id", FAST);
    let text = schema_builder.add_text_field("text", TEXT);
    let schema: Schema = schema_builder.build();

    let price_table = ExternalPriceTable::default();
    let price_dynamic_column = Arc::new(DynamicPriceColumn::with_product_id_field(
        product_id,
        price_table.clone(),
    ));
    price_table.update_price(OLIVE_OIL, 12);
    price_table.update_price(GLOVES, 13);
    price_table.update_price(SNEAKERS, 80);

    const OLIVE_OIL: ProductId = 323423;
    const GLOVES: ProductId = 3966623;
    const SNEAKERS: ProductId = 23222;

    let index = Index::create_in_ram(schema);
    let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
    writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
    writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
    writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
    writer.commit()?;

    let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
        &(price_dynamic_column.clone() as Arc<dyn Warmer>),
    )];
    let reader: IndexReader = index
        .reader_builder()
        .warmers(warmers)
        .num_searchers(1)
        .try_into()?;
    reader.reload()?;

    let query_parser = QueryParser::for_index(&index, vec![text]);
    let query = query_parser.parse_query("cooking")?;

    let searcher = reader.searcher();
    let score_by_price = move |segment_reader: &SegmentReader| {
        let price = price_dynamic_column
            .price_for_segment(segment_reader)
            .unwrap();
        move |doc_id: DocId| Reverse(price[doc_id as usize])
    };

    let most_expensive_first = TopDocs::with_limit(10).custom_score(score_by_price);

    let hits = searcher.search(&query, &most_expensive_first)?;
    assert_eq!(
        &hits,
        &[
            (
                Reverse(12u32),
                DocAddress {
                    segment_ord: 0,
                    doc_id: 0u32
                }
            ),
            (
                Reverse(13u32),
                DocAddress {
                    segment_ord: 0,
                    doc_id: 1u32
                }
            ),
        ]
    );

    // Olive oil just got more expensive!
    price_table.update_price(OLIVE_OIL, 15);

    // The price updates are directly reflected on `reload`.
    //
    // Be careful here though!...
    // You may have spotted that we are still using the same `Searcher`.
    //
    // It is up to the `Warmer` implementer to decide how
    // to control this behavior.

    reader.reload()?;

    let hits_with_new_prices = searcher.search(&query, &most_expensive_first)?;
    assert_eq!(
        &hits_with_new_prices,
        &[
            (
                Reverse(13u32),
                DocAddress {
                    segment_ord: 0,
                    doc_id: 1u32
                }
            ),
            (
                Reverse(15u32),
                DocAddress {
                    segment_ord: 0,
                    doc_id: 0u32
                }
            ),
        ]
    );

    Ok(())
}
@@ -1,4 +1,3 @@
-use tantivy;
 use tantivy::schema::*;

 // # Document from json
@@ -22,7 +21,7 @@ fn main() -> tantivy::Result<()> {
     }"#;

     // We can parse our document
-    let _mice_and_men_doc = schema.parse_document(&mice_and_men_doc_json)?;
+    let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;

     // Multi-valued fields are allowed, they are
     // expressed in JSON by an array.
@@ -31,7 +30,7 @@ fn main() -> tantivy::Result<()> {
         "title": ["Frankenstein", "The Modern Prometheus"],
         "year": 1818
     }"#;
-    let _frankenstein_doc = schema.parse_document(&frankenstein_json)?;
+    let _frankenstein_doc = schema.parse_document(frankenstein_json)?;

     // Note that the schema is saved in your index directory.
     //

24
fastfield_codecs/Cargo.toml
Normal file
@@ -0,0 +1,24 @@
[package]
name = "fastfield_codecs"
version = "0.1.0"
authors = ["Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
description = "Fast field codecs used by tantivy"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
common = { version = "0.1", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version = "0.1.1", path = "../bitpacker/" }
prettytable-rs = { version = "0.8.0", optional = true }
rand = { version = "0.8.3", optional = true }

[dev-dependencies]
more-asserts = "0.2.1"
rand = "0.8.3"

[features]
bin = ["prettytable-rs", "rand"]
default = ["bin"]

68
fastfield_codecs/README.md
Normal file
@@ -0,0 +1,68 @@

# Fast Field Codecs

This crate contains various fast field codecs, used to compress/decompress fast field data in tantivy.

## Contributing

Contributing is pretty straightforward. Since the bitpacked codec is the simplest compressor, you can check it for reference.

A codec needs to implement 2 traits (see the skeleton below):

- A reader implementing `FastFieldCodecReader` to read the codec.
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.

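A rough skeleton of the two traits (the `MyCodec*` names are placeholders; see `src/bitpacked.rs` for a complete implementation):

```
use std::io::{self, Write};
use fastfield_codecs::{
    FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats,
};

struct MyCodecReader;

impl FastFieldCodecReader for MyCodecReader {
    fn open_from_bytes(_bytes: &[u8]) -> io::Result<Self> {
        Ok(MyCodecReader) // parse the footer here
    }
    fn get_u64(&self, _doc: u64, _data: &[u8]) -> u64 {
        todo!("decode a single value")
    }
    fn min_value(&self) -> u64 {
        todo!()
    }
    fn max_value(&self) -> u64 {
        todo!()
    }
}

struct MyCodecSerializer;

impl FastFieldCodecSerializer for MyCodecSerializer {
    const NAME: &'static str = "MyCodec";
    const ID: u8 = 42; // must not collide with existing codec ids

    fn is_applicable(_accessor: &impl FastFieldDataAccess, _stats: FastFieldStats) -> bool {
        true // can this codec handle the data at all?
    }
    fn estimate(_accessor: &impl FastFieldDataAccess, _stats: FastFieldStats) -> f32 {
        1.0 // estimated ratio vs. uncompressed 64 bit data
    }
    fn serialize(
        _write: &mut impl Write,
        _accessor: &impl FastFieldDataAccess,
        _stats: FastFieldStats,
        _data_iter: impl Iterator<Item = u64>,
        _data_iter1: impl Iterator<Item = u64>,
    ) -> io::Result<()> {
        todo!("encode all values, then a footer")
    }
}
```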
### Tests

Once the traits are implemented, test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).

Make sure to add the codec to `main.rs`, which tests the compression ratio and estimation against different data sets. You can run it with:
```
cargo run --features bin
```

### TODO
- Add real-world data sets to the comparison
- Add a codec to cover sparse data sets


### Codec Comparison
```
+----------------------------------+-------------------+------------------------+
|                                  | Compression Ratio | Compression Estimation |
+----------------------------------+-------------------+------------------------+
| Autoincrement                    |                   |                        |
+----------------------------------+-------------------+------------------------+
| LinearInterpol                   | 0.000039572664    | 0.000004396963         |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol              | 0.1477348         | 0.17275847             |
+----------------------------------+-------------------+------------------------+
| Bitpacked                        | 0.28126493        | 0.28125                |
+----------------------------------+-------------------+------------------------+
| Monotonically increasing concave |                   |                        |
+----------------------------------+-------------------+------------------------+
| LinearInterpol                   | 0.25003937        | 0.26562938             |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol              | 0.190665          | 0.1883836              |
+----------------------------------+-------------------+------------------------+
| Bitpacked                        | 0.31251436        | 0.3125                 |
+----------------------------------+-------------------+------------------------+
| Monotonically increasing convex  |                   |                        |
+----------------------------------+-------------------+------------------------+
| LinearInterpol                   | 0.25003937        | 0.28125438             |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol              | 0.18676           | 0.2040086              |
+----------------------------------+-------------------+------------------------+
| Bitpacked                        | 0.31251436        | 0.3125                 |
+----------------------------------+-------------------+------------------------+
| Almost monotonically increasing  |                   |                        |
+----------------------------------+-------------------+------------------------+
| LinearInterpol                   | 0.14066513        | 0.1562544              |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol              | 0.16335973        | 0.17275847             |
+----------------------------------+-------------------+------------------------+
| Bitpacked                        | 0.28126493        | 0.28125                |
+----------------------------------+-------------------+------------------------+

```
108
fastfield_codecs/benches/bench.rs
Normal file
@@ -0,0 +1,108 @@
#![feature(test)]

extern crate test;

#[cfg(test)]
mod tests {
    use fastfield_codecs::{
        bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
        linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
        multilinearinterpol::{
            MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
        },
        *,
    };

    fn get_data() -> Vec<u64> {
        let mut data: Vec<_> = (100..55000_u64)
            .map(|num| num + rand::random::<u8>() as u64)
            .collect();
        data.push(99_000);
        data.insert(1000, 2000);
        data.insert(2000, 100);
        data.insert(3000, 4100);
        data.insert(4000, 100);
        data.insert(5000, 800);
        data
    }

    fn value_iter() -> impl Iterator<Item = u64> {
        0..20_000
    }
    fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
        b: &mut Bencher,
        data: &[u64],
    ) {
        let mut bytes = vec![];
        S::serialize(
            &mut bytes,
            &data,
            stats_from_vec(data),
            data.iter().cloned(),
            data.iter().cloned(),
        )
        .unwrap();
        let reader = R::open_from_bytes(&bytes).unwrap();
        b.iter(|| {
            for pos in value_iter() {
                reader.get_u64(pos as u64, &bytes);
            }
        });
    }
    fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) {
        let mut bytes = vec![];
        b.iter(|| {
            // Clear the buffer so it does not keep growing across iterations.
            bytes.clear();
            S::serialize(
                &mut bytes,
                &data,
                stats_from_vec(data),
                data.iter().cloned(),
                data.iter().cloned(),
            )
            .unwrap();
        });
    }

    use test::Bencher;
    #[bench]
    fn bench_fastfield_bitpack_create(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_create::<BitpackedFastFieldSerializer>(b, &data);
    }
    #[bench]
    fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
    }
    #[bench]
    fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
    }
    #[bench]
    fn bench_fastfield_bitpack_get(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
    }
    #[bench]
    fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
    }
    #[bench]
    fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
            b, &data,
        );
    }
    pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
        let min_value = data.iter().cloned().min().unwrap_or(0);
        let max_value = data.iter().cloned().max().unwrap_or(0);
        FastFieldStats {
            min_value,
            max_value,
            num_vals: data.len() as u64,
        }
    }
}
176
fastfield_codecs/src/bitpacked.rs
Normal file
@@ -0,0 +1,176 @@
use crate::FastFieldCodecReader;
use crate::FastFieldCodecSerializer;
use crate::FastFieldDataAccess;
use crate::FastFieldStats;
use common::BinarySerializable;
use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;

use tantivy_bitpacker::BitUnpacker;

/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedFastFieldReader {
    bit_unpacker: BitUnpacker,
    pub min_value_u64: u64,
    pub max_value_u64: u64,
}

impl FastFieldCodecReader for BitpackedFastFieldReader {
    /// Opens a fast field given a file.
    fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
        let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
        let min_value = u64::deserialize(&mut footer)?;
        let amplitude = u64::deserialize(&mut footer)?;
        let max_value = min_value + amplitude;
        let num_bits = compute_num_bits(amplitude);
        let bit_unpacker = BitUnpacker::new(num_bits);
        Ok(BitpackedFastFieldReader {
            min_value_u64: min_value,
            max_value_u64: max_value,
            bit_unpacker,
        })
    }
    #[inline]
    fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
        self.min_value_u64 + self.bit_unpacker.get(doc, data)
    }
    #[inline]
    fn min_value(&self) -> u64 {
        self.min_value_u64
    }
    #[inline]
    fn max_value(&self) -> u64 {
        self.max_value_u64
    }
}
pub struct BitpackedFastFieldSerializerLegacy<'a, W: 'a + Write> {
    bit_packer: BitPacker,
    write: &'a mut W,
    min_value: u64,
    amplitude: u64,
    num_bits: u8,
}

impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> {
    /// Creates a new fast field serializer.
    ///
    /// The serializer in fact encodes the values by bitpacking
    /// `(val - min_value)`.
    ///
    /// It requires a `min_value` and a `max_value` to compute
    /// the minimum number of bits required to encode values.
    pub fn open(
        write: &'a mut W,
        min_value: u64,
        max_value: u64,
    ) -> io::Result<BitpackedFastFieldSerializerLegacy<'a, W>> {
        assert!(min_value <= max_value);
        let amplitude = max_value - min_value;
        let num_bits = compute_num_bits(amplitude);
        let bit_packer = BitPacker::new();
        Ok(BitpackedFastFieldSerializerLegacy {
            bit_packer,
            write,
            min_value,
            amplitude,
            num_bits,
        })
    }
    /// Pushes a new value to the currently open u64 fast field.
    #[inline]
    pub fn add_val(&mut self, val: u64) -> io::Result<()> {
        let val_to_write: u64 = val - self.min_value;
        self.bit_packer
            .write(val_to_write, self.num_bits, &mut self.write)?;
        Ok(())
    }
    pub fn close_field(mut self) -> io::Result<()> {
        self.bit_packer.close(&mut self.write)?;
        self.min_value.serialize(&mut self.write)?;
        self.amplitude.serialize(&mut self.write)?;
        Ok(())
    }
}

pub struct BitpackedFastFieldSerializer {}

impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
    const NAME: &'static str = "Bitpacked";
    const ID: u8 = 1;
    /// Serializes data with the BitpackedFastFieldSerializer.
    ///
    /// The serializer in fact encodes the values by bitpacking
    /// `(val - min_value)`.
    ///
    /// It requires a `min_value` and a `max_value` to compute
    /// the minimum number of bits required to encode values.
    fn serialize(
        write: &mut impl Write,
        _fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
        data_iter: impl Iterator<Item = u64>,
        _data_iter1: impl Iterator<Item = u64>,
    ) -> io::Result<()> {
        let mut serializer =
            BitpackedFastFieldSerializerLegacy::open(write, stats.min_value, stats.max_value)?;

        for val in data_iter {
            serializer.add_val(val)?;
        }
        serializer.close_field()?;

        Ok(())
    }
    fn is_applicable(
        _fastfield_accessor: &impl FastFieldDataAccess,
        _stats: FastFieldStats,
    ) -> bool {
        true
    }
    fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
        let amplitude = stats.max_value - stats.min_value;
        let num_bits = compute_num_bits(amplitude);
        let num_bits_uncompressed = 64;
        num_bits as f32 / num_bits_uncompressed as f32
    }
}
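// Worked example for the estimate above (illustrative numbers): for values
// in the range [1_000, 1_017] the amplitude is 17, so compute_num_bits(17)
// returns 5 bits per value, giving an estimated ratio of 5 / 64 ~= 0.078 of
// the uncompressed size.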

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::get_codec_test_data_sets;

    fn create_and_validate(data: &[u64], name: &str) {
        crate::tests::create_and_validate::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(
            data, name,
        );
    }

    #[test]
    fn test_with_codec_data_sets() {
        let data_sets = get_codec_test_data_sets();
        for (mut data, name) in data_sets {
            create_and_validate(&data, name);
            data.reverse();
            create_and_validate(&data, name);
        }
    }

    #[test]
    fn bitpacked_fast_field_rand() {
        for _ in 0..500 {
            let mut data = (0..1 + rand::random::<u8>() as usize)
                .map(|_| rand::random::<i64>() as u64 / 2)
                .collect::<Vec<_>>();
            create_and_validate(&data, "rand");

            data.reverse();
            create_and_validate(&data, "rand");
        }
    }
}
225
fastfield_codecs/src/lib.rs
Normal file
@@ -0,0 +1,225 @@
#[cfg(test)]
#[macro_use]
extern crate more_asserts;

use std::io;
use std::io::Write;

pub mod bitpacked;
pub mod linearinterpol;
pub mod multilinearinterpol;

pub trait FastFieldCodecReader: Sized {
    /// Reads the metadata and returns the CodecReader.
    fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;

    fn get_u64(&self, doc: u64, data: &[u8]) -> u64;

    fn min_value(&self) -> u64;
    fn max_value(&self) -> u64;
}

/// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldCodecSerializer {
    /// A codec needs to provide a unique name and id, which is
    /// used for debugging and de/serialization.
    const NAME: &'static str;
    const ID: u8;

    /// Checks if the codec is able to compress the data.
    fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool;

    /// Returns an estimate of the compression ratio.
    /// The baseline is uncompressed 64bit data.
    ///
    /// It could make sense to also return a value representing
    /// computational complexity.
    fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;

    /// Serializes the data using the serializer into write.
    /// There are multiple iterators, in case the codec needs to read the data multiple times.
    /// The iterators should be preferred over using fastfield_accessor for performance reasons.
    fn serialize(
        write: &mut impl Write,
        fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
        data_iter: impl Iterator<Item = u64>,
        data_iter1: impl Iterator<Item = u64>,
    ) -> io::Result<()>;
}

/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
    /// Returns the value associated to the given position.
    ///
    /// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
    ///
    /// # Panics
    ///
    /// May panic if `position` is greater than the index.
    fn get_val(&self, position: u64) -> u64;
}

#[derive(Debug, Clone)]
pub struct FastFieldStats {
    pub min_value: u64,
    pub max_value: u64,
    pub num_vals: u64,
}

impl<'a> FastFieldDataAccess for &'a [u64] {
    fn get_val(&self, position: u64) -> u64 {
        self[position as usize]
    }
}

impl FastFieldDataAccess for Vec<u64> {
    fn get_val(&self, position: u64) -> u64 {
        self[position as usize]
    }
}
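// Illustration (not an API of this crate): with the traits above, a caller
// can pick the codec that promises the smallest `estimate` before
// serializing. A minimal sketch, assuming `data: &[u64]` and a precomputed
// `stats: FastFieldStats`:
//
// let mut best = (
//     BitpackedFastFieldSerializer::estimate(&data, stats.clone()),
//     BitpackedFastFieldSerializer::NAME,
// );
// if LinearInterpolFastFieldSerializer::is_applicable(&data, stats.clone()) {
//     let ratio = LinearInterpolFastFieldSerializer::estimate(&data, stats.clone());
//     if ratio < best.0 {
//         best = (ratio, LinearInterpolFastFieldSerializer::NAME);
//     }
// }
// // ... repeat for the other codecs, then serialize with the winner.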

#[cfg(test)]
mod tests {
    use crate::{
        bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
        linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
        multilinearinterpol::{
            MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
        },
    };

    pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
        data: &[u64],
        name: &str,
    ) -> (f32, f32) {
        if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
            return (f32::MAX, 0.0);
        }
        let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
        let mut out = vec![];
        S::serialize(
            &mut out,
            &data,
            crate::tests::stats_from_vec(data),
            data.iter().cloned(),
            data.iter().cloned(),
        )
        .unwrap();

        let reader = R::open_from_bytes(&out).unwrap();
        for (doc, orig_val) in data.iter().enumerate() {
            let val = reader.get_u64(doc as u64, &out);
            if val != *orig_val {
                panic!(
                    "val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
                    val, orig_val, name, data
                );
            }
        }
        let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
        (estimation, actual_compression)
    }
    pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
        let mut data_and_names = vec![];

        let data = (10..=20_u64).collect::<Vec<_>>();
        data_and_names.push((data, "simple monotonically increasing"));

        data_and_names.push((
            vec![5, 6, 7, 8, 9, 10, 99, 100],
            "offset in linear interpol",
        ));
        data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
        data_and_names.push((vec![10], "single value"));

        data_and_names
    }

    fn test_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>() {
        let codec_name = S::NAME;
        for (data, data_set_name) in get_codec_test_data_sets() {
            let (estimate, actual) =
                crate::tests::create_and_validate::<S, R>(&data, data_set_name);
            let result = if estimate == f32::MAX {
                "Disabled".to_string()
            } else {
                format!("Estimate {:?} Actual {:?} ", estimate, actual)
            };
            println!(
                "Codec {}, DataSet {}, {}",
                codec_name, data_set_name, result
            );
        }
    }
    #[test]
    fn test_codec_bitpacking() {
        test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
    }
    #[test]
    fn test_codec_interpolation() {
        test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
    }
    #[test]
    fn test_codec_multi_interpolation() {
        test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
    }

    use super::*;
    pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
        let min_value = data.iter().cloned().min().unwrap_or(0);
        let max_value = data.iter().cloned().max().unwrap_or(0);
        FastFieldStats {
            min_value,
            max_value,
            num_vals: data.len() as u64,
        }
    }

    #[test]
    fn estimation_good_interpolation_case() {
        let data = (10..=20000_u64).collect::<Vec<_>>();

        let linear_interpol_estimation =
            LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
        assert_le!(linear_interpol_estimation, 0.01);

        let multi_linear_interpol_estimation =
            MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
        assert_le!(multi_linear_interpol_estimation, 0.2);
        assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);

        let bitpacked_estimation =
            BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
        assert_le!(linear_interpol_estimation, bitpacked_estimation);
    }
    #[test]
    fn estimation_test_bad_interpolation_case() {
        let data = vec![200, 10, 10, 10, 10, 1000, 20];

        let linear_interpol_estimation =
            LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
        assert_le!(linear_interpol_estimation, 0.32);

        let bitpacked_estimation =
            BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
        assert_le!(bitpacked_estimation, linear_interpol_estimation);
    }
    #[test]
    fn estimation_test_bad_interpolation_case_monotonically_increasing() {
        let mut data = (200..=20000_u64).collect::<Vec<_>>();
        data.push(1_000_000);

        // In this case the linear interpolation can in fact not be worse than bitpacking,
        // but the estimator adds some threshold, which leads to an estimated worse behavior.
        let linear_interpol_estimation =
            LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
        assert_le!(linear_interpol_estimation, 0.35);

        let bitpacked_estimation =
            BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
        assert_le!(bitpacked_estimation, 0.32);
        assert_le!(bitpacked_estimation, linear_interpol_estimation);
    }
}
305
fastfield_codecs/src/linearinterpol.rs
Normal file
@@ -0,0 +1,305 @@
use crate::FastFieldCodecReader;
use crate::FastFieldCodecSerializer;
use crate::FastFieldDataAccess;
use crate::FastFieldStats;
use std::io::{self, Read, Write};
use std::ops::Sub;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;

use common::BinarySerializable;
use common::FixedSize;
use tantivy_bitpacker::BitUnpacker;

/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct LinearInterpolFastFieldReader {
    bit_unpacker: BitUnpacker,
    pub footer: LinearInterpolFooter,
    pub slope: f32,
}

#[derive(Clone, Debug)]
pub struct LinearInterpolFooter {
    pub relative_max_value: u64,
    pub offset: u64,
    pub first_val: u64,
    pub last_val: u64,
    pub num_vals: u64,
    pub min_value: u64,
    pub max_value: u64,
}

impl BinarySerializable for LinearInterpolFooter {
    fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
        self.relative_max_value.serialize(write)?;
        self.offset.serialize(write)?;
        self.first_val.serialize(write)?;
        self.last_val.serialize(write)?;
        self.num_vals.serialize(write)?;
        self.min_value.serialize(write)?;
        self.max_value.serialize(write)?;
        Ok(())
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<LinearInterpolFooter> {
        Ok(LinearInterpolFooter {
            relative_max_value: u64::deserialize(reader)?,
            offset: u64::deserialize(reader)?,
            first_val: u64::deserialize(reader)?,
            last_val: u64::deserialize(reader)?,
            num_vals: u64::deserialize(reader)?,
            min_value: u64::deserialize(reader)?,
            max_value: u64::deserialize(reader)?,
        })
    }
}

impl FixedSize for LinearInterpolFooter {
    const SIZE_IN_BYTES: usize = 56;
}

impl FastFieldCodecReader for LinearInterpolFastFieldReader {
    /// Opens a fast field given a file.
    fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
        let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
        let footer = LinearInterpolFooter::deserialize(&mut footer)?;
        let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);

        let num_bits = compute_num_bits(footer.relative_max_value);
        let bit_unpacker = BitUnpacker::new(num_bits);
        Ok(LinearInterpolFastFieldReader {
            bit_unpacker,
            footer,
            slope,
        })
    }
    #[inline]
    fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
        let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
        (calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
    }

    #[inline]
    fn min_value(&self) -> u64 {
        self.footer.min_value
    }
    #[inline]
    fn max_value(&self) -> u64 {
        self.footer.max_value
    }
}

/// Fastfield serializer which tries to guess values by linear interpolation
/// and stores the difference bitpacked.
pub struct LinearInterpolFastFieldSerializer {}

#[inline]
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
    if num_vals <= 1 {
        return 0.0;
    }
    // We calculate the slope with f64 high precision and use the result in lower precision f32.
    // This is done in order to handle estimations for very large values like i64::MAX.
    ((last_val as f64 - first_val as f64) / (num_vals - 1) as f64) as f32
}

#[inline]
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
    first_val + (pos as f32 * slope) as u64
}
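// Worked example (illustrative numbers): with first_val = 10, last_val = 20
// and num_vals = 11, get_slope returns (20 - 10) / (11 - 1) = 1.0, and the
// value at position 5 is predicted as 10 + 5 * 1.0 = 15. Only the deviation
// from this prediction (shifted by an offset) gets bitpacked below.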
||||
|
||||
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "LinearInterpol";
|
||||
const ID: u8 = 2;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
// calculate offset to ensure all values are positive
|
||||
let mut offset = 0;
|
||||
let mut rel_positive_max = 0;
|
||||
for (pos, actual_value) in data_iter1.enumerate() {
|
||||
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
|
||||
if calculated_value > actual_value {
|
||||
// negative value we need to apply an offset
|
||||
// we ignore negative values in the max value calculation, because negative values
|
||||
// will be offset to 0
|
||||
offset = offset.max(calculated_value - actual_value);
|
||||
} else {
|
||||
//positive value no offset reuqired
|
||||
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
|
||||
}
|
||||
}
|
||||
|
||||
// rel_positive_max will be adjusted by offset
|
||||
let relative_max_value = rel_positive_max + offset;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value);
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for (pos, val) in data_iter.enumerate() {
|
||||
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
|
||||
let diff = (val + offset) - calculated_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = LinearInterpolFooter {
|
||||
relative_max_value,
|
||||
offset,
|
||||
first_val,
|
||||
last_val,
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 3 {
|
||||
return false; //disable compressor for this case
|
||||
}
|
||||
// On serialisation the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algortihm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
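
    // The overflow guard above is easiest to see at the extreme: with min_value = 0 and
    // max_value = u64::MAX, the worst-case offset alone is u64::MAX, and
    // max_value + offset cannot be represented in a u64, so the codec opts out.
    // A quick illustration of the arithmetic the check performs:
    //
    //     assert!(u64::MAX.checked_add(u64::MAX).is_none());
    //     assert!(u64::MAX.checked_add(0).is_some());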
    /// Estimation for linear interpolation is hard because you don't know
    /// where the local maxima of the deviation from the interpolation are, and
    /// the offset to shift all values to >= 0 is also unknown.
    fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
        let first_val = fastfield_accessor.get_val(0);
        let last_val = fastfield_accessor.get_val(stats.num_vals - 1);
        let slope = get_slope(first_val, last_val, stats.num_vals);

        // let's sample at 0%, 5%, 10% .. 90%, 95%
        let num_vals = stats.num_vals as f32 / 100.0;
        let sample_positions = (0..20)
            .map(|pos| (num_vals * pos as f32 * 5.0) as usize)
            .collect::<Vec<_>>();

        let max_distance = sample_positions
            .iter()
            .map(|pos| {
                let calculated_value = get_calculated_value(first_val, *pos as u64, slope);
                let actual_value = fastfield_accessor.get_val(*pos as u64);
                distance(calculated_value, actual_value)
            })
            .max()
            .unwrap_or(0);

        // The assumption is that the sampled maximum is within 50% of the actual
        // max_distance.
        // It is multiplied by 2 because the interpolation line is expected to run as much
        // above the values as below, so the required offset would be about max_distance.
        let relative_max_value = (max_distance as f32 * 1.5) * 2.0;

        let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals
            // footer size, counted in bits
            + LinearInterpolFooter::SIZE_IN_BYTES as u64 * 8;
        let num_bits_uncompressed = 64 * stats.num_vals;
        num_bits as f32 / num_bits_uncompressed as f32
    }
}
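
// A back-of-the-envelope check of the estimate with illustrative numbers (not measured):
// if sampling finds max_distance = 100, the codec budgets for (100 * 1.5) * 2 = 300,
// i.e. compute_num_bits(300) = 9 bits per value, so for 1_000_000 values the ratio is
// roughly 9 / 64 ~= 0.14, plus a negligible footer term.
#[test]
fn estimate_arithmetic_sketch() {
    let (max_distance, num_vals) = (100u64, 1_000_000u64);
    let relative_max_value = (max_distance as f32 * 1.5) * 2.0; // 300.0
    let num_bits = compute_num_bits(relative_max_value as u64) as u64 * num_vals;
    let estimate = num_bits as f32 / (64 * num_vals) as f32;
    assert!((estimate - 9.0 / 64.0).abs() < 0.001);
}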

#[inline]
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
    if x < y {
        y - x
    } else {
        x - y
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::get_codec_test_data_sets;

    fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
        crate::tests::create_and_validate::<
            LinearInterpolFastFieldSerializer,
            LinearInterpolFastFieldReader,
        >(data, name)
    }

    #[test]
    fn test_compression() {
        let data = (10..=6_000_u64).collect::<Vec<_>>();
        let (estimate, actual_compression) =
            create_and_validate(&data, "simple monotonically large");

        assert!(actual_compression < 0.01);
        assert!(estimate < 0.01);
    }

    #[test]
    fn test_with_codec_data_sets() {
        let data_sets = get_codec_test_data_sets();
        for (mut data, name) in data_sets {
            create_and_validate(&data, name);
            data.reverse();
            create_and_validate(&data, name);
        }
    }
    #[test]
    fn linear_interpol_fast_field_test_large_amplitude() {
        let data = vec![
            i64::MAX as u64 / 2,
            i64::MAX as u64 / 3,
            i64::MAX as u64 / 2,
        ];

        create_and_validate(&data, "large amplitude");
    }
    #[test]
    fn linear_interpol_fast_concave_data() {
        let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
        create_and_validate(&data, "concave data");
    }
    #[test]
    fn linear_interpol_fast_convex_data() {
        let data = vec![0, 40, 60, 70, 75, 77];
        create_and_validate(&data, "convex data");
    }
    #[test]
    fn linear_interpol_fast_field_test_simple() {
        let data = (10..=20_u64).collect::<Vec<_>>();

        create_and_validate(&data, "simple monotonically");
    }

    #[test]
    fn linear_interpol_fast_field_rand() {
        for _ in 0..5000 {
            let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
            create_and_validate(&data, "random");

            data.reverse();
            create_and_validate(&data, "random");
        }
    }
}

fastfield_codecs/src/main.rs (new file, 127 lines)
@@ -0,0 +1,127 @@
#[macro_use]
extern crate prettytable;
use fastfield_codecs::{
    linearinterpol::LinearInterpolFastFieldSerializer,
    multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer,
    FastFieldStats,
};
use prettytable::{Cell, Row, Table};

fn main() {
    let mut table = Table::new();

    // Add the header row.
    table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);

    for (data, data_set_name) in get_codec_test_data_sets() {
        let mut results = vec![];
        let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
        results.push(res);
        let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
        results.push(res);
        let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
            &data,
        );
        results.push(res);

        //let best_estimation_codec = results
        //.iter()
        //.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
        //.unwrap();
        // Compare applicable codecs by their actual compression ratio (tuple index 2).
        let best_compression_ratio_codec = results
            .iter()
            .filter(|res| res.0)
            .min_by(|res1, res2| res1.2.partial_cmp(&res2.2).unwrap())
            .cloned()
            .unwrap();

        table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
        for (is_applicable, est, comp, name) in results {
            let (est_cell, ratio_cell) = if !is_applicable {
                ("Codec Disabled".to_string(), "".to_string())
            } else {
                (est.to_string(), comp.to_string())
            };
            #[allow(clippy::all)]
            let style = if comp == best_compression_ratio_codec.2 {
                "Fb"
            } else {
                ""
            };

            table.add_row(Row::new(vec![
                Cell::new(&name.to_string()).style_spec("bFg"),
                Cell::new(&ratio_cell).style_spec(style),
                Cell::new(&est_cell).style_spec(""),
            ]));
        }
    }

    table.printstd();
}

pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
    let mut data_and_names = vec![];

    let data = (1000..=200_000_u64).collect::<Vec<_>>();
    data_and_names.push((data, "Autoincrement"));

    let mut current_cumulative = 0;
    let data = (1..=200_000_u64)
        .map(|num| {
            let num = (num as f32 + num as f32).log10() as u64;
            current_cumulative += num;
            current_cumulative
        })
        .collect::<Vec<_>>();
    //let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
    data_and_names.push((data, "Monotonically increasing concave"));

    let mut current_cumulative = 0;
    let data = (1..=200_000_u64)
        .map(|num| {
            let num = (200_000.0 - num as f32).log10() as u64;
            current_cumulative += num;
            current_cumulative
        })
        .collect::<Vec<_>>();
    data_and_names.push((data, "Monotonically increasing convex"));

    let data = (1000..=200_000_u64)
        .map(|num| num + rand::random::<u8>() as u64)
        .collect::<Vec<_>>();
    data_and_names.push((data, "Almost monotonically increasing"));

    data_and_names
}

pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
    data: &[u64],
) -> (bool, f32, f32, &'static str) {
    let is_applicable = S::is_applicable(&data, stats_from_vec(data));
    if !is_applicable {
        return (false, 0.0, 0.0, S::NAME);
    }
    let estimation = S::estimate(&data, stats_from_vec(data));
    let mut out = vec![];
    S::serialize(
        &mut out,
        &data,
        stats_from_vec(data),
        data.iter().cloned(),
        data.iter().cloned(),
    )
    .unwrap();

    let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
    (true, estimation, actual_compression, S::NAME)
}

pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
    let min_value = data.iter().cloned().min().unwrap_or(0);
    let max_value = data.iter().cloned().max().unwrap_or(0);
    FastFieldStats {
        min_value,
        max_value,
        num_vals: data.len() as u64,
    }
}
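
// A minimal sketch of how the helpers above compose for a single codec; the printed
// numbers are illustrative, not measured.
#[allow(dead_code)]
fn compare_one_codec() {
    let data: Vec<u64> = (0..10_000u64).map(|i| i * 3).collect();
    let (is_applicable, estimate, actual, name) =
        serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
    if is_applicable {
        // For perfectly linear data both ratios should be close to 0.
        println!("{}: estimated {:.3}, actual {:.3}", name, estimate, actual);
    }
}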

fastfield_codecs/src/multilinearinterpol.rs (new file, 438 lines)
@@ -0,0 +1,438 @@
/*!

The MultiLinearInterpol compressor uses linear interpolation to guess values and stores the deviation, in blocks of 512.

With a CHUNK_SIZE of 512 and 29 bytes of metadata per block, the metadata overhead is 232 / 512 = 0.45 bits per element.
The additional space required per element in a block is driven by the maximum deviation from the linear interpolation estimation function.

E.g. if the maximum deviation of an element is 12, all elements cost 4 bits.

Size per block:
num_elements * num_bits(maximum deviation from interpolation) + 29 bytes metadata

*/

use crate::FastFieldCodecReader;
use crate::FastFieldCodecSerializer;
use crate::FastFieldDataAccess;
use crate::FastFieldStats;
use common::CountingWriter;
use std::io::{self, Read, Write};
use std::ops::Sub;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;

use common::BinarySerializable;
use common::DeserializeFrom;
use tantivy_bitpacker::BitUnpacker;

const CHUNK_SIZE: u64 = 512;

/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct MultiLinearInterpolFastFieldReader {
    pub footer: MultiLinearInterpolFooter,
}

#[derive(Clone, Debug, Default)]
struct Function {
    // The offset in the data is required, because we have different bit_widths per block.
    data_start_offset: u64,
    // start_pos in the block will be CHUNK_SIZE * BLOCK_NUM
    start_pos: u64,
    // only used during serialization, 0 after deserialization
    end_pos: u64,
    // only used during serialization, 0 after deserialization
    value_start_pos: u64,
    // only used during serialization, 0 after deserialization
    value_end_pos: u64,
    slope: f32,
    // The offset so that all values are positive when writing them.
    positive_val_offset: u64,
    num_bits: u8,
    bit_unpacker: BitUnpacker,
}

impl Function {
    fn calc_slope(&mut self) {
        let num_vals = self.end_pos - self.start_pos;
        self.slope = get_slope(self.value_start_pos, self.value_end_pos, num_vals);
    }
    // Split the interpolation into two functions: mutate self into the left half and
    // return the right half.
    fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function {
        let mut new_function = Function {
            start_pos: split_pos,
            end_pos: self.end_pos,
            value_start_pos: split_pos_value,
            value_end_pos: self.value_end_pos,
            ..Default::default()
        };
        new_function.calc_slope();
        self.end_pos = split_pos;
        self.value_end_pos = split_pos_value;
        self.calc_slope();
        new_function
    }
}
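
// `split` in concrete numbers, as a sanity-check sketch (not part of the codec itself):
// splitting a 1024-element interpolation at position 512 with value 1000 leaves the
// left half covering values 0..=1000 and the right half 1000..=2048.
#[test]
fn function_split_sketch() {
    let mut left = Function {
        end_pos: 1024,
        value_end_pos: 2048,
        ..Default::default()
    };
    left.calc_slope();
    let right = left.split(512, 1000);
    assert_eq!((left.start_pos, left.end_pos, left.value_end_pos), (0, 512, 1000));
    assert_eq!((right.start_pos, right.end_pos, right.value_start_pos), (512, 1024, 1000));
}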

impl BinarySerializable for Function {
    fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
        self.data_start_offset.serialize(write)?;
        self.value_start_pos.serialize(write)?;
        self.positive_val_offset.serialize(write)?;
        self.slope.serialize(write)?;
        self.num_bits.serialize(write)?;
        Ok(())
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Function> {
        let data_start_offset = u64::deserialize(reader)?;
        let value_start_pos = u64::deserialize(reader)?;
        let offset = u64::deserialize(reader)?;
        let slope = f32::deserialize(reader)?;
        let num_bits = u8::deserialize(reader)?;
        let interpolation = Function {
            data_start_offset,
            value_start_pos,
            positive_val_offset: offset,
            num_bits,
            bit_unpacker: BitUnpacker::new(num_bits),
            slope,
            ..Default::default()
        };

        Ok(interpolation)
    }
}

#[derive(Clone, Debug)]
pub struct MultiLinearInterpolFooter {
    pub num_vals: u64,
    pub min_value: u64,
    pub max_value: u64,
    interpolations: Vec<Function>,
}

impl BinarySerializable for MultiLinearInterpolFooter {
    fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
        let mut out = vec![];
        self.num_vals.serialize(&mut out)?;
        self.min_value.serialize(&mut out)?;
        self.max_value.serialize(&mut out)?;
        self.interpolations.serialize(&mut out)?;
        write.write_all(&out)?;
        (out.len() as u32).serialize(write)?;
        Ok(())
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<MultiLinearInterpolFooter> {
        let mut footer = MultiLinearInterpolFooter {
            num_vals: u64::deserialize(reader)?,
            min_value: u64::deserialize(reader)?,
            max_value: u64::deserialize(reader)?,
            interpolations: Vec::<Function>::deserialize(reader)?,
        };
        for (num, interpol) in footer.interpolations.iter_mut().enumerate() {
            interpol.start_pos = CHUNK_SIZE * num as u64;
        }
        Ok(footer)
    }
}

#[inline]
fn get_interpolation_position(doc: u64) -> usize {
    let index = doc / CHUNK_SIZE;
    index as usize
}

#[inline]
fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Function {
    &interpolations[get_interpolation_position(doc)]
}
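
// Block lookup in concrete numbers: with CHUNK_SIZE = 512, doc 511 is still in block 0
// and doc 1023 lives in block 1.
#[test]
fn interpolation_position_sketch() {
    assert_eq!(get_interpolation_position(511), 0);
    assert_eq!(get_interpolation_position(1023), 1);
}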

impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
    /// Opens a fast field given a file.
    fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
        let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;

        let (_data, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
        let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;

        Ok(MultiLinearInterpolFastFieldReader { footer })
    }

    #[inline]
    fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
        let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
        let doc = doc - interpolation.start_pos;
        let calculated_value =
            get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
        let diff = interpolation
            .bit_unpacker
            .get(doc, &data[interpolation.data_start_offset as usize..]);
        (calculated_value + diff) - interpolation.positive_val_offset
    }

    #[inline]
    fn min_value(&self) -> u64 {
        self.footer.min_value
    }
    #[inline]
    fn max_value(&self) -> u64 {
        self.footer.max_value
    }
}

#[inline]
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
    ((last_val as f64 - first_val as f64) / (num_vals - 1) as f64) as f32
}

#[inline]
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
    (first_val as i64 + (pos as f32 * slope) as i64) as u64
}
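
// Unlike the whole-column variant in linearinterpol.rs, this `get_calculated_value`
// goes through i64 so that a falling slope inside a block cannot underflow before the
// cast back to u64. For example, position 10 on a line starting at 100 with slope -3.0:
#[test]
fn negative_slope_sketch() {
    assert_eq!(get_calculated_value(100, 10, -3.0), 70); // 100 + (10 * -3) = 70
}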

/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct MultiLinearInterpolFastFieldSerializer {}

impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
    const NAME: &'static str = "MultiLinearInterpol";
    const ID: u8 = 3;
    /// Creates a new fast field serializer.
    fn serialize(
        write: &mut impl Write,
        fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
        data_iter: impl Iterator<Item = u64>,
        _data_iter1: impl Iterator<Item = u64>,
    ) -> io::Result<()> {
        assert!(stats.min_value <= stats.max_value);

        let first_val = fastfield_accessor.get_val(0);
        let last_val = fastfield_accessor.get_val(stats.num_vals - 1);

        let mut first_function = Function {
            end_pos: stats.num_vals,
            value_start_pos: first_val,
            value_end_pos: last_val,
            ..Default::default()
        };
        first_function.calc_slope();
        let mut interpolations = vec![first_function];

        // Since we potentially apply multiple passes over the data, the data is cached.
        // Multiple iterations can be expensive (a merge with index sorting can add a lot
        // of overhead per iteration).
        let data = data_iter.collect::<Vec<_>>();

        // Let's split this into chunks of CHUNK_SIZE.
        for data_pos in (0..data.len() as u64).step_by(CHUNK_SIZE as usize).skip(1) {
            let new_fun = {
                let current_interpolation = interpolations.last_mut().unwrap();
                current_interpolation.split(data_pos, data[data_pos as usize])
            };
            interpolations.push(new_fun);
        }
        // Calculate the offset and the maximum deviation (-> num_bits) for each function.
        for interpolation in &mut interpolations {
            let mut offset = 0;
            let mut rel_positive_max = 0;
            for (pos, actual_value) in data
                [interpolation.start_pos as usize..interpolation.end_pos as usize]
                .iter()
                .cloned()
                .enumerate()
            {
                let calculated_value = get_calculated_value(
                    interpolation.value_start_pos,
                    pos as u64,
                    interpolation.slope,
                );
                if calculated_value > actual_value {
                    // Negative deviation: we need to apply an offset.
                    // We ignore negative deviations in the max value calculation, because
                    // they will be offset to 0.
                    offset = offset.max(calculated_value - actual_value);
                } else {
                    // Positive deviation: no offset required.
                    rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
                }
            }

            interpolation.positive_val_offset = offset;
            interpolation.num_bits = compute_num_bits(rel_positive_max + offset);
        }
        let mut bit_packer = BitPacker::new();

        let write = &mut CountingWriter::wrap(write);
        for interpolation in &mut interpolations {
            interpolation.data_start_offset = write.written_bytes();
            let num_bits = interpolation.num_bits;
            for (pos, actual_value) in data
                [interpolation.start_pos as usize..interpolation.end_pos as usize]
                .iter()
                .cloned()
                .enumerate()
            {
                let calculated_value = get_calculated_value(
                    interpolation.value_start_pos,
                    pos as u64,
                    interpolation.slope,
                );
                let diff = (actual_value + interpolation.positive_val_offset) - calculated_value;
                bit_packer.write(diff, num_bits, write)?;
            }
            bit_packer.flush(write)?;
        }
        bit_packer.close(write)?;

        let footer = MultiLinearInterpolFooter {
            num_vals: stats.num_vals,
            min_value: stats.min_value,
            max_value: stats.max_value,
            interpolations,
        };
        footer.serialize(write)?;
        Ok(())
    }

    fn is_applicable(
        _fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
    ) -> bool {
        if stats.num_vals < 5_000 {
            return false;
        }
        // On serialization the offset is added to the actual value.
        // We need to make sure this won't run into overflow issues.
        // For this we take the maximum theoretical offset and add it to the max value.
        // If this doesn't overflow, the algorithm should be fine.
        let theoretical_maximum_offset = stats.max_value - stats.min_value;
        if stats
            .max_value
            .checked_add(theoretical_maximum_offset)
            .is_none()
        {
            return false;
        }
        true
    }
    /// Estimation for linear interpolation is hard because you don't know
    /// where the local maxima of the deviation from the interpolation are, and
    /// the offset is also unknown.
    fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
        let first_val_in_first_block = fastfield_accessor.get_val(0);
        let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
        let last_val_in_first_block =
            fastfield_accessor.get_val(last_elem_in_first_chunk - 1);
        let slope = get_slope(
            first_val_in_first_block,
            last_val_in_first_block,
            stats.num_vals,
        );

        // let's sample at 0%, 5%, 10% .. 90%, 95%, but for the first block only
        let sample_positions = (0..20)
            .map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
            .collect::<Vec<_>>();

        let max_distance = sample_positions
            .iter()
            .map(|pos| {
                let calculated_value =
                    get_calculated_value(first_val_in_first_block, *pos as u64, slope);
                let actual_value = fastfield_accessor.get_val(*pos as u64);
                distance(calculated_value, actual_value)
            })
            .max()
            .unwrap();

        // Estimate one block and extrapolate the cost to all blocks.
        // The assumption is that the sampled maximum is within 50% of the actual
        // max_distance.
        // It is multiplied by 2 because the interpolation line is expected to run as much
        // above the values as below, so the required offset would be about max_distance.
        let relative_max_value = (max_distance as f32 * 1.5) * 2.0;

        let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals
            // function metadata per block (29 bytes, counted in bits)
            + 29 * 8 * (stats.num_vals / CHUNK_SIZE);
        let num_bits_uncompressed = 64 * stats.num_vals;
        num_bits as f32 / num_bits_uncompressed as f32
    }
}
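
// The 29-byte `Function` metadata per block (three u64s, one f32 and one u8, as
// serialized above) is where the module doc's "0.45 bits per element" figure comes from:
#[test]
fn metadata_overhead_sketch() {
    let metadata_bits_per_value = (29.0 * 8.0) / CHUNK_SIZE as f64;
    assert!((metadata_bits_per_value - 0.453).abs() < 0.001);
}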

fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
    if x < y {
        y - x
    } else {
        x - y
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::get_codec_test_data_sets;

    fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
        crate::tests::create_and_validate::<
            MultiLinearInterpolFastFieldSerializer,
            MultiLinearInterpolFastFieldReader,
        >(data, name)
    }

    #[test]
    fn test_compression() {
        let data = (10..=6_000_u64).collect::<Vec<_>>();
        let (estimate, actual_compression) =
            create_and_validate(&data, "simple monotonically large");
        assert!(actual_compression < 0.2);
        assert!(estimate < 0.20);
        assert!(estimate > 0.15);
        assert!(actual_compression > 0.01);
    }

    #[test]
    fn test_with_codec_data_sets() {
        let data_sets = get_codec_test_data_sets();
        for (mut data, name) in data_sets {
            create_and_validate(&data, name);
            data.reverse();
            create_and_validate(&data, name);
        }
    }
    #[test]
    fn test_simple() {
        let data = (10..=20_u64).collect::<Vec<_>>();
        create_and_validate(&data, "simple monotonically");
    }

    #[test]
    fn border_cases_1() {
        let data = (0..1024).collect::<Vec<_>>();
        create_and_validate(&data, "border case");
    }
    #[test]
    fn border_case_2() {
        let data = (0..1025).collect::<Vec<_>>();
        create_and_validate(&data, "border case");
    }
    #[test]
    fn rand() {
        for _ in 0..10 {
            let mut data = (5_000..20_000)
                .map(|_| rand::random::<u32>() as u64)
                .collect::<Vec<_>>();
            let (estimate, actual_compression) = create_and_validate(&data, "random");
            dbg!(estimate);
            dbg!(actual_compression);

            data.reverse();
            create_and_validate(&data, "random");
        }
    }
}

ownedbytes/Cargo.toml (new file, 11 lines)
@@ -0,0 +1,11 @@
[package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.2.0"
edition = "2018"
description = "Expose data as static slice"
license = "MIT"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
stable_deref_trait = "1.2.0"

ownedbytes/src/lib.rs (new file, 347 lines)
@@ -0,0 +1,347 @@
#![allow(clippy::return_self_not_must_use)]

use stable_deref_trait::StableDeref;
use std::convert::TryInto;
use std::mem;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};

/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]
pub struct OwnedBytes {
    data: &'static [u8],
    box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
}

impl OwnedBytes {
    /// Creates an empty `OwnedBytes`.
    pub fn empty() -> OwnedBytes {
        OwnedBytes::new(&[][..])
    }

    /// Creates an `OwnedBytes` instance given a `StableDeref` object.
    pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
        data_holder: T,
    ) -> OwnedBytes {
        let box_stable_deref = Arc::new(data_holder);
        let bytes: &[u8] = box_stable_deref.as_ref();
        let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
        OwnedBytes {
            data,
            box_stable_deref,
        }
    }

    /// Creates an `OwnedBytes` that is just a view over a slice of the data.
    #[must_use]
    #[inline]
    pub fn slice(&self, range: Range<usize>) -> Self {
        OwnedBytes {
            data: &self.data[range],
            box_stable_deref: self.box_stable_deref.clone(),
        }
    }

    /// Returns the underlying slice of data.
    /// `Deref` and `AsRef` are also available.
    #[inline]
    pub fn as_slice(&self) -> &[u8] {
        self.data
    }

    /// Returns the length of the slice.
    #[inline]
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
    ///
    /// Left will hold `split_len` bytes.
    ///
    /// This operation is cheap and does not require copying any memory.
    /// On the other hand, both `left` and `right` retain a handle over
    /// the entire slice of memory. In other words, the memory will only
    /// be released when both left and right are dropped.
    #[inline]
    #[must_use]
    pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
        let right_box_stable_deref = self.box_stable_deref.clone();
        let left = OwnedBytes {
            data: &self.data[..split_len],
            box_stable_deref: self.box_stable_deref,
        };
        let right = OwnedBytes {
            data: &self.data[split_len..],
            box_stable_deref: right_box_stable_deref,
        };
        (left, right)
    }

    /// Splits the right part of the `OwnedBytes` off at the given offset.
    ///
    /// `self` is truncated to `split_len` bytes, and the remaining bytes are returned.
    pub fn split_off(&mut self, split_len: usize) -> OwnedBytes {
        let right_box_stable_deref = self.box_stable_deref.clone();
        let right_piece = OwnedBytes {
            data: &self.data[split_len..],
            box_stable_deref: right_box_stable_deref,
        };
        self.data = &self.data[..split_len];
        right_piece
    }

    /// Returns true iff this `OwnedBytes` is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.as_slice().is_empty()
    }

    /// Drops the left-most `advance_len` bytes.
    #[inline]
    pub fn advance(&mut self, advance_len: usize) {
        self.data = &self.data[advance_len..]
    }

    /// Reads a `u8` from the `OwnedBytes` and advances by one byte.
    #[inline]
    pub fn read_u8(&mut self) -> u8 {
        assert!(!self.is_empty());

        let byte = self.as_slice()[0];
        self.advance(1);
        byte
    }

    /// Reads a `u64` encoded as little-endian from the `OwnedBytes` and advances by 8 bytes.
    #[inline]
    pub fn read_u64(&mut self) -> u64 {
        assert!(self.len() > 7);

        let octet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
        self.advance(8);
        u64::from_le_bytes(octet)
    }
}
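
// A minimal usage sketch of the API above. A `Vec<u8>` is `StableDeref`, so it can back
// an `OwnedBytes` directly; slicing and splitting never copy the underlying buffer.
#[test]
fn owned_bytes_usage_sketch() {
    let bytes = OwnedBytes::new(vec![1u8, 2, 3, 4, 5]);
    let (left, right) = bytes.split(2);
    assert_eq!(left.as_slice(), &[1, 2]);
    // Both halves keep the same Vec alive.
    let mut tail = right.slice(1..3); // bytes [4, 5]
    assert_eq!(tail.read_u8(), 4);
    assert_eq!(tail.as_slice(), &[5]);
}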

impl fmt::Debug for OwnedBytes {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // We truncate the bytes in order to make sure the debug string
        // is not too long.
        let bytes_truncated: &[u8] = if self.len() > 10 {
            &self.as_slice()[..10]
        } else {
            self.as_slice()
        };
        write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
    }
}

impl PartialEq for OwnedBytes {
    fn eq(&self, other: &OwnedBytes) -> bool {
        self.as_slice() == other.as_slice()
    }
}

impl Eq for OwnedBytes {}

impl PartialEq<[u8]> for OwnedBytes {
    fn eq(&self, other: &[u8]) -> bool {
        self.as_slice() == other
    }
}

impl PartialEq<str> for OwnedBytes {
    fn eq(&self, other: &str) -> bool {
        self.as_slice() == other.as_bytes()
    }
}

impl<'a, T: ?Sized> PartialEq<&'a T> for OwnedBytes
where
    OwnedBytes: PartialEq<T>,
{
    fn eq(&self, other: &&'a T) -> bool {
        *self == **other
    }
}

impl Deref for OwnedBytes {
    type Target = [u8];

    #[inline]
    fn deref(&self) -> &Self::Target {
        self.as_slice()
    }
}

impl io::Read for OwnedBytes {
    #[inline]
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let read_len = {
            let data = self.as_slice();
            if data.len() >= buf.len() {
                let buf_len = buf.len();
                buf.copy_from_slice(&data[..buf_len]);
                buf.len()
            } else {
                let data_len = data.len();
                buf[..data_len].copy_from_slice(data);
                data_len
            }
        };
        self.advance(read_len);
        Ok(read_len)
    }
    #[inline]
    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
        let read_len = {
            let data = self.as_slice();
            buf.extend(data);
            data.len()
        };
        self.advance(read_len);
        Ok(read_len)
    }
    #[inline]
    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
        let read_len = self.read(buf)?;
        if read_len != buf.len() {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "failed to fill whole buffer",
            ));
        }
        Ok(())
    }
}

impl AsRef<[u8]> for OwnedBytes {
    #[inline]
    fn as_ref(&self) -> &[u8] {
        self.as_slice()
    }
}

#[cfg(test)]
mod tests {
    use std::io::{self, Read};

    use super::OwnedBytes;

    #[test]
    fn test_owned_bytes_debug() {
        let short_bytes = OwnedBytes::new(b"abcd".as_ref());
        assert_eq!(
            format!("{:?}", short_bytes),
            "OwnedBytes([97, 98, 99, 100], len=4)"
        );
        let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
        assert_eq!(
            format!("{:?}", long_bytes),
            "OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
        );
    }

    #[test]
    fn test_owned_bytes_read() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
        {
            let mut buf = [0u8; 5];
            bytes.read_exact(&mut buf[..]).unwrap();
            assert_eq!(&buf, b"abcde");
            assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz")
        }
        {
            let mut buf = [0u8; 2];
            bytes.read_exact(&mut buf[..]).unwrap();
            assert_eq!(&buf, b"fg");
            assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz")
        }
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
        let mut buf = [0u8; 5];
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
        assert_eq!(&buf, b"abcde");
        assert_eq!(bytes.as_slice(), b"");
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
        assert_eq!(&buf, b"abcde");
        Ok(())
    }
    #[test]
    fn test_owned_bytes_read_incomplete() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
        let mut buf = [0u8; 7];
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
        assert_eq!(&buf[..5], b"abcde");
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_to_end() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
        let mut buf = Vec::new();
        bytes.read_to_end(&mut buf)?;
        assert_eq!(buf.as_slice(), b"abcde".as_ref());
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_u8() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"\xFF".as_ref());
        assert_eq!(bytes.read_u8(), 255);
        assert_eq!(bytes.len(), 0);
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_u64() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"\0\xFF\xFF\xFF\xFF\xFF\xFF\xFF".as_ref());
        assert_eq!(bytes.read_u64(), u64::MAX - 255);
        assert_eq!(bytes.len(), 0);
        Ok(())
    }

    #[test]
    fn test_owned_bytes_split() {
        let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
        let (left, right) = bytes.split(3);
        assert_eq!(left.as_slice(), b"abc");
        assert_eq!(right.as_slice(), b"defghi");
    }

    #[test]
    fn test_owned_bytes_split_boundary() {
        let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
        {
            let (left, right) = bytes.clone().split(0);
            assert_eq!(left.as_slice(), b"");
            assert_eq!(right.as_slice(), b"abcdefghi");
        }
        {
            let (left, right) = bytes.split(9);
            assert_eq!(left.as_slice(), b"abcdefghi");
            assert_eq!(right.as_slice(), b"");
        }
    }

    #[test]
    fn test_split_off() {
        let mut data = OwnedBytes::new(b"abcdef".as_ref());
        assert_eq!(data, "abcdef");
        assert_eq!(data.split_off(2), "cdef");
        assert_eq!(data, "ab");
        assert_eq!(data.split_off(1), "b");
        assert_eq!(data, "a");
    }
}

@@ -5,12 +5,14 @@ authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Search engine library"""
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/tantivy-search/tantivy"
documentation = "https://quickwit-inc.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/quickwit-inc/tantivy"
repository = "https://github.com/quickwit-inc/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2018"

[dependencies]
combine = {version="4", default-features=false, features=[] }
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }

@@ -1,21 +1,44 @@
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use crate::Occur;
use combine::parser::char::{char, digit, letter, space, spaces, string};
use combine::parser::char::{char, digit, space, spaces, string};
use combine::parser::range::{take_while, take_while1};
use combine::parser::repeat::escaped;
use combine::parser::Parser;
use combine::{
    attempt, choice, eof, many, many1, one_of, optional, parser, satisfy, skip_many1, value,
};
use combine::{error::StringStreamError, parser::combinator::recognize};
use once_cell::sync::Lazy;
use regex::Regex;

fn field<'a>() -> impl Parser<&'a str, Output = String> {
    (
        (letter().or(char('_'))),
        many(satisfy(|c: char| {
            c.is_alphanumeric() || c == '_' || c == '-'
        })),
    )
        .skip(char(':'))
        .map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
// Note: the '-' char is only forbidden at the beginning of a field name; it would be
// clearer to add it to the special characters.
const SPECIAL_CHARS: &[char] = &[
    '+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
];
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*| )"#;

/// Parses a field name.
/// A field name must have at least one character and be followed by a colon.
/// All characters are allowed, including the special characters `SPECIAL_CHARS`, but these
/// need to be escaped with a backslash character '\'.
fn field_name<'a>() -> impl Parser<&'a str, Output = String> {
    static ESCAPED_SPECIAL_CHARS_RE: Lazy<Regex> =
        Lazy::new(|| Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap());

    recognize::<String, _, _>(escaped(
        (
            take_while1(|c| !SPECIAL_CHARS.contains(&c) && c != '-'),
            take_while(|c| !SPECIAL_CHARS.contains(&c)),
        ),
        '\\',
        satisfy(|c| SPECIAL_CHARS.contains(&c)),
    ))
    .skip(char(':'))
    .map(|s| ESCAPED_SPECIAL_CHARS_RE.replace_all(&s, "$1").to_string())
    .and_then(|s: String| match s.is_empty() {
        true => Err(StringStreamError::UnexpectedParse),
        _ => Ok(s),
    })
}
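
// For instance, with the rules above, escaped spaces are accepted and un-escaped in one
// go (a sketch; `field_name` is the parser defined just above, and the tests below
// exercise the same behavior):
//
//     field_name().parse("my\\ field:a") == Ok(("my field".to_string(), "a"))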

fn word<'a>() -> impl Parser<&'a str, Output = String> {
@@ -98,7 +121,7 @@ fn term_val<'a>() -> impl Parser<&'a str, Output = String> {

fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
    let term_val_with_field = negative_number().or(term_val());
    (field(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
    (field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
        field_name: Some(field_name),
        phrase,
    })
@@ -195,7 +218,7 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
    );

    (
        optional(field()).skip(spaces()),
        optional(field_name()).skip(spaces()),
        // try elastic first, if it matches, the range is unbounded
        attempt(elastic_unbounded_range).or(lower_to_upper),
    )
@@ -464,21 +487,21 @@ mod test {

    #[test]
    fn test_parse_elastic_query_ranges() {
        test_parse_query_to_ast_helper("title: >a", "title:{\"a\" TO \"*\"}");
        test_parse_query_to_ast_helper("title:>=a", "title:[\"a\" TO \"*\"}");
        test_parse_query_to_ast_helper("title: <a", "title:{\"*\" TO \"a\"}");
        test_parse_query_to_ast_helper("title:<=a", "title:{\"*\" TO \"a\"]");
        test_parse_query_to_ast_helper("title:<=bsd", "title:{\"*\" TO \"bsd\"]");
        test_parse_query_to_ast_helper("title: >a", "\"title\":{\"a\" TO \"*\"}");
        test_parse_query_to_ast_helper("title:>=a", "\"title\":[\"a\" TO \"*\"}");
        test_parse_query_to_ast_helper("title: <a", "\"title\":{\"*\" TO \"a\"}");
        test_parse_query_to_ast_helper("title:<=a", "\"title\":{\"*\" TO \"a\"]");
        test_parse_query_to_ast_helper("title:<=bsd", "\"title\":{\"*\" TO \"bsd\"]");

        test_parse_query_to_ast_helper("weight: >70", "weight:{\"70\" TO \"*\"}");
        test_parse_query_to_ast_helper("weight:>=70", "weight:[\"70\" TO \"*\"}");
        test_parse_query_to_ast_helper("weight: <70", "weight:{\"*\" TO \"70\"}");
        test_parse_query_to_ast_helper("weight:<=70", "weight:{\"*\" TO \"70\"]");
        test_parse_query_to_ast_helper("weight: >60.7", "weight:{\"60.7\" TO \"*\"}");
        test_parse_query_to_ast_helper("weight: >70", "\"weight\":{\"70\" TO \"*\"}");
        test_parse_query_to_ast_helper("weight:>=70", "\"weight\":[\"70\" TO \"*\"}");
        test_parse_query_to_ast_helper("weight: <70", "\"weight\":{\"*\" TO \"70\"}");
        test_parse_query_to_ast_helper("weight:<=70", "\"weight\":{\"*\" TO \"70\"]");
        test_parse_query_to_ast_helper("weight: >60.7", "\"weight\":{\"60.7\" TO \"*\"}");

        test_parse_query_to_ast_helper("weight: <= 70", "weight:{\"*\" TO \"70\"]");
        test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");

        test_parse_query_to_ast_helper("weight: <= 70.5", "weight:{\"*\" TO \"70.5\"]");
        test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
    }

    #[test]
@@ -491,22 +514,43 @@ mod test {
    #[test]
    fn test_field_name() -> TestParseResult {
        assert_eq!(
            super::field().parse("my-field-name:a")?,
            ("my-field-name".to_string(), "a")
            super::field_name().parse(".my.field.name:a"),
            Ok((".my.field.name".to_string(), "a"))
        );
        assert_eq!(
            super::field().parse("my_field_name:a")?,
            ("my_field_name".to_string(), "a")
            super::field_name().parse("my\\ field\\ name:a"),
            Ok(("my field name".to_string(), "a"))
        );
        assert!(super::field().parse(":a").is_err());
        assert!(super::field().parse("-my_field:a").is_err());
        assert!(super::field_name().parse("my field:a").is_err());
        assert_eq!(
            super::field().parse("_my_field:a")?,
            super::field_name().parse("\\(1\\+1\\):2"),
            Ok(("(1+1)".to_string(), "2"))
        );
        assert_eq!(
            super::field_name().parse("my_field_name:a"),
            Ok(("my_field_name".to_string(), "a"))
        );
        assert!(super::field_name().parse("my_field_name").is_err());
        assert!(super::field_name().parse(":a").is_err());
        assert!(super::field_name().parse("-my_field:a").is_err());
        assert_eq!(
            super::field_name().parse("_my_field:a")?,
            ("_my_field".to_string(), "a")
        );
        Ok(())
    }

    #[test]
    fn test_field_name_re() {
        let escaped_special_chars_re = Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap();
        for special_char in SPECIAL_CHARS.iter() {
            assert_eq!(
                escaped_special_chars_re.replace_all(&format!("\\{}", special_char), "$1"),
                special_char.to_string()
            );
        }
    }

    #[test]
    fn test_range_parser() {
        // testing the range() parser separately
@@ -600,12 +644,14 @@ mod test {

    #[test]
    fn test_single_term_with_field() {
        test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
        test_parse_query_to_ast_helper("abc:toto", "\"abc\":\"toto\"");
    }

    #[test]
    fn test_single_term_with_float() {
        test_parse_query_to_ast_helper("abc:1.1", "abc:\"1.1\"");
        test_parse_query_to_ast_helper("abc:1.1", "\"abc\":\"1.1\"");
        test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":\"1.1\"");
        test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":\"1.1\"");
    }

    #[test]
@@ -621,22 +667,27 @@ mod test {
    #[test]
    fn test_parse_test_query_other() {
        test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")");
        test_parse_query_to_ast_helper("+abc:toto", "abc:\"toto\"");
        test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+abc:\"toto\" -\"titi\")");
        test_parse_query_to_ast_helper("-abc:toto", "(-abc:\"toto\")");
        test_parse_query_to_ast_helper("abc:a b", "(*abc:\"a\" *\"b\")");
        test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
        test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
        test_parse_query_to_ast_helper("+abc:toto", "\"abc\":\"toto\"");
        test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":\"toto\"");
        test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":\"toto\" -\"titi\")");
        test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":\"toto\")");
        test_is_parse_err("--abc:toto");
        test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":\"a\" *\"b\")");
        test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\"");
        test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]");
    }

    #[test]
    fn test_parse_query_with_range() {
        test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
        test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
        test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
        test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:{\"*\" TO \"toto\"}");
        test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
        test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}");
        test_parse_query_to_ast_helper("foo:{a TO z}", "\"foo\":{\"a\" TO \"z\"}");
        test_parse_query_to_ast_helper("foo:[1 TO toto}", "\"foo\":[\"1\" TO \"toto\"}");
        test_parse_query_to_ast_helper("foo:[* TO toto}", "\"foo\":{\"*\" TO \"toto\"}");
        test_parse_query_to_ast_helper("foo:[1 TO *}", "\"foo\":[\"1\" TO \"*\"}");
        test_parse_query_to_ast_helper(
            "1.2.foo.bar:[1.1 TO *}",
            "\"1.2.foo.bar\":[\"1.1\" TO \"*\"}",
        );
        test_is_parse_err("abc + ");
    }
}

@@ -24,7 +24,7 @@ impl Debug for UserInputLeaf {
            ref upper,
        } => {
            if let Some(ref field) = field {
                write!(formatter, "{}:", field)?;
                write!(formatter, "\"{}\":", field)?;
            }
            lower.display_lower(formatter)?;
            write!(formatter, " TO ")?;
@@ -45,7 +45,7 @@ pub struct UserInputLiteral {
impl fmt::Debug for UserInputLiteral {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        match self.field_name {
            Some(ref field_name) => write!(formatter, "{}:\"{}\"", field_name, self.phrase),
            Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase),
            None => write!(formatter, "\"{}\"", self.phrase),
        }
    }
@@ -79,7 +79,7 @@ impl UserInputBound {
        match *self {
            UserInputBound::Inclusive(ref contents) => contents,
            UserInputBound::Exclusive(ref contents) => contents,
            UserInputBound::Unbounded => &"*",
            UserInputBound::Unbounded => "*",
        }
    }
}
@@ -91,6 +91,7 @@ pub enum UserInputAst {
}

impl UserInputAst {
    #[must_use]
    pub fn unary(self, occur: Occur) -> UserInputAst {
        UserInputAst::Clause(vec![(Some(occur), self)])
    }
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
///
|
||||
/// let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap();
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl")).unwrap();
|
||||
/// assert!(index_writer.commit().is_ok());
|
||||
///
|
||||
/// let reader = index.reader().unwrap();
|
||||
|
||||
@@ -83,7 +83,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// ```rust
/// use tantivy::collector::FacetCollector;
/// use tantivy::query::AllQuery;
/// use tantivy::schema::{Facet, Schema, INDEXED, TEXT};
/// use tantivy::schema::{Facet, Schema, FacetOptions, TEXT};
/// use tantivy::{doc, Index};
///
/// fn example() -> tantivy::Result<()> {
@@ -92,7 +92,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// // Facets have their own specific type.
/// // It is not a bad practice to put all of your
/// // facet information in the same field.
/// let facet = schema_builder.add_facet_field("facet", INDEXED);
/// let facet = schema_builder.add_facet_field("facet", FacetOptions::default());
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
@@ -103,23 +103,23 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// title => "The Name of the Wind",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/fiction/fantasy")
|
||||
/// ));
|
||||
/// ))?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "Dune",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/fiction/sci-fi")
|
||||
/// ));
|
||||
/// ))?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "La Vénus d'Ille",
|
||||
/// facet => Facet::from("/lang/fr"),
|
||||
/// facet => Facet::from("/category/fiction/fantasy"),
|
||||
/// facet => Facet::from("/category/fiction/horror")
|
||||
/// ));
|
||||
/// ))?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/biography")
|
||||
/// ));
|
||||
/// ))?;
|
||||
/// index_writer.commit()?;
|
||||
/// }
|
||||
/// let reader = index.reader()?;
|
||||
@@ -297,10 +297,8 @@ impl Collector for FacetCollector {
|
||||
if depth == collapse_depth + 1 {
|
||||
collapsed_id = collapse_facet_ords.len();
|
||||
collapse_facet_ords.push(facet_streamer.term_ord());
|
||||
collapse_mapping.push(collapsed_id);
|
||||
} else {
|
||||
collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -402,7 +400,7 @@ impl<'a> Iterator for FacetChildIterator<'a> {
|
||||
|
||||
impl FacetCounts {
|
||||
/// Returns an iterator over all of the facet count pairs inside this result.
|
||||
/// See the documentation for `FacetCollector` for a usage example.
|
||||
/// See the documentation for [FacetCollector] for a usage example.
|
||||
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_>
|
||||
where
|
||||
Facet: From<T>,
|
||||
@@ -423,7 +421,7 @@ impl FacetCounts {
|
||||
}
|
||||
|
||||
/// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts.
|
||||
/// See the documentation for `FacetCollector` for a usage example.
|
||||
/// See the documentation for [FacetCollector] for a usage example.
|
||||
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
|
||||
where
|
||||
Facet: From<T>,
|
||||
@@ -464,7 +462,7 @@ mod tests {
|
||||
use crate::collector::Count;
|
||||
use crate::core::Index;
|
||||
use crate::query::{AllQuery, QueryParser, TermQuery};
|
||||
use crate::schema::{Document, Facet, Field, IndexRecordOption, Schema, INDEXED};
|
||||
use crate::schema::{Document, Facet, FacetOptions, Field, IndexRecordOption, Schema};
|
||||
use crate::Term;
|
||||
use rand::distributions::Uniform;
|
||||
use rand::prelude::SliceRandom;
|
||||
@@ -472,13 +470,13 @@ mod tests {
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_drilldown() {
|
||||
fn test_facet_collector_drilldown() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let num_facets: usize = 3 * 4 * 5;
|
||||
let facets: Vec<Facet> = (0..num_facets)
|
||||
.map(|mut n| {
|
||||
@@ -493,14 +491,14 @@ mod tests {
|
||||
for i in 0..num_facets * 10 {
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, facets[i % num_facets].clone());
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc)?;
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet(Facet::from("/top1"));
|
||||
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
let counts = searcher.search(&AllQuery, &facet_collector)?;
|
||||
|
||||
{
|
||||
let facets: Vec<(String, u64)> = counts
|
||||
@@ -520,6 +518,7 @@ mod tests {
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -532,48 +531,49 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
fn test_doc_unsorted_multifacet() {
fn test_doc_unsorted_multifacet() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets", INDEXED);
let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/subjects");
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
let counts = searcher.search(&AllQuery, &facet_collector)?;
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
assert_eq!(facets[0].1, 1);
Ok(())
}

#[test]
fn test_doc_search_by_facet() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/A").unwrap(),
));
))?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/B").unwrap(),
));
))?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/C/A").unwrap(),
));
))?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/D/C/A").unwrap(),
));
))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -615,7 +615,7 @@ mod tests {
#[test]
fn test_facet_collector_topk() {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

@@ -639,7 +639,7 @@ mod tests {

let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
@@ -664,7 +664,7 @@ mod tests {
#[test]
fn test_facet_collector_topk_tie_break() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

@@ -679,7 +679,7 @@ mod tests {

let mut index_writer = index.writer_for_tests()?;
for doc in docs {
index_writer.add_document(doc);
index_writer.add_document(doc)?;
}
index_writer.commit()?;

@@ -727,7 +727,7 @@ mod bench {

let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
index_writer.add_document(doc);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();

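Facet fields are also migrated off the generic `INDEXED` flag to a dedicated options type. A minimal sketch of the new declaration, assuming only the `FacetOptions::default()` constructor shown in the hunks above (the field name is illustrative):

```rust
use tantivy::schema::{FacetOptions, Schema};

fn main() {
    let mut schema_builder = Schema::builder();
    // Facet fields now take `FacetOptions` instead of the `INDEXED` flag
    // still used by the other field types.
    let _facet_field = schema_builder.add_facet_field("category", FacetOptions::default());
    let _schema = schema_builder.build();
}
```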
@@ -16,7 +16,7 @@ use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};

/// The `FilterCollector` collector filters docs using a u64 fast field value and a predicate.
/// The `FilterCollector` filters docs using a fast field value and a predicate.
/// Only the documents for which the predicate returned "true" will be passed on to the next collector.
///
/// ```rust
@@ -25,34 +25,37 @@ use crate::{Score, SegmentReader, TantivyError};
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64));
/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64));
/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64));
/// assert!(index_writer.commit().is_ok());
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64))?;
/// index_writer.commit()?;
///
/// let reader = index.reader().unwrap();
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap();
/// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
///
/// assert_eq!(filtered_top_docs.len(), 0);
/// # Ok(())
/// # }
/// ```
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
where

@@ -226,10 +226,10 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64));
writer.add_document(doc!(val_field=>-30i64));
writer.add_document(doc!(val_field=>-12i64));
writer.add_document(doc!(val_field=>-10i64));
writer.add_document(doc!(val_field=>12i64))?;
writer.add_document(doc!(val_field=>-30i64))?;
writer.add_document(doc!(val_field=>-12i64))?;
writer.add_document(doc!(val_field=>-10i64))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -247,13 +247,13 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(val_field=>12i64));
writer.add_document(doc!(val_field=>12i64))?;
writer.commit()?;
writer.add_document(doc!(val_field=>-30i64));
writer.add_document(doc!(val_field=>-30i64))?;
writer.commit()?;
writer.add_document(doc!(val_field=>-12i64));
writer.add_document(doc!(val_field=>-12i64))?;
writer.commit()?;
writer.add_document(doc!(val_field=>-10i64));
writer.add_document(doc!(val_field=>-10i64))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -271,9 +271,9 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)));
writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)));
writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)));
writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)))?;
writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)))?;
writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();

@@ -48,10 +48,10 @@ use tantivy::collector::{Count, TopDocs};
# let mut index_writer = index.writer(3_000_000)?;
# index_writer.add_document(doc!(
# title => "The Name of the Wind",
# ));
# ))?;
# index_writer.add_document(doc!(
# title => "The Diary of Muadib",
# ));
# ))?;
# index_writer.commit()?;
# let reader = index.reader()?;
# let searcher = reader.searcher();
@@ -178,9 +178,9 @@ pub trait Collector: Sync + Send {
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let mut segment_collector = self.for_segment(segment_ord as u32, reader)?;

if let Some(delete_bitset) = reader.delete_bitset() {
if let Some(alive_bitset) = reader.alive_bitset() {
weight.for_each(reader, &mut |doc, score| {
if delete_bitset.is_alive(doc) {
if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
})?;

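The hunk above renames `delete_bitset()` to `alive_bitset()` without changing the control flow. A hedged sketch of the resulting per-segment collection loop; `weight`, `reader`, and `segment_collector` are assumed in scope as in the hunk, and the no-deletes branch is added here purely for illustration:

```rust
if let Some(alive_bitset) = reader.alive_bitset() {
    // The segment has deletes: only still-alive documents are collected.
    weight.for_each(reader, &mut |doc, score| {
        if alive_bitset.is_alive(doc) {
            segment_collector.collect(doc, score);
        }
    })?;
} else {
    // No deletes in this segment (illustrative branch; the hunk above only
    // shows the deletes case): every matching document can be collected.
    weight.for_each(reader, &mut |doc, score| {
        segment_collector.collect(doc, score);
    })?;
}
```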
@@ -112,19 +112,19 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
/// let mut index_writer = index.writer(3_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
/// index_writer.commit()?;
///
/// let mut index_writer = index.writer(3_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// assert!(index_writer.commit().is_ok());
///
/// let reader = index.reader().unwrap();
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let mut collectors = MultiCollector::new();
@@ -139,6 +139,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
///
/// assert_eq!(count, 2);
/// assert_eq!(top_docs.len(), 2);
/// # Ok(())
/// # }
/// ```
#[allow(clippy::type_complexity)]
#[derive(Default)]
@@ -252,33 +254,34 @@ mod tests {
use crate::Term;

#[test]
fn test_multi_collector() {
fn test_multi_collector() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();

let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text=>"abc"));
index_writer.add_document(doc!(text=>"abc abc abc"));
index_writer.add_document(doc!(text=>"abc abc"));
index_writer.commit().unwrap();
index_writer.add_document(doc!(text=>""));
index_writer.add_document(doc!(text=>"abc abc abc abc"));
index_writer.add_document(doc!(text=>"abc"));
index_writer.commit().unwrap();
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(text=>"abc"))?;
index_writer.add_document(doc!(text=>"abc abc abc"))?;
index_writer.add_document(doc!(text=>"abc abc"))?;
index_writer.commit()?;
index_writer.add_document(doc!(text=>""))?;
index_writer.add_document(doc!(text=>"abc abc abc abc"))?;
index_writer.add_document(doc!(text=>"abc"))?;
index_writer.commit()?;
}
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();
let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic);

let mut collectors = MultiCollector::new();
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
let count_handler = collectors.add_collector(Count);
let mut multifruits = searcher.search(&query, &mut collectors).unwrap();
let mut multifruits = searcher.search(&query, &collectors).unwrap();

assert_eq!(count_handler.extract(&mut multifruits), 5);
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
Ok(())
}
}

@@ -25,7 +25,7 @@ pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
};

#[test]
pub fn test_filter_collector() {
pub fn test_filter_collector() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let price = schema_builder.add_u64_field("price", FAST);
@@ -33,25 +33,25 @@ pub fn test_filter_collector() {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()));
assert!(index_writer.commit().is_ok());
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()))?;
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()))?;
index_writer.commit()?;

let reader = index.reader().unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();

let query_parser = QueryParser::for_index(&index, vec![title]);
let query = query_parser.parse_query("diary").unwrap();
let query = query_parser.parse_query("diary")?;
let filter_some_collector = FilterCollector::new(
price,
&|value: u64| value > 20_120u64,
TopDocs::with_limit(2),
);
let top_docs = searcher.search(&query, &filter_some_collector).unwrap();
let top_docs = searcher.search(&query, &filter_some_collector)?;

assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
@@ -67,9 +67,10 @@ pub fn test_filter_collector() {
}

let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
let filtered_date_docs = searcher.search(&query, &filter_dates_collector).unwrap();
let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;

assert_eq!(filtered_date_docs.len(), 2);
Ok(())
}

/// Stores all of the doc ids.
@@ -274,8 +275,8 @@ fn make_test_searcher() -> crate::Result<crate::LeasedItem<Searcher>> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(Document::default());
index_writer.add_document(Document::default());
index_writer.add_document(Document::default())?;
index_writer.add_document(Document::default())?;
index_writer.commit()?;
Ok(index.reader()?.searcher())
}

@@ -70,9 +70,7 @@ where
/// # Panics
/// The method panics if limit is 0
pub fn with_limit(limit: usize) -> TopCollector<T> {
if limit < 1 {
panic!("Limit must be strictly greater than 0.");
}
assert!(limit >= 1, "Limit must be strictly greater than 0.");
Self {
limit,
offset: 0,

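The `if`/`panic!` block is collapsed into an `assert!` with the same message, so the contract is unchanged: a zero limit still panics. A small test sketch of that invariant:

```rust
use tantivy::collector::TopDocs;

// The contract is unchanged by the `assert!` rewrite: a zero limit panics.
#[test]
#[should_panic(expected = "Limit must be strictly greater than 0.")]
fn zero_limit_still_panics() {
    let _ = TopDocs::with_limit(0);
}
```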
@@ -94,27 +94,30 @@ where
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// assert!(index_writer.commit().is_ok());
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
/// index_writer.commit()?;
///
/// let reader = index.reader().unwrap();
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
/// let query = query_parser.parse_query("diary")?;
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
///
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
/// # Ok(())
/// # }
/// ```
pub struct TopDocs(TopCollector<Score>);

@@ -180,30 +183,34 @@ impl TopDocs {
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"));
/// assert!(index_writer.commit().is_ok());
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"))?;
/// index_writer.commit()?;
///
/// let reader = index.reader().unwrap();
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap();
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
/// let query = query_parser.parse_query("diary")?;
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1))?;
///
/// assert_eq!(top_docs.len(), 2);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 4));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
/// Ok(())
/// # }
/// ```
#[must_use]
pub fn and_offset(self, offset: usize) -> TopDocs {
TopDocs(self.0.and_offset(offset))
}
@@ -234,11 +241,11 @@ impl TopDocs {
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
/// # assert!(index_writer.commit().is_ok());
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64))?;
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64))?;
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64))?;
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64))?;
/// # index_writer.commit()?;
/// # let reader = index.reader()?;
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
@@ -316,9 +323,9 @@ impl TopDocs {
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64));
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64));
/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64));
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64))?;
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64))?;
/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64))?;
/// # assert!(index_writer.commit().is_ok());
/// # let reader = index.reader()?;
/// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?;
@@ -417,9 +424,9 @@ impl TopDocs {
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// let product_name = index.schema().get_field("product_name").unwrap();
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64));
/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64));
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64))?;
/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64))?;
/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64))?;
/// index_writer.commit()?;
/// Ok(index)
/// }
@@ -527,9 +534,9 @@ impl TopDocs {
/// #
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// let boosted: Field = index.schema().get_field("boosted").unwrap();
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64));
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64));
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64))?;
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64))?;
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64))?;
/// # index_writer.commit()?;
/// // ...
/// # let user_query = "diary";
@@ -629,10 +636,10 @@ impl Collector for TopDocs {
let heap_len = self.0.limit + self.0.offset;
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);

if let Some(delete_bitset) = reader.delete_bitset() {
if let Some(alive_bitset) = reader.alive_bitset() {
let mut threshold = Score::MIN;
weight.for_each_pruning(threshold, reader, &mut |doc, score| {
if delete_bitset.is_deleted(doc) {
if alive_bitset.is_deleted(doc) {
return threshold;
}
let heap_item = ComparableDoc {
@@ -713,20 +720,18 @@ mod tests {
use crate::Score;
use crate::{DocAddress, DocId, SegmentReader};

fn make_index() -> Index {
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
index_writer.add_document(doc!(text_field=>"I like Droopy"));
assert!(index_writer.commit().is_ok());
}
index
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."))?;
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"))?;
index_writer.add_document(doc!(text_field=>"I like Droopy"))?;
index_writer.commit()?;
Ok(index)
}

fn assert_results_equals(results: &[(Score, DocAddress)], expected: &[(Score, DocAddress)]) {
@@ -737,17 +742,15 @@ mod tests {
}

#[test]
fn test_top_collector_not_at_capacity_without_offset() {
let index = make_index();
fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> {
let index = make_index()?;
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let text_query = query_parser.parse_query("droopy tax")?;
let score_docs: Vec<(Score, DocAddress)> = index
.reader()
.unwrap()
.reader()?
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
.search(&text_query, &TopDocs::with_limit(4))?;
assert_results_equals(
&score_docs,
&[
@@ -756,11 +759,12 @@ mod tests {
(0.48527452, DocAddress::new(0, 0)),
],
);
Ok(())
}

#[test]
fn test_top_collector_not_at_capacity_with_offset() {
let index = make_index();
let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -775,7 +779,7 @@ mod tests {

#[test]
fn test_top_collector_at_capacity() {
let index = make_index();
let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -796,7 +800,7 @@ mod tests {

#[test]
fn test_top_collector_at_capacity_with_offset() {
let index = make_index();
let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -817,7 +821,7 @@ mod tests {

#[test]
fn test_top_collector_stable_sorting() {
let index = make_index();
let index = make_index().unwrap();

// using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher();
@@ -848,29 +852,35 @@ mod tests {
const SIZE: &str = "size";

#[test]
fn test_top_field_collector_not_at_capacity() {
fn test_top_field_collector_not_at_capacity() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field(TITLE, TEXT);
let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build();
let (index, query) = index("beer", title, schema, |index_writer| {
index_writer.add_document(doc!(
title => "bottle of beer",
size => 12u64,
));
index_writer.add_document(doc!(
title => "growler of beer",
size => 64u64,
));
index_writer.add_document(doc!(
title => "pint of beer",
size => 16u64,
));
index_writer
.add_document(doc!(
title => "bottle of beer",
size => 12u64,
))
.unwrap();
index_writer
.add_document(doc!(
title => "growler of beer",
size => 64u64,
))
.unwrap();
index_writer
.add_document(doc!(
title => "pint of beer",
size => 16u64,
))
.unwrap();
});
let searcher = index.reader().unwrap().searcher();
let searcher = index.reader()?.searcher();

let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
assert_eq!(
&top_docs[..],
&[
@@ -879,6 +889,7 @@ mod tests {
(12, DocAddress::new(0, 0))
]
);
Ok(())
}

#[test]
@@ -894,12 +905,12 @@ mod tests {
index_writer.add_document(doc!(
name => "Paul Robeson",
birthday => pr_birthday
));
))?;
let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?;
index_writer.add_document(doc!(
name => "Minnie Riperton",
birthday => mr_birthday
));
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
@@ -926,11 +937,11 @@ mod tests {
index_writer.add_document(doc!(
city => "georgetown",
altitude => -1i64,
));
))?;
index_writer.add_document(doc!(
city => "tokyo",
altitude => 40i64,
));
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
@@ -956,11 +967,11 @@ mod tests {
index_writer.add_document(doc!(
city => "georgetown",
altitude => -1.0f64,
));
))?;
index_writer.add_document(doc!(
city => "tokyo",
altitude => 40f64,
));
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
@@ -983,10 +994,12 @@ mod tests {
let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build();
let (index, _) = index("beer", title, schema, |index_writer| {
index_writer.add_document(doc!(
title => "bottle of beer",
size => 12u64,
));
index_writer
.add_document(doc!(
title => "bottle of beer",
size => 12u64,
))
.unwrap();
});
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
@@ -1003,7 +1016,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(size=>1u64));
index_writer.add_document(doc!(size=>1u64))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
@@ -1020,7 +1033,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(size=>1u64));
index_writer.add_document(doc!(size=>1u64))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
@@ -1033,30 +1046,26 @@ mod tests {
}

#[test]
fn test_tweak_score_top_collector_with_offset() {
let index = make_index();
fn test_tweak_score_top_collector_with_offset() -> crate::Result<()> {
let index = make_index()?;
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let text_query = query_parser.parse_query("droopy tax")?;
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
);
let score_docs: Vec<(u32, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &collector)
.unwrap();

let score_docs: Vec<(u32, DocAddress)> =
index.reader()?.searcher().search(&text_query, &collector)?;
assert_eq!(
score_docs,
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
);
Ok(())
}

#[test]
fn test_custom_score_top_collector_with_offset() {
let index = make_index();
let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -1080,7 +1089,7 @@ mod tests {
query: &str,
query_field: Field,
schema: Schema,
mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
mut doc_adder: impl FnMut(&mut IndexWriter),
) -> (Index, Box<dyn Query>) {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();

@@ -1,396 +0,0 @@
use std::fmt;
use std::u64;

#[derive(Clone, Copy, Eq, PartialEq)]
pub(crate) struct TinySet(u64);

impl fmt::Debug for TinySet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.into_iter().collect::<Vec<u32>>().fmt(f)
}
}

pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator {
type Item = u32;

fn next(&mut self) -> Option<Self::Item> {
self.0.pop_lowest()
}
}

impl IntoIterator for TinySet {
type Item = u32;
type IntoIter = TinySetIterator;
fn into_iter(self) -> Self::IntoIter {
TinySetIterator(self)
}
}

impl TinySet {
/// Returns an empty `TinySet`.
pub fn empty() -> TinySet {
TinySet(0u64)
}

pub fn clear(&mut self) {
self.0 = 0u64;
}

/// Returns the complement of the set in `[0, 64[`.
fn complement(self) -> TinySet {
TinySet(!self.0)
}

/// Returns true iff the `TinySet` contains the element `el`.
pub fn contains(self, el: u32) -> bool {
!self.intersect(TinySet::singleton(el)).is_empty()
}

/// Returns the number of elements in the TinySet.
pub fn len(self) -> u32 {
self.0.count_ones()
}

/// Returns the intersection of `self` and `other`
pub fn intersect(self, other: TinySet) -> TinySet {
TinySet(self.0 & other.0)
}

/// Creates a new `TinySet` containing only one element
/// within `[0; 64[`
#[inline]
pub fn singleton(el: u32) -> TinySet {
TinySet(1u64 << u64::from(el))
}

/// Insert a new element within [0..64[
#[inline]
pub fn insert(self, el: u32) -> TinySet {
self.union(TinySet::singleton(el))
}

/// Insert a new element within [0..64[
#[inline]
pub fn insert_mut(&mut self, el: u32) -> bool {
let old = *self;
*self = old.insert(el);
old != *self
}

/// Returns the union of two tinysets
#[inline]
pub fn union(self, other: TinySet) -> TinySet {
TinySet(self.0 | other.0)
}

/// Returns true iff the `TinySet` is empty.
#[inline]
pub fn is_empty(self) -> bool {
self.0 == 0u64
}

/// Returns the lowest element in the `TinySet`
/// and removes it.
#[inline]
pub fn pop_lowest(&mut self) -> Option<u32> {
if self.is_empty() {
None
} else {
let lowest = self.0.trailing_zeros() as u32;
self.0 ^= TinySet::singleton(lowest).0;
Some(lowest)
}
}

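`pop_lowest` pairs `trailing_zeros` with an XOR against the singleton mask. A worked example of the bit arithmetic, using a plain `u64` standing in for the `TinySet` representation above:

```rust
fn main() {
    // Bits 2 and 5 set: the set {2, 5} is represented as 0b100100.
    let mut bits: u64 = 0b10_0100;
    // `trailing_zeros` locates the lowest set bit.
    let lowest = bits.trailing_zeros();
    assert_eq!(lowest, 2);
    // XOR with the singleton mask `1 << lowest` clears exactly that bit.
    bits ^= 1u64 << lowest;
    assert_eq!(bits, 0b10_0000);
}
```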
/// Returns a `TinySet` that contains all values up
/// to limit excluded.
///
/// The limit is assumed to be strictly lower than 64.
pub fn range_lower(upper_bound: u32) -> TinySet {
TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
}

/// Returns a `TinySet` that contains all values greater
/// than or equal to the given limit (and up to 63).
///
/// The limit is assumed to be strictly lower than 64.
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
TinySet::range_lower(from_included).complement()
}
}

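`range_lower` builds its mask with the classic `(1 << n) - 1` trick, and `range_greater_or_equal` is just its bitwise complement. A worked example:

```rust
fn main() {
    // range_lower(3): (1 << 3) - 1 = 0b1000 - 1 = 0b0111, i.e. {0, 1, 2}.
    let lower: u64 = (1u64 << 3) - 1;
    assert_eq!(lower, 0b0111);
    // range_greater_or_equal(3) is the complement: bits 3..=63, i.e. {3, ..., 63}.
    let greater_or_equal = !lower;
    assert_eq!(greater_or_equal.count_ones(), 61);
    assert_eq!(greater_or_equal.trailing_zeros(), 3);
}
```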
#[derive(Clone)]
pub struct BitSet {
tinysets: Box<[TinySet]>,
len: usize,
max_value: u32,
}

fn num_buckets(max_val: u32) -> u32 {
(max_val + 63u32) / 64u32
}

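`num_buckets` is a ceiling division by 64: adding 63 before dividing rounds any partial bucket up, matching `test_bitset_num_buckets` further down. A worked example:

```rust
fn main() {
    let num_buckets = |max_val: u32| (max_val + 63) / 64;
    assert_eq!(num_buckets(0), 0);  // no possible values, no buckets
    assert_eq!(num_buckets(64), 1); // values 0..64 fit in a single TinySet
    assert_eq!(num_buckets(65), 2); // value 64 spills into a second bucket
}
```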
impl BitSet {
/// Create a new `BitSet` that may contain elements
/// within `[0, max_val[`.
pub fn with_max_value(max_value: u32) -> BitSet {
let num_buckets = num_buckets(max_value);
let tinysets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
BitSet {
tinysets,
len: 0,
max_value,
}
}

/// Removes all elements from the `BitSet`.
pub fn clear(&mut self) {
for tinyset in self.tinysets.iter_mut() {
*tinyset = TinySet::empty();
}
}

/// Returns the number of elements in the `BitSet`.
pub fn len(&self) -> usize {
self.len
}

/// Inserts an element in the `BitSet`
pub fn insert(&mut self, el: u32) {
// we do not check saturated els.
let higher = el / 64u32;
let lower = el % 64u32;
self.len += if self.tinysets[higher as usize].insert_mut(lower) {
1
} else {
0
};
}

/// Returns true iff the element is in the `BitSet`.
pub fn contains(&self, el: u32) -> bool {
self.tinyset(el / 64u32).contains(el % 64)
}

/// Returns the first non-empty `TinySet` associated with a bucket greater
/// than or equal to `bucket`.
///
/// Reminder: the tiny set with the bucket `bucket`, represents the
/// elements from `bucket * 64` to `(bucket+1) * 64`.
pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
self.tinysets[bucket as usize..]
.iter()
.cloned()
.position(|tinyset| !tinyset.is_empty())
.map(|delta_bucket| bucket + delta_bucket as u32)
}

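Combining `first_non_empty_bucket`, `tinyset`, and `pop_lowest` yields an ascending traversal of the whole set, which is essentially how a doc set such as `BitSetDocSet` walks the bit set. An illustrative helper (not part of the file; it would live inside this module, since the types are crate-private):

```rust
// Illustrative sketch: drain a BitSet in ascending order.
fn for_each_element(bitset: &BitSet, mut f: impl FnMut(u32)) {
    let mut bucket = 0u32;
    while let Some(non_empty) = bitset.first_non_empty_bucket(bucket) {
        // `tinyset` returns a copy, so popping does not mutate the BitSet.
        let mut tinyset = bitset.tinyset(non_empty);
        while let Some(low) = tinyset.pop_lowest() {
            // Bucket `b` covers the element range [b * 64, (b + 1) * 64).
            f(non_empty * 64 + low);
        }
        bucket = non_empty + 1;
    }
}
```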
pub fn max_value(&self) -> u32 {
self.max_value
}

/// Returns the tiny bitset representing the
/// set restricted to the number range from
/// `bucket * 64` to `(bucket + 1) * 64`.
pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
self.tinysets[bucket as usize]
}
}

#[cfg(test)]
mod tests {

use super::BitSet;
use super::TinySet;
use crate::docset::{DocSet, TERMINATED};
use crate::query::BitSetDocSet;
use crate::tests;
use crate::tests::generate_nonunique_unsorted;
use std::collections::BTreeSet;
use std::collections::HashSet;

#[test]
fn test_tiny_set() {
assert!(TinySet::empty().is_empty());
{
let mut u = TinySet::empty().insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
{
let mut u = TinySet::empty().insert(1u32).insert(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none())
}
{
let mut u = TinySet::empty().insert(2u32);
assert_eq!(u.pop_lowest(), Some(2u32));
u.insert_mut(1u32);
assert_eq!(u.pop_lowest(), Some(1u32));
assert!(u.pop_lowest().is_none());
}
{
let mut u = TinySet::empty().insert(63u32);
assert_eq!(u.pop_lowest(), Some(63u32));
assert!(u.pop_lowest().is_none());
}
}

#[test]
fn test_bitset() {
let test_against_hashset = |els: &[u32], max_value: u32| {
let mut hashset: HashSet<u32> = HashSet::new();
let mut bitset = BitSet::with_max_value(max_value);
for &el in els {
assert!(el < max_value);
hashset.insert(el);
bitset.insert(el);
}
for el in 0..max_value {
assert_eq!(hashset.contains(&el), bitset.contains(el));
}
assert_eq!(bitset.max_value(), max_value);
};

test_against_hashset(&[], 0);
test_against_hashset(&[], 1);
test_against_hashset(&[0u32], 1);
test_against_hashset(&[0u32], 100);
test_against_hashset(&[1u32, 2u32], 4);
test_against_hashset(&[99u32], 100);
test_against_hashset(&[63u32], 64);
test_against_hashset(&[62u32, 63u32], 64);
}

#[test]
fn test_bitset_large() {
let arr = generate_nonunique_unsorted(100_000, 5_000);
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
let mut bitset = BitSet::with_max_value(100_000);
for el in arr {
btreeset.insert(el);
bitset.insert(el);
}
for i in 0..100_000 {
assert_eq!(btreeset.contains(&i), bitset.contains(i));
}
assert_eq!(btreeset.len(), bitset.len());
let mut bitset_docset = BitSetDocSet::from(bitset);
let mut remaining = true;
for el in btreeset.into_iter() {
assert!(remaining);
assert_eq!(bitset_docset.doc(), el);
remaining = bitset_docset.advance() != TERMINATED;
}
assert!(!remaining);
}

#[test]
fn test_bitset_num_buckets() {
use super::num_buckets;
assert_eq!(num_buckets(0u32), 0);
assert_eq!(num_buckets(1u32), 1);
assert_eq!(num_buckets(64u32), 1);
assert_eq!(num_buckets(65u32), 2);
assert_eq!(num_buckets(128u32), 2);
assert_eq!(num_buckets(129u32), 3);
}

#[test]
fn test_tinyset_range() {
assert_eq!(
TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
[0, 1, 2]
);
assert!(TinySet::range_lower(0).is_empty());
assert_eq!(
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
(0u32..63u32).collect::<Vec<_>>()
);
assert_eq!(
TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
[0]
);
assert_eq!(
TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
[0, 1]
);
assert_eq!(
TinySet::range_greater_or_equal(3)
.into_iter()
.collect::<Vec<u32>>(),
(3u32..64u32).collect::<Vec<_>>()
);
}

#[test]
fn test_bitset_len() {
let mut bitset = BitSet::with_max_value(1_000);
assert_eq!(bitset.len(), 0);
bitset.insert(3u32);
assert_eq!(bitset.len(), 1);
bitset.insert(103u32);
assert_eq!(bitset.len(), 2);
bitset.insert(3u32);
assert_eq!(bitset.len(), 2);
bitset.insert(103u32);
assert_eq!(bitset.len(), 2);
bitset.insert(104u32);
assert_eq!(bitset.len(), 3);
}

#[test]
fn test_bitset_clear() {
let mut bitset = BitSet::with_max_value(1_000);
let els = tests::sample(1_000, 0.01f64);
for &el in &els {
bitset.insert(el);
}
assert!(els.iter().all(|el| bitset.contains(*el)));
bitset.clear();
for el in 0u32..1000u32 {
assert!(!bitset.contains(el));
}
}
}

#[cfg(all(test, feature = "unstable"))]
mod bench {

use super::BitSet;
use super::TinySet;
use test;

#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
b.iter(|| {
let mut tinyset = TinySet::singleton(test::black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
});
}

#[bench]
fn bench_tinyset_sum(b: &mut test::Bencher) {
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
b.iter(|| {
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
}

#[bench]
fn bench_tinyarr_sum(b: &mut test::Bencher) {
let v = [10u32, 14u32, 21u32];
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
}

#[bench]
fn bench_bitset_initialize(b: &mut test::Bencher) {
b.iter(|| BitSet::with_max_value(1_000_000));
}
}
@@ -42,7 +42,7 @@ fn load_metas(
"Meta file does not contain valid utf8 file.".to_string(),
)
})?;
IndexMeta::deserialize(&meta_string, &inventory)
IndexMeta::deserialize(&meta_string, inventory)
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.to_path_buf(),
@@ -120,11 +120,11 @@ impl IndexBuilder {
/// Creates a new index in a given filepath.
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
/// If a previous index was in this directory, it returns an `IndexAlreadyExists` error.
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
if Index::exists(&mmap_directory)? {
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::open(directory_path)?);
if Index::exists(&*mmap_directory)? {
return Err(TantivyError::IndexAlreadyExists);
}
self.create(mmap_directory)
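`create_in_dir` now refuses to overwrite an existing index instead of destroying its meta file. A hedged sketch of handling the new error; `Index::open_in_dir` is assumed available under the `mmap` feature, and the path is illustrative:

```rust
use tantivy::schema::Schema;
use tantivy::{Index, TantivyError};

fn create_or_open(path: &str, schema: Schema) -> tantivy::Result<Index> {
    match Index::create_in_dir(path, schema) {
        Ok(index) => Ok(index),
        // New behavior: an existing index is reported instead of wiped.
        Err(TantivyError::IndexAlreadyExists) => Index::open_in_dir(path),
        Err(err) => Err(err),
    }
}
```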
@@ -139,7 +139,7 @@ impl IndexBuilder {
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
#[cfg(feature = "mmap")]
pub fn create_from_tempdir(self) -> crate::Result<Index> {
let mmap_directory = MmapDirectory::create_from_tempdir()?;
let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::create_from_tempdir()?);
self.create(mmap_directory)
}
fn get_expect_schema(&self) -> crate::Result<Schema> {
@@ -149,8 +149,9 @@ impl IndexBuilder {
.ok_or(TantivyError::IndexBuilderMissingArgument("schema"))
}
/// Opens or creates a new index in the provided directory
pub fn open_or_create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
if !Index::exists(&dir)? {
pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
let dir = dir.into();
if !Index::exists(&*dir)? {
return self.create(dir);
}
let index = Index::open(dir)?;
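With the new `Into<Box<dyn Directory>>` bound, callers hand over a boxed trait object rather than a generic `Dir: Directory`, mirroring the updated tests later in this diff. A minimal sketch:

```rust
use tantivy::directory::{Directory, RamDirectory};
use tantivy::schema::Schema;
use tantivy::Index;

fn open_in_ram(schema: Schema) -> tantivy::Result<Index> {
    // Any concrete directory is first boxed into `Box<dyn Directory>`.
    let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
    Index::open_or_create(directory, schema)
}
```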
@@ -165,7 +166,8 @@ impl IndexBuilder {
/// Creates a new index given an implementation of the trait `Directory`.
///
/// If a directory previously existed, it will be erased.
fn create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
fn create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
let dir = dir.into();
let directory = ManagedDirectory::wrap(dir)?;
save_new_metas(
self.get_expect_schema()?,
@@ -198,7 +200,7 @@ impl Index {
/// Examines the directory to see if it contains an index.
///
/// Effectively, it only checks for the presence of the `meta.json` file.
pub fn exists<Dir: Directory>(dir: &Dir) -> Result<bool, OpenReadError> {
pub fn exists(dir: &dyn Directory) -> Result<bool, OpenReadError> {
dir.exists(&META_FILEPATH)
}

@@ -215,7 +217,7 @@ impl Index {
/// Replace the default single thread search executor pool
/// by a thread pool with a given number of threads.
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-")?);
self.executor = Arc::new(Executor::multi_thread(num_threads, "tantivy-search-")?);
Ok(())
}

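The only change in the hunk above is the thread-name prefix, from `thrd-tantivy-search-` to `tantivy-search-`; the setter itself is used as before. A minimal sketch:

```rust
use tantivy::schema::Schema;
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut index = Index::create_in_ram(Schema::builder().build());
    // Searches now run on a 4-thread pool whose threads are named
    // "tantivy-search-*" after this change.
    index.set_multithread_executor(4)?;
    Ok(())
}
```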
@@ -229,7 +231,8 @@ impl Index {
/// Creates a new index using the `RamDirectory`.
///
/// The index will be allocated in anonymous memory.
/// This should only be used for unit tests.
/// This is useful for indexing a small set of documents,
/// for instance for unit tests or a temporary in-memory index.
pub fn create_in_ram(schema: Schema) -> Index {
IndexBuilder::new().schema(schema).create_in_ram().unwrap()
}
@@ -237,7 +240,7 @@ impl Index {
/// Creates a new index in a given filepath.
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
/// If a previous index was in this directory, then it returns an `IndexAlreadyExists` error.
#[cfg(feature = "mmap")]
pub fn create_in_dir<P: AsRef<Path>>(
directory_path: P,
@@ -249,7 +252,11 @@ impl Index {
}

/// Opens or creates a new index in the provided directory
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
pub fn open_or_create<T: Into<Box<dyn Directory>>>(
dir: T,
schema: Schema,
) -> crate::Result<Index> {
let dir = dir.into();
IndexBuilder::new().schema(schema).open_or_create(dir)
}

@@ -269,11 +276,12 @@ impl Index {
/// Creates a new index given an implementation of the trait `Directory`.
///
/// If a directory previously existed, it will be erased.
pub fn create<Dir: Directory>(
dir: Dir,
pub fn create<T: Into<Box<dyn Directory>>>(
dir: T,
schema: Schema,
settings: IndexSettings,
) -> crate::Result<Index> {
let dir: Box<dyn Directory> = dir.into();
let mut builder = IndexBuilder::new().schema(schema);
builder = builder.settings(settings);
builder.create(dir)
@@ -364,7 +372,8 @@ impl Index {
}

/// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> crate::Result<Index> {
pub fn open<T: Into<Box<dyn Directory>>>(directory: T) -> crate::Result<Index> {
let directory = directory.into();
let directory = ManagedDirectory::wrap(directory)?;
let inventory = SegmentMetaInventory::default();
let metas = load_metas(&directory, &inventory)?;
@@ -394,9 +403,7 @@ impl Index {
///
/// # Errors
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
///
/// # Panics
/// If the heap size per thread is too small, panics.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`
pub fn writer_with_num_threads(
&self,
num_threads: usize,
@@ -438,14 +445,13 @@ impl Index {
/// Creates a multithreaded writer
///
/// Tantivy will automatically define the number of threads to use, but
/// no more than [`MAX_NUM_THREAD`] threads.
/// no more than 8 threads.
/// `overall_heap_size_in_bytes` is the total target memory usage that will be split
/// between a given number of threads.
///
/// # Errors
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`
pub fn writer(&self, overall_heap_size_in_bytes: usize) -> crate::Result<IndexWriter> {
let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
@@ -523,7 +529,22 @@ impl Index {

/// Returns the set of corrupted files
pub fn validate_checksum(&self) -> crate::Result<HashSet<PathBuf>> {
self.directory.list_damaged().map_err(Into::into)
let managed_files = self.directory.list_managed_files();
let active_segments_files: HashSet<PathBuf> = self
.searchable_segment_metas()?
.iter()
.flat_map(|segment_meta| segment_meta.list_files())
.collect();
let active_existing_files: HashSet<&PathBuf> =
active_segments_files.intersection(&managed_files).collect();

let mut damaged_files = HashSet::new();
for path in active_existing_files {
if !self.directory.validate_checksum(path)? {
damaged_files.insert((*path).clone());
}
}
Ok(damaged_files)
}
}

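The reworked `validate_checksum` checksums only the files that belong to searchable segments and are still managed by the directory, returning the set of damaged paths. A hedged usage sketch (the helper name is illustrative):

```rust
use std::collections::HashSet;
use std::path::PathBuf;
use tantivy::Index;

fn report_corruption(index: &Index) -> tantivy::Result<()> {
    // Paths failing checksum verification come back in the set.
    let damaged: HashSet<PathBuf> = index.validate_checksum()?;
    if !damaged.is_empty() {
        eprintln!("corrupted index files: {:?}", damaged);
    }
    Ok(())
}
```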
@@ -561,15 +582,15 @@ mod tests {

#[test]
fn test_index_exists() {
let directory = RamDirectory::create();
assert!(!Index::exists(&directory).unwrap());
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
assert!(!Index::exists(directory.as_ref()).unwrap());
assert!(Index::create(
directory.clone(),
throw_away_schema(),
IndexSettings::default()
)
.is_ok());
assert!(Index::exists(&directory).unwrap());
assert!(Index::exists(directory.as_ref()).unwrap());
}

#[test]
@@ -582,29 +603,29 @@ mod tests {

#[test]
fn open_or_create_should_open() {
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
assert!(Index::create(
directory.clone(),
throw_away_schema(),
IndexSettings::default()
)
.is_ok());
assert!(Index::exists(&directory).unwrap());
assert!(Index::exists(directory.as_ref()).unwrap());
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
}

#[test]
fn create_should_wipeoff_existing() {
let directory = RamDirectory::create();
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
assert!(Index::create(
directory.clone(),
throw_away_schema(),
IndexSettings::default()
)
.is_ok());
assert!(Index::exists(&directory).unwrap());
assert!(Index::exists(directory.as_ref()).unwrap());
assert!(Index::create(
directory.clone(),
directory,
Schema::builder().build(),
IndexSettings::default()
)
@@ -636,7 +657,7 @@ mod tests {
}

#[test]
fn test_index_on_commit_reload_policy() {
fn test_index_on_commit_reload_policy() -> crate::Result<()> {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_in_ram(schema);
@@ -646,7 +667,7 @@ mod tests {
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &index, &reader);
test_index_on_commit_reload_policy_aux(field, &index, &reader)
}

#[cfg(feature = "mmap")]
@@ -658,7 +679,7 @@ mod tests {
use tempfile::TempDir;

#[test]
fn test_index_on_commit_reload_policy_mmap() {
fn test_index_on_commit_reload_policy_mmap() -> crate::Result<()> {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new().unwrap();
@@ -670,7 +691,7 @@ mod tests {
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &index, &reader);
test_index_on_commit_reload_policy_aux(field, &index, &reader)
}

#[test]
@@ -685,7 +706,7 @@ mod tests {
.reload_policy(ReloadPolicy::Manual)
.try_into()?;
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.add_document(doc!(field=>1u64))?;
let (sender, receiver) = crossbeam::channel::unbounded();
let _handle = index.directory_mut().watch(WatchCallback::new(move || {
let _ = sender.send(());
}));
@@ -699,7 +720,7 @@ mod tests {
}

#[test]
fn test_index_on_commit_reload_policy_different_directories() {
fn test_index_on_commit_reload_policy_different_directories() -> crate::Result<()> {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new().unwrap();
|
||||
@@ -712,10 +733,14 @@ mod tests {
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
|
||||
test_index_on_commit_reload_policy_aux(field, &write_index, &reader)
|
||||
}
|
||||
}
|
||||
fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
|
||||
fn test_index_on_commit_reload_policy_aux(
|
||||
field: Field,
|
||||
index: &Index,
|
||||
reader: &IndexReader,
|
||||
) -> crate::Result<()> {
|
||||
let mut reader_index = reader.index();
|
||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||
let _watch_handle = reader_index
|
||||
@@ -723,9 +748,9 @@ mod tests {
|
||||
.watch(WatchCallback::new(move || {
|
||||
let _ = sender.send(());
|
||||
}));
|
||||
let mut writer = index.writer_for_tests().unwrap();
|
||||
let mut writer = index.writer_for_tests()?;
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
writer.add_document(doc!(field=>1u64));
|
||||
writer.add_document(doc!(field=>1u64))?;
|
||||
writer.commit().unwrap();
|
||||
// We need a loop here because it is possible for notify to send more than
|
||||
// one modify event. It was observed on CI on MacOS.
|
||||
@@ -735,7 +760,7 @@ mod tests {
|
||||
break;
|
||||
}
|
||||
}
|
||||
writer.add_document(doc!(field=>2u64));
|
||||
writer.add_document(doc!(field=>2u64))?;
|
||||
writer.commit().unwrap();
|
||||
// ... Same as above
|
||||
loop {
|
||||
@@ -744,37 +769,37 @@ mod tests {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This test will not pass on windows, because windows
|
||||
// prevent deleting files that are MMapped.
|
||||
#[cfg(not(target_os = "windows"))]
|
||||
#[test]
|
||||
fn garbage_collect_works_as_intended() {
|
||||
fn garbage_collect_works_as_intended() -> crate::Result<()> {
|
||||
let directory = RamDirectory::create();
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create(directory.clone(), schema, IndexSettings::default()).unwrap();
|
||||
let index = Index::create(directory.clone(), schema, IndexSettings::default())?;
|
||||
|
||||
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
|
||||
for i in 0u64..8_000u64 {
|
||||
writer.add_document(doc!(field => i));
|
||||
writer.add_document(doc!(field => i))?;
|
||||
}
|
||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||
let _handle = directory.watch(WatchCallback::new(move || {
|
||||
let _ = sender.send(());
|
||||
}));
|
||||
writer.commit().unwrap();
|
||||
writer.commit()?;
|
||||
let mem_right_after_commit = directory.total_mem_usage();
|
||||
assert!(receiver.recv().is_ok());
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
.try_into()?;
|
||||
|
||||
assert_eq!(reader.searcher().num_docs(), 8_000);
|
||||
writer.wait_merging_threads().unwrap();
|
||||
writer.wait_merging_threads()?;
|
||||
let mem_right_after_merge_finished = directory.total_mem_usage();
|
||||
|
||||
reader.reload().unwrap();
|
||||
@@ -786,5 +811,6 @@ mod tests {
|
||||
mem_right_after_merge_finished,
|
||||
mem_right_after_commit
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
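With `Index::create` and `Index::open` now generic over `T: Into<Box<dyn Directory>>` (backed by the `From<T> for Box<dyn Directory>` impl added later in this diff), call sites can hand over any concrete directory. A minimal sketch of the new call shape and of the reworked `validate_checksum`; the "num_likes" field is illustrative, not part of this diff:

    // Sketch only: assumes the signatures introduced above.
    use tantivy::directory::{Directory, RamDirectory};
    use tantivy::schema::{Schema, INDEXED};
    use tantivy::{Index, IndexSettings};

    fn create_and_check() -> tantivy::Result<()> {
        let mut schema_builder = Schema::builder();
        schema_builder.add_u64_field("num_likes", INDEXED);
        let schema = schema_builder.build();

        // Any concrete directory now converts into `Box<dyn Directory>`.
        let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
        let index = Index::create(directory, schema, IndexSettings::default())?;

        // `validate_checksum` now checks only the managed files that belong
        // to searchable segments, and returns the damaged ones.
        let damaged = index.validate_checksum()?;
        assert!(damaged.is_empty());
        Ok(())
    }
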
@@ -2,7 +2,7 @@ use super::SegmentComponent;
use crate::schema::Schema;
use crate::Opstamp;
use crate::{core::SegmentId, store::Compressor};
use census::{Inventory, TrackedObject};
use crate::{Inventory, TrackedObject};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::{collections::HashSet, sync::atomic::AtomicBool};
@@ -101,6 +101,7 @@ impl SegmentMeta {

/// Returns the list of files that
/// are required for the segment meta.
/// Note: Some of the returned files may not exist depending on the state of the segment.
///
/// This is useful as the way tantivy removes files
/// is by removing all files that have been created by tantivy
@@ -188,6 +189,10 @@ impl SegmentMeta {

#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
assert!(
num_deleted_docs <= self.max_doc(),
"There cannot be more deleted docs than there are docs."
);
let delete_meta = DeleteMeta {
num_deleted_docs,
opstamp,
@@ -369,7 +374,6 @@ mod tests {
schema::{Schema, TEXT},
IndexSettings, IndexSortByField, Order,
};
use serde_json;

#[test]
fn test_serialize_metas() {
@@ -394,7 +398,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
);
}
}

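The expected `meta.json` above now carries a `fieldnorms` flag inside the text indexing options. Assuming the `TextFieldIndexing::set_fieldnorms` setter this flag implies (the setter itself is not shown in this diff), a schema producing that exact options block might look like:

    // Hypothetical sketch: matches the serialized form
    // {"record":"position","fieldnorms":true,"tokenizer":"default"}.
    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

    fn schema_with_fieldnorms() -> Schema {
        let indexing = TextFieldIndexing::default()
            .set_tokenizer("default")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions)
            .set_fieldnorms(true); // assumed setter, inferred from the flag above
        let options = TextOptions::default().set_indexing_options(indexing);
        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("text", options);
        schema_builder.build()
    }
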
@@ -1,6 +1,5 @@
use std::io;

use crate::common::BinarySerializable;
use crate::directory::FileSlice;
use crate::positions::PositionReader;
use crate::postings::TermInfo;
@@ -8,6 +7,7 @@ use crate::postings::{BlockSegmentPostings, SegmentPostings};
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::termdict::TermDictionary;
use common::BinarySerializable;

/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.

@@ -14,9 +14,8 @@ pub use self::index_meta::{
IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory,
};
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::searcher::{Searcher, SearcherGeneration};
pub use self::segment::Segment;
pub use self::segment::SerializableSegment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;

@@ -1,6 +1,5 @@
use crate::collector::Collector;
use crate::core::Executor;

use crate::core::SegmentReader;
use crate::query::Query;
use crate::schema::Document;
@@ -10,9 +9,62 @@ use crate::space_usage::SearcherSpaceUsage;
use crate::store::StoreReader;
use crate::DocAddress;
use crate::Index;
use crate::Opstamp;
use crate::SegmentId;
use crate::TrackedObject;

use std::collections::BTreeMap;
use std::{fmt, io};

/// Identifies the searcher generation accessed by a [Searcher].
///
/// While this might seem redundant, a [SearcherGeneration] contains
/// both a `generation_id` AND a list of `(SegmentId, DeleteOpstamp)`.
///
/// This is on purpose. This object is used by the `Warmer` API.
/// Having both information makes it possible to identify which
/// artifact should be refreshed or garbage collected.
///
/// Depending on the use case, `Warmer`'s implementers can decide to
/// produce artifacts per:
/// - `generation_id` (e.g. some searcher level aggregates)
/// - `(segment_id, delete_opstamp)` (e.g. segment level aggregates)
/// - `segment_id` (e.g. for immutable document level information)
/// - `(generation_id, segment_id)` (e.g. for consistent dynamic column)
/// - ...
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct SearcherGeneration {
segments: BTreeMap<SegmentId, Option<Opstamp>>,
generation_id: u64,
}

impl SearcherGeneration {
pub(crate) fn from_segment_readers(
segment_readers: &[SegmentReader],
generation_id: u64,
) -> Self {
let mut segment_id_to_del_opstamp = BTreeMap::new();
for segment_reader in segment_readers {
segment_id_to_del_opstamp
.insert(segment_reader.segment_id(), segment_reader.delete_opstamp());
}
Self {
segments: segment_id_to_del_opstamp,
generation_id,
}
}

/// Returns the searcher generation id.
pub fn generation_id(&self) -> u64 {
self.generation_id
}

/// Return a `(SegmentId -> DeleteOpstamp)` mapping.
pub fn segments(&self) -> &BTreeMap<SegmentId, Option<Opstamp>> {
&self.segments
}
}

/// Holds a list of `SegmentReader`s ready for search.
///
/// It guarantees that the `Segment` will not be removed before
@@ -23,6 +75,7 @@ pub struct Searcher {
index: Index,
segment_readers: Vec<SegmentReader>,
store_readers: Vec<StoreReader>,
generation: TrackedObject<SearcherGeneration>,
}

impl Searcher {
@@ -31,6 +84,7 @@ impl Searcher {
schema: Schema,
index: Index,
segment_readers: Vec<SegmentReader>,
generation: TrackedObject<SearcherGeneration>,
) -> io::Result<Searcher> {
let store_readers: Vec<StoreReader> = segment_readers
.iter()
@@ -41,6 +95,7 @@ impl Searcher {
index,
segment_readers,
store_readers,
generation,
})
}

@@ -49,6 +104,11 @@ impl Searcher {
&self.index
}

/// [SearcherGeneration] which identifies the version of the snapshot held by this `Searcher`.
pub fn generation(&self) -> &SearcherGeneration {
self.generation.as_ref()
}

/// Fetches a document from tantivy's store given a `DocAddress`.
///
/// The searcher uses the segment ordinal to route the
@@ -88,7 +148,7 @@ impl Searcher {
&self.segment_readers
}

/// Returns the segment_reader associated with the given segment_ordinal
/// Returns the segment_reader associated with the given segment_ord
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
&self.segment_readers[segment_ord as usize]
}

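The `generation()` accessor above is the hook the `Warmer` API keys on: artifacts can be produced and garbage-collected per `(segment_id, delete_opstamp)`. One way an implementer might prune a per-segment cache against the generation currently served (the cache type is illustrative, not part of this diff):

    use std::collections::BTreeMap;
    use tantivy::{Opstamp, Searcher, SegmentId};

    struct PerSegmentCache<T> {
        by_segment: BTreeMap<(SegmentId, Option<Opstamp>), T>,
    }

    impl<T> PerSegmentCache<T> {
        /// Drops artifacts for segments that are no longer part of the
        /// generation this searcher holds.
        fn garbage_collect(&mut self, searcher: &Searcher) {
            let alive = searcher.generation().segments();
            self.by_segment
                .retain(|(seg_id, del_opstamp), _| alive.get(seg_id) == Some(del_opstamp));
        }
    }
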
@@ -1,13 +1,12 @@
use super::SegmentComponent;
use crate::core::Index;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::directory::error::{OpenReadError, OpenWriteError};
use crate::directory::Directory;
use crate::directory::{FileSlice, WritePtr};
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::schema::Schema;
use crate::Opstamp;
use crate::{core::Index, indexer::doc_id_mapping::DocIdMapping};
use std::fmt;
use std::path::PathBuf;

@@ -90,20 +89,3 @@ impl Segment {
Ok(write)
}
}

pub trait SerializableSegment {
/// Writes a view of a segment by pushing information
/// to the `SegmentSerializer`.
///
/// # Returns
/// The number of documents in the segment.
///
/// doc_id_map is used when index is created and sorted, to map to the new doc_id order.
/// It is not used by the `IndexMerger`, since the doc_id_mapping on cross-segments works
/// differently
fn write(
&self,
serializer: SegmentSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<u32>;
}

@@ -22,7 +22,7 @@ use std::sync::atomic;
pub struct SegmentId(Uuid);

#[cfg(test)]
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(|| atomic::AtomicUsize::default());
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(atomic::AtomicUsize::default);

#[cfg(test)]
const ZERO_ARRAY: [u8; 8] = [0u8; 8];
@@ -108,6 +108,12 @@ impl fmt::Debug for SegmentId {
}
}

impl fmt::Display for SegmentId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Seg({:?})", self.short_uuid_string())
}
}

impl PartialOrd for SegmentId {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))

@@ -1,10 +1,12 @@
use crate::common::HasLen;
use crate::core::InvertedIndexReader;
use crate::core::Segment;
use crate::core::SegmentComponent;
use crate::core::SegmentId;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::fastfield::DeleteBitSet;
use crate::error::DataCorruption;
use crate::fastfield::intersect_alive_bitsets;
use crate::fastfield::AliveBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
@@ -15,7 +17,7 @@ use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermDictionary;
use crate::DocId;
use crate::{common::CompositeFile, error::DataCorruption};
use crate::Opstamp;
use fail::fail_point;
use std::fmt;
use std::sync::Arc;
@@ -32,14 +34,13 @@ use std::{collections::HashMap, io};
///
/// The segment reader has a very low memory footprint,
/// as close to all of the memory data is mmapped.
///
///
/// TODO fix not decoding docfreq
#[derive(Clone)]
pub struct SegmentReader {
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,

segment_id: SegmentId,
delete_opstamp: Option<Opstamp>,

max_doc: DocId,
num_docs: DocId,

@@ -50,24 +51,19 @@ pub struct SegmentReader {
fieldnorm_readers: FieldNormReaders,

store_file: FileSlice,
delete_bitset_opt: Option<DeleteBitSet>,
alive_bitset_opt: Option<AliveBitSet>,
schema: Schema,
}

impl SegmentReader {
/// Returns the highest document id ever attributed in
/// this segment + 1.
/// Today, `tantivy` does not handle deletes, so it happens
/// to also be the number of documents in the index.
pub fn max_doc(&self) -> DocId {
self.max_doc
}

/// Returns the number of documents.
/// Returns the number of alive documents.
/// Deleted documents are not counted.
///
/// Today, `tantivy` does not handle deletes so max doc and
/// num_docs are the same.
pub fn num_docs(&self) -> DocId {
self.num_docs
}
@@ -80,14 +76,12 @@ impl SegmentReader {
/// Return the number of documents that have been
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
self.delete_bitset()
.map(|delete_set| delete_set.len() as DocId)
.unwrap_or(0u32)
self.max_doc - self.num_docs
}

/// Returns true iff some of the documents of the segment have been deleted.
pub fn has_deletes(&self) -> bool {
self.delete_bitset().is_some()
self.num_deleted_docs() > 0
}

/// Accessor to a segment's fast field reader given a field.
@@ -109,7 +103,7 @@ impl SegmentReader {
let field_entry = self.schema.get_field_entry(field);

match field_entry.field_type() {
FieldType::HierarchicalFacet(_) => {
FieldType::Facet(_) => {
let term_ords_reader = self.fast_fields().u64s(field)?;
let termdict = self
.termdict_composite
@@ -136,13 +130,17 @@ impl SegmentReader {
self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
let field_name = self.schema.get_field_name(field);
let err_msg = format!(
"Field norm not found for field {:?}. Was it marked as indexed during indexing?",
"Field norm not found for field {:?}. Was the field set to record norm during indexing?",
field_name
);
crate::TantivyError::SchemaError(err_msg)
})
}

pub(crate) fn fieldnorms_readers(&self) -> &FieldNormReaders {
&self.fieldnorm_readers
}

/// Accessor to the segment's `StoreReader`.
pub fn get_store_reader(&self) -> io::Result<StoreReader> {
StoreReader::open(self.store_file.clone())
@@ -150,6 +148,14 @@ impl SegmentReader {

/// Open a new segment for reading.
pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
Self::open_with_custom_alive_set(segment, None)
}

/// Open a new segment for reading.
pub fn open_with_custom_alive_set(
segment: &Segment,
custom_bitset: Option<AliveBitSet>,
) -> crate::Result<SegmentReader> {
let termdict_file = segment.open_read(SegmentComponent::Terms)?;
let termdict_composite = CompositeFile::open(&termdict_file)?;

@@ -174,29 +180,37 @@ impl SegmentReader {
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_field_readers =
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));

let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;

let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::Delete)?;
let delete_bitset = DeleteBitSet::open(delete_data)?;
Some(delete_bitset)
let original_bitset = if segment.meta().has_deletes() {
let delete_file_slice = segment.open_read(SegmentComponent::Delete)?;
let delete_data = delete_file_slice.read_bytes()?;
Some(AliveBitSet::open(delete_data))
} else {
None
};

let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset);

let max_doc = segment.meta().max_doc();
let num_docs = alive_bitset_opt
.as_ref()
.map(|alive_bitset| alive_bitset.num_alive_docs() as u32)
.unwrap_or(max_doc);

Ok(SegmentReader {
inv_idx_reader_cache: Default::default(),
max_doc: segment.meta().max_doc(),
num_docs: segment.meta().num_docs(),
num_docs,
max_doc,
termdict_composite,
postings_composite,
fast_fields_readers: fast_field_readers,
fieldnorm_readers,
segment_id: segment.id(),
delete_opstamp: segment.meta().delete_opstamp(),
store_file,
delete_bitset_opt,
alive_bitset_opt,
positions_composite,
schema,
})
@@ -280,23 +294,32 @@ impl SegmentReader {
self.segment_id
}

/// Returns the delete opstamp
pub fn delete_opstamp(&self) -> Option<Opstamp> {
self.delete_opstamp
}

/// Returns the bitset representing
/// the documents that have been deleted.
pub fn delete_bitset(&self) -> Option<&DeleteBitSet> {
self.delete_bitset_opt.as_ref()
pub fn alive_bitset(&self) -> Option<&AliveBitSet> {
self.alive_bitset_opt.as_ref()
}

/// Returns true iff the `doc` is marked
/// as deleted.
pub fn is_deleted(&self, doc: DocId) -> bool {
self.delete_bitset()
self.alive_bitset()
.map(|delete_set| delete_set.is_deleted(doc))
.unwrap_or(false)
}

/// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
if let Some(alive_bitset) = &self.alive_bitset_opt {
Box::new(alive_bitset.iter_alive())
} else {
Box::new(0u32..self.max_doc)
}
}

/// Summarize total space usage of this segment.
@@ -309,14 +332,29 @@ impl SegmentReader {
self.fast_fields_readers.space_usage(),
self.fieldnorm_readers.space_usage(),
self.get_store_reader()?.space_usage(),
self.delete_bitset_opt
self.alive_bitset_opt
.as_ref()
.map(DeleteBitSet::space_usage)
.map(AliveBitSet::space_usage)
.unwrap_or(0),
))
}
}

fn intersect_alive_bitset(
left_opt: Option<AliveBitSet>,
right_opt: Option<AliveBitSet>,
) -> Option<AliveBitSet> {
match (left_opt, right_opt) {
(Some(left), Some(right)) => {
assert_eq!(left.bitset().max_value(), right.bitset().max_value());
Some(intersect_alive_bitsets(left, right))
}
(Some(left), None) => Some(left),
(None, Some(right)) => Some(right),
(None, None) => None,
}
}

impl fmt::Debug for SegmentReader {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "SegmentReader({:?})", self.segment_id)
@@ -329,6 +367,32 @@ mod test {
use crate::schema::{Schema, Term, STORED, TEXT};
use crate::DocId;

#[test]
fn test_num_alive() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("name", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let name = schema.get_field("name").unwrap();

{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?;
index_writer.add_document(doc!(name => "cap"))?;
// we should now have one segment with two docs
index_writer.delete_term(Term::from_field_text(name, "horse"));
index_writer.delete_term(Term::from_field_text(name, "cap"));

// ok, now we should have a deleted doc
index_writer.commit()?;
}
let searcher = index.reader()?.searcher();
assert_eq!(2, searcher.segment_reader(0).num_docs());
assert_eq!(4, searcher.segment_reader(0).max_doc());
Ok(())
}
#[test]
fn test_alive_docs_iterator() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
@@ -339,10 +403,10 @@ mod test {

{
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"));
index_writer.add_document(doc!(name => "tantivy"))?;
index_writer.add_document(doc!(name => "horse"))?;
index_writer.add_document(doc!(name => "jockey"))?;
index_writer.add_document(doc!(name => "cap"))?;
// we should now have one segment with two docs
index_writer.commit()?;
}

|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::common::VInt;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::{TerminatingWrite, WritePtr};
|
||||
use crate::schema::Field;
|
||||
use crate::space_usage::FieldUsage;
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use common::HasLen;
|
||||
use common::VInt;
|
||||
use std::collections::HashMap;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::iter::ExactSizeIterator;
|
||||
use std::ops::Range;
|
||||
|
||||
use super::HasLen;
|
||||
|
||||
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
|
||||
pub struct FileAddr {
|
||||
field: Field,
|
||||
@@ -188,10 +187,10 @@ impl CompositeFile {
|
||||
mod test {
|
||||
|
||||
use super::{CompositeFile, CompositeWrite};
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::VInt;
|
||||
use crate::directory::{Directory, RamDirectory};
|
||||
use crate::schema::Field;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
@@ -43,10 +43,8 @@ impl RetryPolicy {
}

/// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](struct.LockType.html)
///
/// It is transparently associated to a lock file, that gets deleted
/// on `Drop.` The lock is released automatically on `Drop`.
/// It is associated to a lock file, that gets deleted on `Drop.`
pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);

struct DirectoryLockGuard {
@@ -142,10 +140,16 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// Opens a writer for the *virtual file* associated with
/// a Path.
///
/// Right after this call, the file should be created
/// and any subsequent call to `open_read` for the
/// Right after this call, for the span of the execution of the program
/// the file should be created and any subsequent call to `open_read` for the
/// same path should return a `FileSlice`.
///
/// However, depending on the directory implementation,
/// it might be required to call `sync_directory` to ensure
/// that the file is durably created.
/// (The semantics here are the same when dealing with
/// a posix filesystem.)
///
/// Write operations may be aggressively buffered.
/// The client of this trait is responsible for calling flush
/// to ensure that subsequent `read` operations
@@ -176,6 +180,12 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// The file may or may not previously exist.
fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()>;

/// Sync the directory.
///
/// This call is required to ensure that newly created files are
/// effectively stored durably.
fn sync_directory(&self) -> io::Result<()>;

/// Acquire a lock in the given directory.
///
/// The method is blocking or not depending on the `Lock` object.
@@ -230,3 +240,15 @@ where
Box::new(self.clone())
}
}

impl Clone for Box<dyn Directory> {
fn clone(&self) -> Self {
self.box_clone()
}
}

impl<T: Directory + 'static> From<T> for Box<dyn Directory> {
fn from(t: T) -> Self {
Box::new(t)
}
}

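The two impls just added are what make the `Index::create`/`Index::open` generics earlier in this diff ergonomic: any `T: Directory` converts into `Box<dyn Directory>`, and the box itself stays cloneable:

    use tantivy::directory::{Directory, RamDirectory};

    fn boxed_directory() -> Box<dyn Directory> {
        // The blanket `From<T>` impl performs the boxing.
        let boxed: Box<dyn Directory> = RamDirectory::create().into();
        // `Clone for Box<dyn Directory>` dispatches to `box_clone`.
        let _copy = boxed.clone();
        boxed
    }
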
@@ -7,8 +7,8 @@ use std::path::PathBuf;
/// [`LockParams`](./enum.LockParams.html).
/// Tantivy itself uses only two locks but client application
/// can use the directory facility to define their own locks.
/// - [INDEX_WRITER_LOCK](./struct.INDEX_WRITER_LOCK.html)
/// - [META_LOCK](./struct.META_LOCK.html)
/// - [INDEX_WRITER_LOCK]
/// - [META_LOCK]
///
/// Check out these locks documentation for more information.
///

@@ -39,6 +39,16 @@ pub enum OpenDirectoryError {
},
}

impl OpenDirectoryError {
/// Wraps an io error.
pub fn wrap_io_error(io_error: io::Error, directory_path: PathBuf) -> Self {
Self::IoError {
io_error,
directory_path,
}
}
}

/// Error that may occur when starting to write in a file
#[derive(Debug, Error)]
pub enum OpenWriteError {

@@ -1,7 +1,7 @@
use stable_deref_trait::StableDeref;

use crate::common::HasLen;
use crate::directory::OwnedBytes;
use common::HasLen;
use std::fmt;
use std::ops::Range;
use std::sync::{Arc, Weak};
@@ -32,12 +32,6 @@ impl FileHandle for &'static [u8] {
}
}

impl<T: Deref<Target = [u8]>> HasLen for T {
fn len(&self) -> usize {
self.deref().len()
}
}

impl<B> From<B> for FileSlice
where
B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync,
@@ -72,6 +66,7 @@ impl FileSlice {

/// Wraps a FileHandle.
#[doc(hidden)]
#[must_use]
pub fn new_with_num_bytes(file_handle: Box<dyn FileHandle>, num_bytes: usize) -> Self {
FileSlice {
data: Arc::from(file_handle),
@@ -178,7 +173,7 @@ impl HasLen for FileSlice {
#[cfg(test)]
mod tests {
use super::{FileHandle, FileSlice};
use crate::common::HasLen;
use common::HasLen;
use std::io;

#[test]
@@ -211,7 +206,7 @@ mod tests {
assert_eq!(right.read_bytes()?.as_slice(), b"");
}
{
let (left, right) = file_slice.clone().split_from_end(2);
let (left, right) = file_slice.split_from_end(2);
assert_eq!(left.read_bytes()?.as_slice(), b"abcd");
assert_eq!(right.read_bytes()?.as_slice(), b"ef");
}

@@ -43,14 +43,16 @@ impl FileWatcher {
thread::Builder::new()
.name("thread-tantivy-meta-file-watcher".to_string())
.spawn(move || {
let mut current_checksum = None;
let mut current_checksum_opt = None;

while state.load(Ordering::SeqCst) == 1 {
if let Ok(checksum) = FileWatcher::compute_checksum(&path) {
// `None.unwrap_or_else(|| !checksum) != checksum` evaluates to `true`
if current_checksum.unwrap_or_else(|| !checksum) != checksum {
let metafile_has_changed = current_checksum_opt
.map(|current_checksum| current_checksum != checksum)
.unwrap_or(true);
if metafile_has_changed {
info!("Meta file {:?} was modified", path);
current_checksum = Some(checksum);
current_checksum_opt = Some(checksum);
futures::executor::block_on(callbacks.broadcast());
}
}

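The rewritten watcher loop replaces the obscure `unwrap_or_else(|| !checksum)` trick with an explicit map/unwrap_or. The same change-detection idiom in isolation:

    // "Changed" is true on the first observation and whenever the
    // checksum differs from the previously recorded one.
    fn has_changed(current_checksum_opt: &mut Option<u32>, checksum: u32) -> bool {
        let changed = current_checksum_opt
            .map(|current_checksum| current_checksum != checksum)
            .unwrap_or(true);
        if changed {
            *current_checksum_opt = Some(checksum);
        }
        changed
    }
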
@@ -1,10 +1,10 @@
use crate::directory::error::Incompatibility;
use crate::directory::FileSlice;
use crate::{
common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen},
directory::{AntiCallToken, TerminatingWrite},
Version, INDEX_FORMAT_VERSION,
};
use common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen};
use crc32fast::Hasher;
use serde::{Deserialize, Serialize};
use std::io;
@@ -156,10 +156,8 @@ mod tests {

use crate::directory::footer::Footer;
use crate::directory::OwnedBytes;
use crate::{
common::BinarySerializable,
directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice},
};
use crate::directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice};
use common::BinarySerializable;
use std::io;

#[test]

@@ -1,4 +1,4 @@
use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
use crate::core::MANAGED_FILEPATH;
use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::GarbageCollectionResult;
@@ -64,7 +64,7 @@ fn save_managed_paths(

impl ManagedDirectory {
/// Wraps a directory as managed directory.
pub fn wrap<Dir: Directory>(directory: Dir) -> crate::Result<ManagedDirectory> {
pub fn wrap(directory: Box<dyn Directory>) -> crate::Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
@@ -76,14 +76,14 @@ impl ManagedDirectory {
)
})?;
Ok(ManagedDirectory {
directory: Box::new(directory),
directory,
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
})),
})
}
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
directory: Box::new(directory),
directory,
meta_informations: Arc::default(),
}),
io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()),
@@ -192,6 +192,7 @@ impl ManagedDirectory {
for delete_file in &deleted_files {
managed_paths_write.remove(delete_file);
}
self.directory.sync_directory()?;
save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?;
}

@@ -222,9 +223,22 @@ impl ManagedDirectory {
.write()
.expect("Managed file lock poisoned");
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
if has_changed {
save_managed_paths(self.directory.as_ref(), &meta_wlock)?;
if !has_changed {
return Ok(());
}
save_managed_paths(self.directory.as_ref(), &meta_wlock)?;
// This is not the first file we add.
// Therefore, we are sure that `.managed.json` has been already
// properly created and we do not need to sync its parent directory.
//
// (It might seem like a nicer solution to create the managed_json on the
// creation of the ManagedDirectory instance but it would actually
// prevent the use of read-only directories..)
let managed_file_definitely_already_exists = meta_wlock.managed_paths.len() > 1;
if managed_file_definitely_already_exists {
return Ok(());
}
self.directory.sync_directory()?;
Ok(())
}

@@ -248,24 +262,15 @@ impl ManagedDirectory {
Ok(footer.crc() == crc)
}

/// List files for which checksum does not match content
pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
let mut managed_paths = self
/// List all managed files
pub fn list_managed_files(&self) -> HashSet<PathBuf> {
let managed_paths = self
.meta_informations
.read()
.expect("Managed directory rlock poisoned in list damaged.")
.managed_paths
.clone();

managed_paths.remove(*META_FILEPATH);

let mut damaged_files = HashSet::new();
for path in managed_paths {
if !self.validate_checksum(&path)? {
damaged_files.insert(path);
}
}
Ok(damaged_files)
managed_paths
}
}

@@ -319,6 +324,11 @@ impl Directory for ManagedDirectory {
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
self.directory.watch(watch_callback)
}

fn sync_directory(&self) -> io::Result<()> {
self.directory.sync_directory()?;
Ok(())
}
}

impl Clone for ManagedDirectory {
@@ -336,7 +346,6 @@ mod tests_mmap_specific {

use crate::directory::{Directory, ManagedDirectory, MmapDirectory, TerminatingWrite};
use std::collections::HashSet;
use std::fs::OpenOptions;
use std::io::Write;
use std::path::{Path, PathBuf};
use tempfile::TempDir;
@@ -350,7 +359,7 @@ mod tests_mmap_specific {
let test_path2: &'static Path = Path::new("some_path_for_test_2");
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
let write_file = managed_directory.open_write(test_path1).unwrap();
write_file.terminate().unwrap();
managed_directory
@@ -365,7 +374,7 @@ mod tests_mmap_specific {
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
assert!(managed_directory.exists(test_path1).unwrap());
assert!(!managed_directory.exists(test_path2).unwrap());
let living_files: HashSet<PathBuf> = HashSet::new();
@@ -384,7 +393,7 @@ mod tests_mmap_specific {
let living_files = HashSet::new();

let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
let mut write = managed_directory.open_write(test_path1).unwrap();
write.write_all(&[0u8, 1u8]).unwrap();
write.terminate().unwrap();
@@ -402,44 +411,7 @@ mod tests_mmap_specific {
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
assert!(managed_directory.garbage_collect(|| living_files).is_ok());
assert!(!managed_directory.exists(test_path1).unwrap());
} else {
assert!(!managed_directory.exists(test_path1).unwrap());
}
}

#[test]
fn test_checksum() -> crate::Result<()> {
let test_path1: &'static Path = Path::new("some_path_for_test");
let test_path2: &'static Path = Path::new("other_test_path");

let tempdir = TempDir::new().unwrap();
let tempdir_path = PathBuf::from(tempdir.path());

let mmap_directory = MmapDirectory::open(&tempdir_path)?;
let managed_directory = ManagedDirectory::wrap(mmap_directory)?;
let mut write = managed_directory.open_write(test_path1)?;
write.write_all(&[0u8, 1u8])?;
write.terminate()?;

let mut write = managed_directory.open_write(test_path2)?;
write.write_all(&[3u8, 4u8, 5u8])?;
write.terminate()?;

let read_file = managed_directory.open_read(test_path2)?.read_bytes()?;
assert_eq!(read_file.as_slice(), &[3u8, 4u8, 5u8]);
assert!(managed_directory.list_damaged().unwrap().is_empty());

let mut corrupted_path = tempdir_path.clone();
corrupted_path.push(test_path2);
let mut file = OpenOptions::new().write(true).open(&corrupted_path)?;
file.write_all(&[255u8])?;
file.flush()?;
drop(file);

let damaged = managed_directory.list_damaged()?;
assert_eq!(damaged.len(), 1);
assert!(damaged.contains(test_path2));
Ok(())
assert!(!managed_directory.exists(test_path1).unwrap());
}
}

@@ -11,7 +11,7 @@ use crate::directory::{AntiCallToken, FileHandle, OwnedBytes};
use crate::directory::{ArcBytes, WeakArcBytes};
use crate::directory::{TerminatingWrite, WritePtr};
use fs2::FileExt;
use memmap::Mmap;
use memmap2::Mmap;
use serde::{Deserialize, Serialize};
use stable_deref_trait::StableDeref;
use std::convert::From;
@@ -53,7 +53,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
return Ok(None);
}
unsafe {
memmap::Mmap::map(&file)
memmap2::Mmap::map(&file)
.map(Some)
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_path_buf()))
}
@@ -74,20 +74,12 @@ pub struct CacheInfo {
pub mmapped: Vec<PathBuf>,
}

#[derive(Default)]
struct MmapCache {
counters: CacheCounters,
cache: HashMap<PathBuf, WeakArcBytes>,
}

impl Default for MmapCache {
fn default() -> MmapCache {
MmapCache {
counters: CacheCounters::default(),
cache: HashMap::new(),
}
}
}

impl MmapCache {
fn get_info(&self) -> CacheInfo {
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
@@ -201,16 +193,19 @@ impl MmapDirectory {
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
let directory_path: &Path = directory_path.as_ref();
if !directory_path.exists() {
Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
return Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
directory_path,
)))
} else if !directory_path.is_dir() {
Err(OpenDirectoryError::NotADirectory(PathBuf::from(
directory_path,
)))
} else {
Ok(MmapDirectory::new(PathBuf::from(directory_path), None))
)));
}
let canonical_path: PathBuf = directory_path.canonicalize().map_err(|io_err| {
OpenDirectoryError::wrap_io_error(io_err, PathBuf::from(directory_path))
})?;
if !canonical_path.is_dir() {
return Err(OpenDirectoryError::NotADirectory(PathBuf::from(
directory_path,
)));
}
Ok(MmapDirectory::new(canonical_path, None))
}

/// Joins a relative_path to the directory `root_path`
@@ -219,33 +214,6 @@ impl MmapDirectory {
self.inner.root_path.join(relative_path)
}

/// Sync the root directory.
/// In certain FS, this is required to persistently create
/// a file.
fn sync_directory(&self) -> Result<(), io::Error> {
let mut open_opts = OpenOptions::new();

// Linux needs read to be set, otherwise returns EINVAL
// write must not be set, or it fails with EISDIR
open_opts.read(true);

// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
// and calling sync_all() only works if write access is requested.
#[cfg(windows)]
{
use std::os::windows::fs::OpenOptionsExt;
use winapi::um::winbase;

open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}

let fd = open_opts.open(&self.inner.root_path)?;
fd.sync_all()?;
Ok(())
}

/// Returns some statistical information
/// about the Mmap cache.
///
@@ -296,8 +264,7 @@ impl Write for SafeFileWriter {
}

fn flush(&mut self) -> io::Result<()> {
self.0.flush()?;
self.0.sync_all()
Ok(())
}
}

@@ -309,7 +276,9 @@ impl Seek for SafeFileWriter {

impl TerminatingWrite for SafeFileWriter {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
self.flush()
self.0.flush()?;
self.0.sync_data()?;
Ok(())
}
}

@@ -339,6 +308,7 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
let mut tempfile = tempfile::Builder::new().tempfile_in(&parent_path)?;
tempfile.write_all(content)?;
tempfile.flush()?;
tempfile.as_file_mut().sync_data()?;
tempfile.into_temp_path().persist(path)?;
Ok(())
}
@@ -373,22 +343,17 @@ impl Directory for MmapDirectory {
/// removed before the file is deleted.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
let full_path = self.resolve_path(path);
match fs::remove_file(&full_path) {
Ok(_) => self.sync_directory().map_err(|e| DeleteError::IoError {
io_error: e,
filepath: path.to_path_buf(),
}),
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
} else {
Err(DeleteError::IoError {
io_error: e,
filepath: path.to_path_buf(),
})
fs::remove_file(&full_path).map_err(|e| {
if e.kind() == io::ErrorKind::NotFound {
DeleteError::FileDoesNotExist(path.to_owned())
} else {
DeleteError::IoError {
io_error: e,
filepath: path.to_path_buf(),
}
}
}
})?;
Ok(())
}

fn exists(&self, path: &Path) -> Result<bool, OpenReadError> {
@@ -417,10 +382,13 @@ impl Directory for MmapDirectory {
file.flush()
.map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?;

// Apparetntly, on some filesystem syncing the parent
// directory is required.
self.sync_directory()
.map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?;
// Note we actually do not sync the parent directory here.
//
// A newly created file, may, in some case, be created and even flushed to disk.
// and then lost...
//
// The file will only be durably written after we terminate AND
// sync_directory() is called.

let writer = SafeFileWriter::new(file);
Ok(BufWriter::new(Box::new(writer)))
@@ -450,7 +418,7 @@ impl Directory for MmapDirectory {
debug!("Atomic Write {:?}", path);
let full_path = self.resolve_path(path);
atomic_write(&full_path, content)?;
self.sync_directory()
Ok(())
}

fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
@@ -476,6 +444,30 @@ impl Directory for MmapDirectory {
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
Ok(self.inner.watch(watch_callback))
}

fn sync_directory(&self) -> Result<(), io::Error> {
let mut open_opts = OpenOptions::new();

// Linux needs read to be set, otherwise returns EINVAL
// write must not be set, or it fails with EISDIR
open_opts.read(true);

// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
// and calling sync_all() only works if write access is requested.
#[cfg(windows)]
{
use std::os::windows::fs::OpenOptionsExt;
use winapi::um::winbase;

open_opts
.write(true)
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}

let fd = open_opts.open(&self.inner.root_path)?;
fd.sync_data()?;
Ok(())
}
}

#[cfg(test)]
@@ -485,13 +477,14 @@ mod tests {
// The following tests are specific to the MmapDirectory

use super::*;
use crate::indexer::LogMergePolicy;
use crate::Index;
use crate::ReloadPolicy;
use crate::{common::HasLen, indexer::LogMergePolicy};
use crate::{
schema::{Schema, SchemaBuilder, TEXT},
IndexSettings,
};
use common::HasLen;

#[test]
fn test_open_non_existent_path() {
@@ -581,8 +574,8 @@ mod tests {
}

#[test]
fn test_mmap_released() {
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
fn test_mmap_released() -> crate::Result<()> {
let mmap_directory = MmapDirectory::create_from_tempdir()?;
let mut schema_builder: SchemaBuilder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
@@ -591,31 +584,30 @@ mod tests {
let index =
Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();

let mut index_writer = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests()?;
let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_num_segments(3);
index_writer.set_merge_policy(Box::new(log_merge_policy));
for _num_commits in 0..10 {
for _ in 0..10 {
index_writer.add_document(doc!(text_field=>"abc"));
index_writer.add_document(doc!(text_field=>"abc"))?;
}
index_writer.commit().unwrap();
index_writer.commit()?;
}

let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
.try_into()?;

for _ in 0..4 {
index_writer.add_document(doc!(text_field=>"abc"));
index_writer.commit().unwrap();
reader.reload().unwrap();
index_writer.add_document(doc!(text_field=>"abc"))?;
index_writer.commit()?;
reader.reload()?;
}
index_writer.wait_merging_threads().unwrap();
index_writer.wait_merging_threads()?;

reader.reload().unwrap();
reader.reload()?;
let num_segments = reader.searcher().segment_readers().len();
assert!(num_segments <= 4);
let num_components_except_deletes_and_tempstore =
@@ -626,5 +618,6 @@ mod tests {
);
}
assert!(mmap_directory.get_cache_info().mmapped.is_empty());
Ok(())
}
}

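The `sync_data` call added to `atomic_write` above completes the usual durable-write recipe: write to a tempfile in the destination's directory, flush, fsync the data, then rename over the target. A sketch of the pattern in isolation, using the `tempfile` crate as the diff does:

    use std::io::Write;
    use std::path::Path;

    fn atomic_write_sketch(path: &Path, content: &[u8]) -> std::io::Result<()> {
        let parent = path.parent().expect("path must have a parent directory");
        let mut tmp = tempfile::Builder::new().tempfile_in(parent)?;
        tmp.write_all(content)?;
        tmp.flush()?;
        tmp.as_file_mut().sync_data()?; // the fsync this diff adds
        tmp.into_temp_path().persist(path)?; // atomic rename over `path`
        Ok(())
    }
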
@@ -1,6 +1,6 @@
/*!

WORM directory abstraction.
WORM (Write Once Read Many) directory abstraction.

*/

@@ -20,6 +20,9 @@ mod watch_event_router;
/// Errors specific to the directory module.
pub mod error;

mod composite_file;

pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::directory::DirectoryLock;
pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
@@ -28,7 +31,9 @@ pub use self::file_slice::{FileHandle, FileSlice};
pub use self::owned_bytes::OwnedBytes;
pub use self::ram_directory::RamDirectory;
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
use std::io::{self, BufWriter, Write};
pub use common::AntiCallToken;
pub use common::TerminatingWrite;
use std::io::BufWriter;
use std::path::PathBuf;

/// Outcome of the Garbage collection
@@ -50,47 +55,6 @@ pub use self::mmap_directory::MmapDirectory;

pub use self::managed_directory::ManagedDirectory;

/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
///
/// The point is that while the type is public, it cannot be built by anyone
/// outside of this module.
pub struct AntiCallToken(());

/// Trait used to indicate when no more write need to be done on a writer
pub trait TerminatingWrite: Write {
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
fn terminate(mut self) -> io::Result<()>
where
Self: Sized,
{
self.terminate_ref(AntiCallToken(()))
}

/// You should implement this function to define custom behavior.
/// This function should flush any buffer it may hold.
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>;
}

impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> {
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
self.as_mut().terminate_ref(token)
}
}

impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> {
self.flush()?;
self.get_mut().terminate_ref(a)
}
}

#[cfg(test)]
impl<'a> TerminatingWrite for &'a mut Vec<u8> {
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
self.flush()
}
}

/// Write object for Directory.
///
/// `WritePtr` are required to implement both Write

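`AntiCallToken` and `TerminatingWrite` now live in the shared `common` crate and are merely re-exported here; what the trait asks of an implementor is unchanged. A minimal sketch against the re-exported trait:

    use std::io::{self, Write};
    use tantivy::directory::{AntiCallToken, TerminatingWrite};

    struct CountingSink(u64);

    impl Write for CountingSink {
        fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
            self.0 += buf.len() as u64;
            Ok(buf.len())
        }
        fn flush(&mut self) -> io::Result<()> {
            Ok(())
        }
    }

    impl TerminatingWrite for CountingSink {
        // `terminate` (with the private token) funnels into this; flush
        // everything the writer still buffers.
        fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
            self.flush()
        }
    }
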
@@ -1,290 +1,11 @@
use crate::directory::FileHandle;
use stable_deref_trait::StableDeref;
use std::convert::TryInto;
use std::mem;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};
use std::io;
use std::ops::Range;

/// An OwnedBytes simply wraps an object that owns a slice of data and exposes
/// this data as a static slice.
///
/// The backing object is required to be `StableDeref`.
#[derive(Clone)]
pub struct OwnedBytes {
    data: &'static [u8],
    box_stable_deref: Arc<dyn Deref<Target = [u8]> + Sync + Send>,
}
pub use ownedbytes::OwnedBytes;

impl FileHandle for OwnedBytes {
    fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
        Ok(self.slice(range))
    }
}

impl OwnedBytes {
    /// Creates an empty `OwnedBytes`.
    pub fn empty() -> OwnedBytes {
        OwnedBytes::new(&[][..])
    }

    /// Creates an `OwnedBytes` instance given a `StableDeref` object.
    pub fn new<T: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync>(
        data_holder: T,
    ) -> OwnedBytes {
        let box_stable_deref = Arc::new(data_holder);
        let bytes: &[u8] = box_stable_deref.as_ref();
        let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
        OwnedBytes {
            data,
            box_stable_deref,
        }
    }

    /// Creates an `OwnedBytes` that is just a view over a slice of the data.
    pub fn slice(&self, range: Range<usize>) -> Self {
        OwnedBytes {
            data: &self.data[range],
            box_stable_deref: self.box_stable_deref.clone(),
        }
    }

    /// Returns the underlying slice of data.
    /// `Deref` and `AsRef` are also available.
    #[inline]
    pub fn as_slice(&self) -> &[u8] {
        self.data
    }

    /// Returns the len of the slice.
    #[inline]
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
    ///
    /// Left will hold `split_len` bytes.
    ///
    /// This operation is cheap and does not require copying any memory.
    /// On the other hand, both `left` and `right` retain a handle over
    /// the entire slice of memory. In other words, the memory will only
    /// be released when both left and right are dropped.
    pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
        let right_box_stable_deref = self.box_stable_deref.clone();
        let left = OwnedBytes {
            data: &self.data[..split_len],
            box_stable_deref: self.box_stable_deref,
        };
        let right = OwnedBytes {
            data: &self.data[split_len..],
            box_stable_deref: right_box_stable_deref,
        };
        (left, right)
    }

    /// Returns true iff this `OwnedBytes` is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.as_slice().is_empty()
    }

    /// Drops the leftmost `advance_len` bytes.
    ///
    /// See also [`clip`](#method.clip).
    #[inline]
    pub fn advance(&mut self, advance_len: usize) {
        self.data = &self.data[advance_len..]
    }

    /// Reads a `u8` from the `OwnedBytes` and advances by one byte.
    pub fn read_u8(&mut self) -> u8 {
        assert!(!self.is_empty());

        let byte = self.as_slice()[0];
        self.advance(1);
        byte
    }

    /// Reads a `u64` encoded as little-endian from the `OwnedBytes` and advances by 8 bytes.
    pub fn read_u64(&mut self) -> u64 {
        assert!(self.len() > 7);

        let octet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
        self.advance(8);
        u64::from_le_bytes(octet)
    }
}
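A short usage sketch of the API above (assuming the `OwnedBytes` methods exactly as shown), demonstrating zero-copy slicing and the cursor-style `read_u64` call:

fn demo_owned_bytes() {
    let bytes = OwnedBytes::new(vec![1u8, 0, 0, 0, 0, 0, 0, 0, 42]);
    // `split` is cheap: both halves share the same Arc'ed backing buffer.
    let (mut header, rest) = bytes.split(8);
    assert_eq!(header.read_u64(), 1u64); // little-endian
    assert!(header.is_empty());
    assert_eq!(rest.as_slice(), &[42u8]);
}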
impl fmt::Debug for OwnedBytes {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // We truncate the bytes in order to make sure the debug string
        // is not too long.
        let bytes_truncated: &[u8] = if self.len() > 10 {
            &self.as_slice()[..10]
        } else {
            self.as_slice()
        };
        write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
    }
}

impl Deref for OwnedBytes {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        self.as_slice()
    }
}

impl io::Read for OwnedBytes {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let read_len = {
            let data = self.as_slice();
            if data.len() >= buf.len() {
                let buf_len = buf.len();
                buf.copy_from_slice(&data[..buf_len]);
                buf.len()
            } else {
                let data_len = data.len();
                buf[..data_len].copy_from_slice(data);
                data_len
            }
        };
        self.advance(read_len);
        Ok(read_len)
    }
    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
        let read_len = {
            let data = self.as_slice();
            buf.extend(data);
            data.len()
        };
        self.advance(read_len);
        Ok(read_len)
    }
    fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
        let read_len = self.read(buf)?;
        if read_len != buf.len() {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "failed to fill whole buffer",
            ));
        }
        Ok(())
    }
}

impl AsRef<[u8]> for OwnedBytes {
    fn as_ref(&self) -> &[u8] {
        self.as_slice()
    }
}

#[cfg(test)]
mod tests {
    use std::io::{self, Read};

    use super::OwnedBytes;

    #[test]
    fn test_owned_bytes_debug() {
        let short_bytes = OwnedBytes::new(b"abcd".as_ref());
        assert_eq!(
            format!("{:?}", short_bytes),
            "OwnedBytes([97, 98, 99, 100], len=4)"
        );
        let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
        assert_eq!(
            format!("{:?}", long_bytes),
            "OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
        );
    }

    #[test]
    fn test_owned_bytes_read() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcdefghiklmnopqrstuvwxyz".as_ref());
        {
            let mut buf = [0u8; 5];
            bytes.read_exact(&mut buf[..]).unwrap();
            assert_eq!(&buf, b"abcde");
            assert_eq!(bytes.as_slice(), b"fghiklmnopqrstuvwxyz")
        }
        {
            let mut buf = [0u8; 2];
            bytes.read_exact(&mut buf[..]).unwrap();
            assert_eq!(&buf, b"fg");
            assert_eq!(bytes.as_slice(), b"hiklmnopqrstuvwxyz")
        }
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_right_at_the_end() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
        let mut buf = [0u8; 5];
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
        assert_eq!(&buf, b"abcde");
        assert_eq!(bytes.as_slice(), b"");
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
        assert_eq!(&buf, b"abcde");
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_incomplete() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
        let mut buf = [0u8; 7];
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 5);
        assert_eq!(&buf[..5], b"abcde");
        assert_eq!(bytes.read(&mut buf[..]).unwrap(), 0);
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_to_end() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"abcde".as_ref());
        let mut buf = Vec::new();
        bytes.read_to_end(&mut buf)?;
        assert_eq!(buf.as_slice(), b"abcde".as_ref());
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_u8() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"\xFF".as_ref());
        assert_eq!(bytes.read_u8(), 255);
        assert_eq!(bytes.len(), 0);
        Ok(())
    }

    #[test]
    fn test_owned_bytes_read_u64() -> io::Result<()> {
        let mut bytes = OwnedBytes::new(b"\0\xFF\xFF\xFF\xFF\xFF\xFF\xFF".as_ref());
        assert_eq!(bytes.read_u64(), u64::MAX - 255);
        assert_eq!(bytes.len(), 0);
        Ok(())
    }

    #[test]
    fn test_owned_bytes_split() {
        let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
        let (left, right) = bytes.split(3);
        assert_eq!(left.as_slice(), b"abc");
        assert_eq!(right.as_slice(), b"defghi");
    }

    #[test]
    fn test_owned_bytes_split_boundary() {
        let bytes = OwnedBytes::new(b"abcdefghi".as_ref());
        {
            let (left, right) = bytes.clone().split(0);
            assert_eq!(left.as_slice(), b"");
            assert_eq!(right.as_slice(), b"abcdefghi");
        }
        {
            let (left, right) = bytes.split(9);
            assert_eq!(left.as_slice(), b"abcdefghi");
            assert_eq!(right.as_slice(), b"");
        }
    }
}
@@ -1,9 +1,10 @@
use crate::core::META_FILEPATH;
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
use crate::directory::AntiCallToken;
use crate::directory::WatchCallbackList;
use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle};
use crate::directory::{TerminatingWrite, WritePtr};
use crate::{common::HasLen, core::META_FILEPATH};
use common::HasLen;
use fail::fail_point;
use std::collections::HashMap;
use std::fmt;
@@ -17,13 +18,6 @@ use super::FileHandle;
/// Writer associated with the `RamDirectory`.
///
/// The writer just writes a buffer.
///
/// # Panics
///
/// On drop, if the writer was left in a *dirty* state.
/// That is, if flush was not called after the last call
/// to write.
///
struct VecWriter {
    path: PathBuf,
    shared_directory: RamDirectory,
@@ -45,8 +39,8 @@ impl VecWriter {
impl Drop for VecWriter {
    fn drop(&mut self) {
        if !self.is_flushed {
            panic!(
                "You forgot to flush {:?} before its writer got dropped. Do not rely on drop.",
            warn!(
                "You forgot to flush {:?} before its writer got dropped. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
                self.path
            )
        }
@@ -220,14 +214,8 @@ impl Directory for RamDirectory {
    }

    fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> {
        fail_point!("RamDirectory::atomic_write", |msg| Err(io::Error::new(
            io::ErrorKind::Other,
            msg.unwrap_or_else(|| "Undefined".to_string())
        )));
        let path_buf = PathBuf::from(path);

        self.fs.write().unwrap().write(path_buf, data);

        if path == *META_FILEPATH {
            let _ = self.fs.write().unwrap().watch_router.broadcast();
        }
@@ -237,6 +225,10 @@ impl Directory for RamDirectory {
    fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
        Ok(self.fs.write().unwrap().watch(watch_callback))
    }

    fn sync_directory(&self) -> io::Result<()> {
        Ok(())
    }
}
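The `fail_point!` above is only compiled in when the `fail` crate's failpoints feature is enabled; a sketch of how a test might trigger it, assuming the standard `fail` crate API (`FailScenario::setup` and `fail::cfg` with a `return(...)` action, which hands the payload to the closure as `msg`):

#[test]
fn test_atomic_write_fail_point() {
    // Registers failpoints for the duration of the scenario.
    let scenario = fail::FailScenario::setup();
    fail::cfg("RamDirectory::atomic_write", "return(disk_full)").unwrap();
    let directory = RamDirectory::create();
    // The fail point short-circuits atomic_write into an io::Error.
    let err = directory
        .atomic_write(Path::new("meta.json"), b"{}")
        .unwrap_err();
    assert_eq!(err.kind(), io::ErrorKind::Other);
    scenario.teardown();
}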
#[cfg(test)]

@@ -118,15 +118,6 @@ mod ram_directory_tests {
    }
}

#[test]
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
    let test_path: &'static Path = Path::new("some_path_for_test");
    let ram_directory = RamDirectory::create();
    let mut write_file = ram_directory.open_write(test_path).unwrap();
    assert!(write_file.write_all(&[4]).is_ok());
}

fn test_simple(directory: &dyn Directory) -> crate::Result<()> {
    let test_path: &'static Path = Path::new("some_path_for_test");
    let mut write_file = directory.open_write(test_path)?;
@@ -166,26 +157,26 @@ fn test_write_create_the_file(directory: &dyn Directory) {
fn test_directory_delete(directory: &dyn Directory) -> crate::Result<()> {
    let test_path: &'static Path = Path::new("some_path_for_test");
    assert!(directory.open_read(test_path).is_err());
    let mut write_file = directory.open_write(&test_path)?;
    let mut write_file = directory.open_write(test_path)?;
    write_file.write_all(&[1, 2, 3, 4])?;
    write_file.flush()?;
    {
        let read_handle = directory.open_read(&test_path)?.read_bytes()?;
        let read_handle = directory.open_read(test_path)?.read_bytes()?;
        assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]);
        // Mapped files can't be deleted on Windows.
        if !cfg!(windows) {
            assert!(directory.delete(&test_path).is_ok());
            assert!(directory.delete(test_path).is_ok());
            assert_eq!(read_handle.as_slice(), &[1u8, 2u8, 3u8, 4u8]);
        }
        assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
    }

    if cfg!(windows) {
        assert!(directory.delete(&test_path).is_ok());
        assert!(directory.delete(test_path).is_ok());
    }

    assert!(directory.open_read(&test_path).is_err());
    assert!(directory.delete(&test_path).is_err());
    assert!(directory.open_read(test_path).is_err());
    assert!(directory.delete(test_path).is_err());
    Ok(())
}

@@ -1,4 +1,4 @@
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::DocId;
use std::borrow::Borrow;
use std::borrow::BorrowMut;
@@ -85,11 +85,11 @@ pub trait DocSet: Send {

    /// Returns the number of documents matching.
    /// Calling this method consumes the `DocSet`.
    fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
    fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
        let mut count = 0u32;
        let mut doc = self.doc();
        while doc != TERMINATED {
            if !delete_bitset.is_deleted(doc) {
            if alive_bitset.is_alive(doc) {
                count += 1u32;
            }
            doc = self.advance();
@@ -130,8 +130,8 @@ impl<'a> DocSet for &'a mut dyn DocSet {
        (**self).size_hint()
    }

    fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
        (**self).count(delete_bitset)
    fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
        (**self).count(alive_bitset)
    }

    fn count_including_deleted(&mut self) -> u32 {
@@ -160,9 +160,9 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
        unboxed.size_hint()
    }

    fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
    fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
        let unboxed: &mut TDocSet = self.borrow_mut();
        unboxed.count(delete_bitset)
        unboxed.count(alive_bitset)
    }

    fn count_including_deleted(&mut self) -> u32 {
src/fastfield/alive_bitset.rs (new file, 224 lines)
@@ -0,0 +1,224 @@
use crate::space_usage::ByteCount;
use crate::DocId;
use common::intersect_bitsets;
use common::BitSet;
use common::ReadOnlyBitSet;
use ownedbytes::OwnedBytes;
use std::io;
use std::io::Write;

/// Writes an alive `BitSet`,
///
/// where `alive_bitset` is the set of alive `DocId`s.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io::Result<()> {
    alive_bitset.serialize(writer)?;
    Ok(())
}

/// Set of alive `DocId`s.
#[derive(Clone)]
pub struct AliveBitSet {
    num_alive_docs: usize,
    bitset: ReadOnlyBitSet,
}

/// Intersects two `AliveBitSet`s into a new one.
/// The two bitsets need to have the same max_value.
pub fn intersect_alive_bitsets(left: AliveBitSet, right: AliveBitSet) -> AliveBitSet {
    assert_eq!(left.bitset().max_value(), right.bitset().max_value());
    let bitset = intersect_bitsets(left.bitset(), right.bitset());
    let num_alive_docs = bitset.len();
    AliveBitSet {
        num_alive_docs,
        bitset,
    }
}

impl AliveBitSet {
    #[cfg(test)]
    pub(crate) fn for_test_from_deleted_docs(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
        assert!(deleted_docs.iter().all(|&doc| doc < max_doc));
        let mut bitset = BitSet::with_max_value_and_full(max_doc);
        for &doc in deleted_docs {
            bitset.remove(doc);
        }
        let mut alive_bitset_buffer = Vec::new();
        write_alive_bitset(&bitset, &mut alive_bitset_buffer).unwrap();
        let alive_bitset_bytes = OwnedBytes::new(alive_bitset_buffer);
        Self::open(alive_bitset_bytes)
    }

    pub(crate) fn from_bitset(bitset: &BitSet) -> AliveBitSet {
        let readonly_bitset = ReadOnlyBitSet::from(bitset);
        AliveBitSet::from(readonly_bitset)
    }

    /// Opens an alive bitset given its file content.
    pub fn open(bytes: OwnedBytes) -> AliveBitSet {
        let bitset = ReadOnlyBitSet::open(bytes);
        AliveBitSet::from(bitset)
    }

    /// Returns true iff the document is still "alive". In other words, if it has not been deleted.
    #[inline]
    pub fn is_alive(&self, doc: DocId) -> bool {
        self.bitset.contains(doc)
    }

    /// Returns true iff the document has been marked as deleted.
    #[inline]
    pub fn is_deleted(&self, doc: DocId) -> bool {
        !self.is_alive(doc)
    }

    /// Iterates over the alive doc_ids.
    #[inline]
    pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ {
        self.bitset.iter()
    }

    /// Gets the underlying bitset.
    #[inline]
    pub fn bitset(&self) -> &ReadOnlyBitSet {
        &self.bitset
    }

    /// The number of alive docs.
    pub fn num_alive_docs(&self) -> usize {
        self.num_alive_docs
    }

    /// Summarizes the total space usage of this bitset.
    pub fn space_usage(&self) -> ByteCount {
        self.bitset().num_bytes()
    }
}

impl From<ReadOnlyBitSet> for AliveBitSet {
    fn from(bitset: ReadOnlyBitSet) -> AliveBitSet {
        let num_alive_docs = bitset.len();
        AliveBitSet {
            num_alive_docs,
            bitset,
        }
    }
}
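A sketch of building and querying an `AliveBitSet` with the API above, assuming the `common` BitSet methods used in this file (`with_max_value_and_full`, `remove`). It also shows the intended use of `intersect_alive_bitsets` when combining alive sets from stacked delete operations:

fn alive_bitset_roundtrip() -> std::io::Result<()> {
    let mut bitset = BitSet::with_max_value_and_full(5);
    bitset.remove(3); // mark doc 3 as deleted
    let mut buffer: Vec<u8> = Vec::new();
    write_alive_bitset(&bitset, &mut buffer)?;
    let alive = AliveBitSet::open(OwnedBytes::new(buffer));
    assert!(alive.is_alive(0));
    assert!(alive.is_deleted(3));
    assert_eq!(alive.num_alive_docs(), 4);
    // Intersecting with itself is a no-op, but both operands must share
    // the same max_value, exactly as asserted above.
    let both = intersect_alive_bitsets(alive.clone(), alive);
    assert_eq!(both.num_alive_docs(), 4);
    Ok(())
}

This is also the shape `DocSet::count` relies on: it simply skips any doc for which `is_alive` returns false.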
#[cfg(test)]
mod tests {

    use super::AliveBitSet;

    #[test]
    fn test_alive_bitset_empty() {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[], 10);
        for doc in 0..10 {
            assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
            assert!(!alive_bitset.is_deleted(doc));
        }
        assert_eq!(alive_bitset.num_alive_docs(), 10);
    }

    #[test]
    fn test_alive_bitset() {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[1, 9], 10);
        assert!(alive_bitset.is_alive(0));
        assert!(alive_bitset.is_deleted(1));
        assert!(alive_bitset.is_alive(2));
        assert!(alive_bitset.is_alive(3));
        assert!(alive_bitset.is_alive(4));
        assert!(alive_bitset.is_alive(5));
        assert!(alive_bitset.is_alive(6));
        assert!(alive_bitset.is_alive(7));
        assert!(alive_bitset.is_alive(8));
        assert!(alive_bitset.is_deleted(9));
        for doc in 0..10 {
            assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
        }
        assert_eq!(alive_bitset.num_alive_docs(), 8);
    }

    #[test]
    fn test_alive_bitset_iter_minimal() {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[7], 8);

        let data: Vec<_> = alive_bitset.iter_alive().collect();
        assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
    }

    #[test]
    fn test_alive_bitset_iter_small() {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 2, 3, 6], 7);

        let data: Vec<_> = alive_bitset.iter_alive().collect();
        assert_eq!(data, vec![1, 4, 5]);
    }

    #[test]
    fn test_alive_bitset_iter() {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000], 1001);

        let data: Vec<_> = alive_bitset.iter_alive().collect();
        assert_eq!(data, (2..=999).collect::<Vec<_>>());
    }
}

#[cfg(all(test, feature = "unstable"))]
mod bench {

    use super::AliveBitSet;
    use rand::prelude::IteratorRandom;
    use rand::thread_rng;
    use test::Bencher;

    fn get_alive() -> Vec<u32> {
        let mut data = (0..1_000_000_u32).collect::<Vec<u32>>();
        for _ in 0..1_000_000 / 8 {
            remove_rand(&mut data);
        }
        data
    }

    fn remove_rand(raw: &mut Vec<u32>) {
        let i = (0..raw.len()).choose(&mut thread_rng()).unwrap();
        raw.remove(i);
    }

    #[bench]
    fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);

        bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
    }

    #[bench]
    fn bench_deletebitset_access(bench: &mut Bencher) {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);

        bench.iter(|| {
            (0..1_000_000_u32)
                .filter(|doc| alive_bitset.is_alive(*doc))
                .collect::<Vec<_>>()
        });
    }

    #[bench]
    fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);

        bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
    }

    #[bench]
    fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);

        bench.iter(|| {
            (0..1_000_000_u32)
                .filter(|doc| alive_bitset.is_alive(*doc))
                .collect::<Vec<_>>()
        });
    }
}
@@ -18,11 +18,11 @@ mod tests {
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_for_tests()?;
    index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]));
    index_writer.add_document(doc!(bytes_field=>vec![]));
    index_writer.add_document(doc!(bytes_field=>vec![255u8]));
    index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]));
    index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]));
    index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]))?;
    index_writer.add_document(doc!(bytes_field=>vec![]))?;
    index_writer.add_document(doc!(bytes_field=>vec![255u8]))?;
    index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]))?;
    index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]))?;
    index_writer.commit()?;
    let searcher = index.reader()?.searcher();
    let segment_reader = searcher.segment_reader(0);
@@ -47,7 +47,7 @@ mod tests {
    index_writer.add_document(doc!(
        field => b"tantivy".as_ref(),
        field => b"lucene".as_ref()
    ));
    ))?;
    index_writer.commit()?;
    Ok(index.reader()?.searcher())
}

@@ -1,6 +1,5 @@
use std::io;

use crate::fastfield::serializer::FastFieldSerializer;
use crate::schema::{Document, Field, Value};
use crate::DocId;
use crate::{
@@ -84,11 +83,11 @@ impl BytesFastFieldWriter {
        &'a self,
        doc_id_map: Option<&'b DocIdMapping>,
    ) -> impl Iterator<Item = &'b [u8]> {
        let doc_id_iter = if let Some(doc_id_map) = doc_id_map {
            Box::new(doc_id_map.iter_old_doc_ids().cloned()) as Box<dyn Iterator<Item = u32>>
        let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(doc_id_map) = doc_id_map {
            Box::new(doc_id_map.iter_old_doc_ids())
        } else {
            Box::new(self.doc_index.iter().enumerate().map(|el| el.0 as u32))
                as Box<dyn Iterator<Item = u32>>
            let max_doc = self.doc_index.len() as u32;
            Box::new(0..max_doc)
        };
        doc_id_iter.map(move |doc_id| self.get_values_for_doc_id(doc_id))
    }
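The rewrite above replaces two per-branch `as Box<dyn Iterator<...>>` casts with a single type annotation on the binding. A standalone sketch of the same pattern, with hypothetical names, where the shared trait-object type is stated once (here, on the return type) and both branches coerce to it:

fn doc_ids<'a>(remap: Option<&'a [u32]>, max_doc: u32) -> Box<dyn Iterator<Item = u32> + 'a> {
    if let Some(remap) = remap {
        // Remapped order, e.g. after a merge sorted the doc ids.
        Box::new(remap.iter().copied())
    } else {
        // Natural order: 0..max_doc.
        Box::new(0..max_doc)
    }
}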
@@ -1,139 +0,0 @@
use crate::common::{BitSet, HasLen};
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::WritePtr;
use crate::space_usage::ByteCount;
use crate::DocId;
use std::io;
use std::io::Write;

/// Writes a delete `BitSet`,
///
/// where `delete_bitset` is the set of deleted `DocId`s.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_delete_bitset(
    delete_bitset: &BitSet,
    max_doc: u32,
    writer: &mut WritePtr,
) -> io::Result<()> {
    let mut byte = 0u8;
    let mut shift = 0u8;
    for doc in 0..max_doc {
        if delete_bitset.contains(doc) {
            byte |= 1 << shift;
        }
        if shift == 7 {
            writer.write_all(&[byte])?;
            shift = 0;
            byte = 0;
        } else {
            shift += 1;
        }
    }
    if max_doc % 8 > 0 {
        writer.write_all(&[byte])?;
    }
    Ok(())
}
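The legacy on-disk format written above packs one bit per document, least-significant bit first within each byte, and pads the final partial byte. A worked example reproducing that layout with a hypothetical helper, kept standalone so the arithmetic is easy to check:

// One bit per doc, LSB-first; ceil(max_doc / 8) bytes in total.
fn pack_deletes(deleted: &[u32], max_doc: u32) -> Vec<u8> {
    let mut bytes = vec![0u8; ((max_doc as usize) + 7) / 8];
    for &doc in deleted {
        bytes[(doc / 8) as usize] |= 1 << (doc % 8);
    }
    bytes
}

#[test]
fn pack_deletes_matches_layout() {
    // Docs 1 and 9 deleted out of max_doc = 10 -> two bytes,
    // each with bit 1 set (doc 1 in byte 0, doc 9 in byte 1).
    assert_eq!(pack_deletes(&[1, 9], 10), vec![0b0000_0010, 0b0000_0010]);
}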
/// Set of deleted `DocId`s.
#[derive(Clone)]
pub struct DeleteBitSet {
    data: OwnedBytes,
    num_deleted: usize,
}

impl DeleteBitSet {
    #[cfg(test)]
    pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet {
        use crate::directory::{Directory, RamDirectory, TerminatingWrite};
        use std::path::Path;
        assert!(docs.iter().all(|&doc| doc < max_doc));
        let mut bitset = BitSet::with_max_value(max_doc);
        for &doc in docs {
            bitset.insert(doc);
        }
        let directory = RamDirectory::create();
        let path = Path::new("dummydeletebitset");
        let mut wrt = directory.open_write(path).unwrap();
        write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
        wrt.terminate().unwrap();
        let file = directory.open_read(path).unwrap();
        Self::open(file).unwrap()
    }

    /// Opens a delete bitset given its file.
    pub fn open(file: FileSlice) -> crate::Result<DeleteBitSet> {
        let bytes = file.read_bytes()?;
        let num_deleted: usize = bytes
            .as_slice()
            .iter()
            .map(|b| b.count_ones() as usize)
            .sum();
        Ok(DeleteBitSet {
            data: bytes,
            num_deleted,
        })
    }

    /// Returns true iff the document is still "alive". In other words, if it has not been deleted.
    pub fn is_alive(&self, doc: DocId) -> bool {
        !self.is_deleted(doc)
    }

    /// Returns true iff the document has been marked as deleted.
    #[inline]
    pub fn is_deleted(&self, doc: DocId) -> bool {
        let byte_offset = doc / 8u32;
        let b: u8 = self.data.as_slice()[byte_offset as usize];
        let shift = (doc & 7u32) as u8;
        b & (1u8 << shift) != 0
    }

    /// Summarizes the total space usage of this bitset.
    pub fn space_usage(&self) -> ByteCount {
        self.data.len()
    }
}

impl HasLen for DeleteBitSet {
    fn len(&self) -> usize {
        self.num_deleted
    }
}

#[cfg(test)]
mod tests {
    use super::DeleteBitSet;
    use crate::common::HasLen;

    #[test]
    fn test_delete_bitset_empty() {
        let delete_bitset = DeleteBitSet::for_test(&[], 10);
        for doc in 0..10 {
            assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
        }
        assert_eq!(delete_bitset.len(), 0);
    }

    #[test]
    fn test_delete_bitset() {
        let delete_bitset = DeleteBitSet::for_test(&[1, 9], 10);
        assert!(delete_bitset.is_alive(0));
        assert!(delete_bitset.is_deleted(1));
        assert!(delete_bitset.is_alive(2));
        assert!(delete_bitset.is_alive(3));
        assert!(delete_bitset.is_alive(4));
        assert!(delete_bitset.is_alive(5));
        assert!(delete_bitset.is_alive(6));
        assert!(delete_bitset.is_alive(7));
        assert!(delete_bitset.is_alive(8));
        assert!(delete_bitset.is_deleted(9));
        for doc in 0..10 {
            assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
        }
        assert_eq!(delete_bitset.len(), 2);
    }
}
@@ -84,18 +84,18 @@ impl FacetReader {
mod tests {
    use crate::Index;
    use crate::{
        schema::{Facet, FacetOptions, SchemaBuilder, Value, INDEXED, STORED},
        schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED},
        DocAddress, Document,
    };

    #[test]
    fn test_facet_only_indexed() -> crate::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let facet_field = schema_builder.add_facet_field("facet", INDEXED);
        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher
@@ -106,42 +106,19 @@ mod tests {
        facet_reader.facet_ords(0u32, &mut facet_ords);
        assert_eq!(&facet_ords, &[2u64]);
        let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
        let value = doc.get_first(facet_field).and_then(Value::path);
        let value = doc.get_first(facet_field).and_then(Value::facet);
        assert_eq!(value, None);
        Ok(())
    }

    #[test]
    fn test_facet_only_stored() -> crate::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let facet_field = schema_builder.add_facet_field("facet", STORED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher
            .segment_reader(0u32)
            .facet_reader(facet_field)
            .unwrap();
        let mut facet_ords = Vec::new();
        facet_reader.facet_ords(0u32, &mut facet_ords);
        assert!(facet_ords.is_empty());
        let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
        let value = doc.get_first(facet_field).and_then(Value::path);
        assert_eq!(value, Some("/a/b".to_string()));
        Ok(())
    }

    #[test]
    fn test_facet_stored_and_indexed() -> crate::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let facet_field = schema_builder.add_facet_field("facet", STORED | INDEXED);
        let facet_field = schema_builder.add_facet_field("facet", STORED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher
@@ -152,43 +129,20 @@ mod tests {
        facet_reader.facet_ords(0u32, &mut facet_ords);
        assert_eq!(&facet_ords, &[2u64]);
        let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
        let value = doc.get_first(facet_field).and_then(Value::path);
        assert_eq!(value, Some("/a/b".to_string()));
        Ok(())
    }

    #[test]
    fn test_facet_neither_stored_and_indexed() -> crate::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher
            .segment_reader(0u32)
            .facet_reader(facet_field)
            .unwrap();
        let mut facet_ords = Vec::new();
        facet_reader.facet_ords(0u32, &mut facet_ords);
        assert!(facet_ords.is_empty());
        let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
        let value = doc.get_first(facet_field).and_then(Value::path);
        assert_eq!(value, None);
        let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::facet);
        assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
        Ok(())
    }

    #[test]
    fn test_facet_not_populated_for_all_docs() -> crate::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let facet_field = schema_builder.add_facet_field("facet", INDEXED);
        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
        index_writer.add_document(Document::default());
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
        index_writer.add_document(Document::default())?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher
@@ -206,12 +160,12 @@ mod tests {
    #[test]
    fn test_facet_not_populated_for_any_docs() -> crate::Result<()> {
        let mut schema_builder = SchemaBuilder::default();
        let facet_field = schema_builder.add_facet_field("facet", INDEXED);
        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(Document::default());
        index_writer.add_document(Document::default());
        index_writer.add_document(Document::default())?;
        index_writer.add_document(Document::default())?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher

@@ -23,30 +23,32 @@ values stored.
Read access performance is comparable to that of an array lookup.
*/

pub use self::alive_bitset::intersect_alive_bitsets;
pub use self::alive_bitset::write_alive_bitset;
pub use self::alive_bitset::AliveBitSet;
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
pub use self::reader::BitpackedFastFieldReader;
pub(crate) use self::reader::BitpackedFastFieldReader;
pub use self::reader::DynamicFastFieldReader;
pub use self::reader::FastFieldReader;
pub use self::readers::FastFieldReaders;
pub use self::serializer::CompositeFastFieldSerializer;
pub use self::serializer::FastFieldSerializer;
pub use self::serializer::FastFieldDataAccess;
pub use self::serializer::FastFieldStats;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::Cardinality;
use crate::schema::FieldType;
use crate::schema::Value;
use crate::DocId;
use crate::{
    chrono::{NaiveDateTime, Utc},
    schema::Type,
};
use crate::{common, DocId};

mod alive_bitset;
mod bytes;
mod delete;
mod error;
mod facet_reader;
mod multivalued;
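The module docs above compare fast field reads to an array lookup. A minimal sketch of what that looks like in practice, assuming the `DynamicFastFieldReader` API exercised by the tests below (`open` on a composite-file slice, then random-access `get` by doc id):

// Sum the first n values of a u64 fast field; each get() is O(1).
fn sum_first_n(reader: &DynamicFastFieldReader<u64>, n: u32) -> u64 {
    (0..n).map(|doc| reader.get(doc)).sum()
}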
@@ -108,7 +110,7 @@ impl FastValue for u64 {
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
    match *field_type {
        FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
        FieldType::HierarchicalFacet(_) => Some(Cardinality::MultiValues),
        FieldType::Facet(_) => Some(Cardinality::MultiValues),
        _ => None,
    }
}
@@ -212,9 +214,8 @@ fn value_to_u64(value: &Value) -> u64 {
mod tests {

    use super::*;
    use crate::common::CompositeFile;
    use crate::directory::CompositeFile;
    use crate::directory::{Directory, RamDirectory, WritePtr};
    use crate::fastfield::BitpackedFastFieldReader;
    use crate::merge_policy::NoMergePolicy;
    use crate::schema::Field;
    use crate::schema::Schema;
@@ -239,7 +240,7 @@ mod tests {

    #[test]
    pub fn test_fastfield() {
        let test_fastfield = BitpackedFastFieldReader::<u64>::from(vec![100, 200, 300]);
        let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
        assert_eq!(test_fastfield.get(0), 100);
        assert_eq!(test_fastfield.get(1), 200);
        assert_eq!(test_fastfield.get(2), 300);
@@ -267,11 +268,11 @@ mod tests {
            .unwrap();
        serializer.close().unwrap();
    }
    let file = directory.open_read(&path).unwrap();
    assert_eq!(file.len(), 36 as usize);
    let file = directory.open_read(path).unwrap();
    assert_eq!(file.len(), 37);
    let composite_file = CompositeFile::open(&file)?;
    let file = composite_file.open_read(*FIELD).unwrap();
    let fast_field_reader = BitpackedFastFieldReader::<u64>::open(file)?;
    let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
    assert_eq!(fast_field_reader.get(0), 13u64);
    assert_eq!(fast_field_reader.get(1), 14u64);
    assert_eq!(fast_field_reader.get(2), 2u64);
@@ -298,12 +299,12 @@ mod tests {
        fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
        serializer.close()?;
    }
    let file = directory.open_read(&path)?;
    assert_eq!(file.len(), 61 as usize);
    let file = directory.open_read(path)?;
    assert_eq!(file.len(), 62);
    {
        let fast_fields_composite = CompositeFile::open(&file)?;
        let data = fast_fields_composite.open_read(*FIELD).unwrap();
        let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
        let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
        assert_eq!(fast_field_reader.get(0), 4u64);
        assert_eq!(fast_field_reader.get(1), 14_082_001u64);
        assert_eq!(fast_field_reader.get(2), 3_052u64);
@@ -334,12 +335,12 @@ mod tests {
        .unwrap();
        serializer.close().unwrap();
    }
    let file = directory.open_read(&path).unwrap();
    assert_eq!(file.len(), 34 as usize);
    let file = directory.open_read(path).unwrap();
    assert_eq!(file.len(), 35);
    {
        let fast_fields_composite = CompositeFile::open(&file).unwrap();
        let data = fast_fields_composite.open_read(*FIELD).unwrap();
        let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
        let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
        for doc in 0..10_000 {
            assert_eq!(fast_field_reader.get(doc), 100_000u64);
        }
@@ -366,12 +367,12 @@ mod tests {
        .unwrap();
        serializer.close().unwrap();
    }
    let file = directory.open_read(&path).unwrap();
    assert_eq!(file.len(), 80042 as usize);
    let file = directory.open_read(path).unwrap();
    assert_eq!(file.len(), 80043);
    {
        let fast_fields_composite = CompositeFile::open(&file)?;
        let data = fast_fields_composite.open_read(*FIELD).unwrap();
        let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
        let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
        assert_eq!(fast_field_reader.get(0), 0u64);
        for doc in 1..10_001 {
            assert_eq!(
@@ -405,12 +406,13 @@ mod tests {
        .unwrap();
        serializer.close().unwrap();
    }
    let file = directory.open_read(&path).unwrap();
    assert_eq!(file.len(), 17709 as usize);
    let file = directory.open_read(path).unwrap();
    //assert_eq!(file.len(), 17710 as usize); //bitpacked size
    assert_eq!(file.len(), 10175_usize); // linear interpol size
    {
        let fast_fields_composite = CompositeFile::open(&file)?;
        let data = fast_fields_composite.open_read(i64_field).unwrap();
        let fast_field_reader = BitpackedFastFieldReader::<i64>::open(data)?;
        let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;

        assert_eq!(fast_field_reader.min_value(), -100i64);
        assert_eq!(fast_field_reader.max_value(), 9_999i64);
@@ -446,11 +448,11 @@ mod tests {
        serializer.close().unwrap();
    }

    let file = directory.open_read(&path).unwrap();
    let file = directory.open_read(path).unwrap();
    {
        let fast_fields_composite = CompositeFile::open(&file).unwrap();
        let data = fast_fields_composite.open_read(i64_field).unwrap();
        let fast_field_reader = BitpackedFastFieldReader::<i64>::open(data)?;
        let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
        assert_eq!(fast_field_reader.get(0u32), 0i64);
    }
    Ok(())
@@ -479,11 +481,11 @@ mod tests {
        fast_field_writers.serialize(&mut serializer, &HashMap::new(), None)?;
        serializer.close()?;
    }
    let file = directory.open_read(&path)?;
    let file = directory.open_read(path)?;
    {
        let fast_fields_composite = CompositeFile::open(&file)?;
        let data = fast_fields_composite.open_read(*FIELD).unwrap();
        let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
        let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;

        let mut a = 0u64;
        for _ in 0..n {
@@ -495,18 +497,18 @@ mod tests {
    }

    #[test]
    fn test_merge_missing_date_fast_field() {
    fn test_merge_missing_date_fast_field() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let date_field = schema_builder.add_date_field("date", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.set_merge_policy(Box::new(NoMergePolicy));
        index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
        index_writer.commit().unwrap();
        index_writer.add_document(doc!());
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()))?;
        index_writer.commit()?;
        index_writer.add_document(doc!())?;
        index_writer.commit()?;
        let reader = index.reader()?;
        let segment_ids: Vec<SegmentId> = reader
            .searcher()
            .segment_readers()
@@ -515,10 +517,10 @@ mod tests {
            .collect();
        assert_eq!(segment_ids.len(), 2);
        let merge_future = index_writer.merge(&segment_ids[..]);
        let merge_res = futures::executor::block_on(merge_future);
        assert!(merge_res.is_ok());
        assert!(reader.reload().is_ok());
        futures::executor::block_on(merge_future)?;
        reader.reload()?;
        assert_eq!(reader.searcher().segment_readers().len(), 1);
        Ok(())
    }

    #[test]
@@ -527,7 +529,7 @@ mod tests {
    }

    #[test]
    fn test_datefastfield() {
    fn test_datefastfield() -> crate::Result<()> {
        use crate::fastfield::FastValue;
        let mut schema_builder = Schema::builder();
        let date_field = schema_builder.add_date_field("date", FAST);
@@ -537,22 +539,22 @@ mod tests {
        );
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer = index.writer_for_tests()?;
        index_writer.set_merge_policy(Box::new(NoMergePolicy));
        index_writer.add_document(doc!(
            date_field => crate::DateTime::from_u64(1i64.to_u64()),
            multi_date_field => crate::DateTime::from_u64(2i64.to_u64()),
            multi_date_field => crate::DateTime::from_u64(3i64.to_u64())
        ));
        ))?;
        index_writer.add_document(doc!(
            date_field => crate::DateTime::from_u64(4i64.to_u64())
        ));
        ))?;
        index_writer.add_document(doc!(
            multi_date_field => crate::DateTime::from_u64(5i64.to_u64()),
            multi_date_field => crate::DateTime::from_u64(6i64.to_u64())
        ));
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        ))?;
        index_writer.commit()?;
        let reader = index.reader()?;
        let searcher = reader.searcher();
        assert_eq!(searcher.segment_readers().len(), 1);
        let segment_reader = searcher.segment_reader(0);
@@ -579,6 +581,7 @@ mod tests {
            assert_eq!(dates[0].timestamp(), 5i64);
            assert_eq!(dates[1].timestamp(), 6i64);
        }
        Ok(())
    }
}

@@ -587,7 +590,7 @@ mod bench {
    use super::tests::FIELD;
    use super::tests::{generate_permutation, SCHEMA};
    use super::*;
    use crate::common::CompositeFile;
    use crate::directory::CompositeFile;
    use crate::directory::{Directory, RamDirectory, WritePtr};
    use crate::fastfield::FastFieldReader;
    use std::collections::HashMap;
@@ -627,7 +630,7 @@ mod bench {
    let directory: RamDirectory = RamDirectory::create();
    {
        let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
        let mut serializer = FastFieldSerializer::from_write(write).unwrap();
        let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
        let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
        for &x in &permutation {
            fast_field_writers.add_document(&doc!(*FIELD=>x));
@@ -641,7 +644,7 @@ mod bench {
    {
        let fast_fields_composite = CompositeFile::open(&file).unwrap();
        let data = fast_fields_composite.open_read(*FIELD).unwrap();
        let fast_field_reader = FastFieldReader::<u64>::open(data).unwrap();
        let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();

        b.iter(|| {
            let n = test::black_box(7000u32);
@@ -661,7 +664,7 @@ mod bench {
    let directory: RamDirectory = RamDirectory::create();
    {
        let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
        let mut serializer = FastFieldSerializer::from_write(write).unwrap();
        let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
        let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
        for &x in &permutation {
            fast_field_writers.add_document(&doc!(*FIELD=>x));
@@ -675,7 +678,7 @@ mod bench {
    {
        let fast_fields_composite = CompositeFile::open(&file).unwrap();
        let data = fast_fields_composite.open_read(*FIELD).unwrap();
        let fast_field_reader = FastFieldReader::<u64>::open(data).unwrap();
        let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();

        b.iter(|| {
            let n = test::black_box(1000u32);
@@ -8,17 +8,25 @@ pub use self::writer::MultiValuedFastFieldWriter;
mod tests {

    use crate::collector::TopDocs;
    use crate::indexer::NoMergePolicy;
    use crate::query::QueryParser;
    use crate::schema::Cardinality;
    use crate::schema::Facet;
    use crate::schema::FacetOptions;
    use crate::schema::IntOptions;
    use crate::schema::Schema;
    use crate::schema::INDEXED;
    use crate::Document;
    use crate::Index;
    use crate::Term;
    use chrono::Duration;
    use futures::executor::block_on;
    use proptest::prop_oneof;
    use proptest::proptest;
    use proptest::strategy::Strategy;
    use test_log::test;

    #[test]
    fn test_multivalued_u64() {
    fn test_multivalued_u64() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_u64_field(
            "multifield",
@@ -26,17 +34,17 @@ mod tests {
        );
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(field=>1u64, field=>3u64));
        index_writer.add_document(doc!());
        index_writer.add_document(doc!(field=>4u64));
        index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
        assert!(index_writer.commit().is_ok());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(field=>1u64, field=>3u64))?;
        index_writer.add_document(doc!())?;
        index_writer.add_document(doc!(field=>4u64))?;
        index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64))?;
        index_writer.commit()?;

        let searcher = index.reader().unwrap().searcher();
        let searcher = index.reader()?.searcher();
        let segment_reader = searcher.segment_reader(0);
        let mut vals = Vec::new();
        let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
        let multi_value_reader = segment_reader.fast_fields().u64s(field)?;
        {
            multi_value_reader.get_vals(2, &mut vals);
            assert_eq!(&vals, &[4u64]);
@@ -49,56 +57,55 @@ mod tests {
            multi_value_reader.get_vals(1, &mut vals);
            assert!(vals.is_empty());
        }
        Ok(())
    }

    #[test]
    fn test_multivalued_date() {
    fn test_multivalued_date() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let date_field = schema_builder.add_date_field(
            "multi_date_field",
            IntOptions::default()
                .set_fast(Cardinality::MultiValues)
                .set_indexed()
                .set_fieldnorm()
                .set_stored(),
        );
        let time_i =
            schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer = index.writer_for_tests()?;
        let first_time_stamp = chrono::Utc::now();
        index_writer.add_document(
            doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
        );
        index_writer.add_document(doc!(time_i=>0i64));
        )?;
        index_writer.add_document(doc!(time_i=>0i64))?;
        // add one second
        index_writer
            .add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
        index_writer.add_document(
            doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64),
        )?;
        // add another second
        let two_secs_ahead = first_time_stamp + Duration::seconds(2);
        index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
        index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64))?;
        // add three seconds
        index_writer
            .add_document(doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64));
        assert!(index_writer.commit().is_ok());
        index_writer.add_document(
            doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64),
        )?;
        index_writer.commit()?;

        let reader = index.reader().unwrap();
        let reader = index.reader()?;
        let searcher = reader.searcher();
        let reader = searcher.segment_reader(0);
        assert_eq!(reader.num_docs(), 5);

        {
            let parser = QueryParser::for_index(&index, vec![date_field]);
            let query = parser
                .parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()).to_string())
                .expect("could not parse query");
            let results = searcher
                .search(&query, &TopDocs::with_limit(5))
                .expect("could not query index");

            let query = parser.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))?;
            let results = searcher.search(&query, &TopDocs::with_limit(5))?;
            assert_eq!(results.len(), 1);
            for (_score, doc_address) in results {
                let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
                let retrieved_doc = searcher.doc(doc_address)?;
                assert_eq!(
                    retrieved_doc
                        .get_first(date_field)
@@ -120,12 +127,8 @@ mod tests {

        {
            let parser = QueryParser::for_index(&index, vec![date_field]);
            let query = parser
                .parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()).to_string())
                .expect("could not parse query");
            let results = searcher
                .search(&query, &TopDocs::with_limit(5))
                .expect("could not query index");
            let query = parser.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))?;
            let results = searcher.search(&query, &TopDocs::with_limit(5))?;

            assert_eq!(results.len(), 1);

@@ -157,10 +160,8 @@ mod tests {
                (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
                (first_time_stamp + Duration::seconds(3)).to_rfc3339()
            );
            let query = parser.parse_query(&range_q).expect("could not parse query");
            let results = searcher
                .search(&query, &TopDocs::with_limit(5))
                .expect("could not query index");
            let query = parser.parse_query(&range_q)?;
            let results = searcher.search(&query, &TopDocs::with_limit(5))?;

            assert_eq!(results.len(), 2);
            for (i, doc_pair) in results.iter().enumerate() {
@@ -188,16 +189,16 @@ mod tests {
                    retrieved_doc
                        .get_first(time_i)
                        .expect("cannot find value")
                        .i64_value()
                        .expect("value not of i64 type"),
                    time_i_val
                        .i64_value(),
                    Some(time_i_val)
                );
            }
        }
        Ok(())
    }

    #[test]
    fn test_multivalued_i64() {
    fn test_multivalued_i64() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_i64_field(
            "multifield",
@@ -205,14 +206,14 @@ mod tests {
        );
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(field=> 1i64, field => 3i64));
        index_writer.add_document(doc!());
        index_writer.add_document(doc!(field=> -4i64));
        index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
        assert!(index_writer.commit().is_ok());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(field=> 1i64, field => 3i64))?;
        index_writer.add_document(doc!())?;
        index_writer.add_document(doc!(field=> -4i64))?;
        index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64))?;
        index_writer.commit()?;

        let searcher = index.reader().unwrap().searcher();
        let searcher = index.reader()?.searcher();
        let segment_reader = searcher.segment_reader(0);
        let mut vals = Vec::new();
        let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
@@ -224,18 +225,125 @@ mod tests {
        assert!(vals.is_empty());
        multi_value_reader.get_vals(3, &mut vals);
        assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
        Ok(())
    }
    #[test]
    #[ignore]
    fn test_many_facets() {

    fn test_multivalued_no_panic(ops: &[IndexingOp]) -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_facet_field("facetfield", INDEXED);
        let field = schema_builder.add_u64_field(
            "multifield",
            IntOptions::default()
                .set_fast(Cardinality::MultiValues)
                .set_indexed(),
        );
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        for i in 0..100_000 {
            index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
        let mut index_writer = index.writer_for_tests()?;
        index_writer.set_merge_policy(Box::new(NoMergePolicy));

        for &op in ops {
            match op {
                IndexingOp::AddDoc { id } => {
                    match id % 3 {
                        0 => {
                            index_writer.add_document(doc!())?;
                        }
                        1 => {
                            let mut doc = Document::new();
                            for _ in 0..5001 {
                                doc.add_u64(field, id as u64);
                            }
                            index_writer.add_document(doc)?;
                        }
                        _ => {
                            let mut doc = Document::new();
                            doc.add_u64(field, id as u64);
                            index_writer.add_document(doc)?;
                        }
                    };
                }
                IndexingOp::DeleteDoc { id } => {
                    index_writer.delete_term(Term::from_field_u64(field, id as u64));
                }
                IndexingOp::Commit => {
                    index_writer.commit().unwrap();
                }
                IndexingOp::Merge => {
                    let segment_ids = index.searchable_segment_ids()?;
                    if segment_ids.len() >= 2 {
                        block_on(index_writer.merge(&segment_ids))?;
                        index_writer.segment_updater().wait_merging_thread()?;
                    }
                }
            }
        }
        assert!(index_writer.commit().is_ok());

        index_writer.commit()?;

        // Merging the segments
        {
            let segment_ids = index
                .searchable_segment_ids()
                .expect("Searchable segments failed.");
            if !segment_ids.is_empty() {
                block_on(index_writer.merge(&segment_ids)).unwrap();
                assert!(index_writer.wait_merging_threads().is_ok());
            }
        }
        Ok(())
    }

    #[derive(Debug, Clone, Copy)]
    enum IndexingOp {
        AddDoc { id: u32 },
        DeleteDoc { id: u32 },
        Commit,
        Merge,
    }

    fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
        prop_oneof![
            (0u32..10u32).prop_map(|id| IndexingOp::DeleteDoc { id }),
            (0u32..10u32).prop_map(|id| IndexingOp::AddDoc { id }),
            (0u32..2u32).prop_map(|_| IndexingOp::Commit),
            (0u32..1u32).prop_map(|_| IndexingOp::Merge),
        ]
    }

    proptest! {
        #[test]
        fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
            assert!(test_multivalued_no_panic(&ops[..]).is_ok());
        }
    }

    #[test]
    fn test_multivalued_proptest_off_by_one_bug_1151() {
        use IndexingOp::*;
        let ops = [
            AddDoc { id: 3 },
            AddDoc { id: 1 },
            AddDoc { id: 3 },
            Commit,
            Merge,
        ];

        assert!(test_multivalued_no_panic(&ops[..]).is_ok());
    }

    #[test]
    #[ignore]
    fn test_many_facets() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_facet_field("facetfield", FacetOptions::default());
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
for i in 0..100_000 {
|
||||
index_writer
|
||||
.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())))?;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
}
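
A hedged sketch of the query strings these date tests rely on: an exact match is the RFC 3339 timestamp quoted as a phrase, and a range uses tantivy's `field:[start TO end]` syntax. The field name and timestamps below are illustrative, not taken from the test fixture.

fn main() {
    let date_field_name = "date"; // hypothetical field name
    let timestamp = "1985-04-12T23:20:50+00:00"; // illustrative RFC 3339 value

    // Exact match: the timestamp is quoted so it parses as a phrase query.
    let exact_q = format!("\"{}\"", timestamp);
    assert_eq!(exact_q, "\"1985-04-12T23:20:50+00:00\"");

    // Inclusive range over the date field.
    let range_q = format!(
        "{}:[{} TO {}]",
        date_field_name, "1985-04-12T23:20:51+00:00", "1985-04-12T23:20:53+00:00"
    );
    assert!(range_q.starts_with("date:["));
}
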
@@ -1,6 +1,6 @@
use std::ops::Range;

use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
use crate::DocId;

/// Reader for a multivalued `u64` fast field.
@@ -13,14 +13,14 @@ use crate::DocId;
///
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: BitpackedFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: DynamicFastFieldReader<Item>,
}

impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: BitpackedFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: DynamicFastFieldReader<Item>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,
@@ -30,6 +30,7 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {

/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
@@ -37,20 +38,41 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
}

/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.range(doc);
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range_u64(range.start, &mut vals[..]);
self.vals_reader.get_range(range.start, &mut vals[..]);
}

/// Returns the minimum value for this fast field.
///
/// The min value does not take into account possible
/// deleted documents, and should be considered a lower bound
/// of the actual minimum value.
pub fn min_value(&self) -> Item {
self.vals_reader.min_value()
}

/// Returns the maximum value for this fast field.
///
/// The max value does not take into account possible
/// deleted documents, and should be considered an upper bound
/// of the actual maximum value.
pub fn max_value(&self) -> Item {
self.vals_reader.max_value()
}

/// Returns the number of values associated with the document `DocId`.
#[inline]
pub fn num_vals(&self, doc: DocId) -> usize {
let range = self.range(doc);
(range.end - range.start) as usize
}

/// Returns the overall number of values in this field.
#[inline]
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
}
@@ -69,27 +91,25 @@ impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
mod tests {

use crate::core::Index;
use crate::schema::{Facet, Schema, INDEXED};
use crate::schema::{Cardinality, Facet, FacetOptions, IntOptions, Schema};

#[test]
fn test_multifastfield_reader() {
fn test_multifastfield_reader() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets", INDEXED);
let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index
.writer_for_tests()
.expect("Failed to create index writer.");
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from("/category/cat2"),
facet_field => Facet::from("/category/cat1"),
));
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
index_writer.commit().expect("Commit failed");
let searcher = index.reader().unwrap().searcher();
))?;
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")))?;
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
let mut facet_reader = segment_reader.facet_reader(facet_field)?;

let mut facet = Facet::root();
{
@@ -123,5 +143,35 @@ mod tests {
facet_reader.facet_ords(2, &mut vals);
assert_eq!(&vals[..], &[4]);
}
Ok(())
}

#[test]
fn test_multifastfield_reader_min_max() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field_options = IntOptions::default()
.set_indexed()
.set_fast(Cardinality::MultiValues);
let item_field = schema_builder.add_i64_field("items", field_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index
.writer_for_tests()
.expect("Failed to create index writer.");
index_writer.add_document(doc!(
item_field => 2i64,
item_field => 3i64,
item_field => -2i64,
))?;
index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64))?;
index_writer.add_document(doc!(item_field => 4i64))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment_reader = searcher.segment_reader(0);
let field_reader = segment_reader.fast_fields().i64s(item_field)?;

assert_eq!(field_reader.min_value(), -2);
assert_eq!(field_reader.max_value(), 6);
Ok(())
}
}
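
The reader above resolves a document's values through two single-value fast fields: `idx[doc]..idx[doc + 1]` addresses the slice of `vals` that belongs to `doc`. A minimal sketch with plain Vecs standing in for the codec readers (the type and names are illustrative):

struct MultiValuedSketch {
    idx: Vec<u64>,  // one entry per doc, plus a trailing sentinel
    vals: Vec<i64>, // all values, concatenated in doc order
}

impl MultiValuedSketch {
    fn get_vals(&self, doc: usize) -> &[i64] {
        let start = self.idx[doc] as usize;
        let stop = self.idx[doc + 1] as usize;
        &self.vals[start..stop]
    }
}

fn main() {
    // doc 0 -> [1, 3], doc 1 -> [], doc 2 -> [-4]
    let ff = MultiValuedSketch {
        idx: vec![0, 2, 2, 3],
        vals: vec![1, 3, -4],
    };
    assert_eq!(ff.get_vals(0), &[1, 3]);
    assert!(ff.get_vals(1).is_empty());
    assert_eq!(ff.get_vals(2), &[-4]);
    // The total number of values is the last idx entry, as in total_num_vals().
    assert_eq!(*ff.idx.last().unwrap(), 3);
}
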
@@ -1,5 +1,4 @@
use crate::fastfield::serializer::DynamicFastFieldSerializer;
use crate::fastfield::serializer::FastFieldSerializer;
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field};
@@ -103,11 +102,11 @@ impl MultiValuedFastFieldWriter {
&'a self,
doc_id_map: Option<&'b DocIdMapping>,
) -> impl Iterator<Item = &'b [u64]> {
let doc_id_iter = if let Some(doc_id_map) = doc_id_map {
Box::new(doc_id_map.iter_old_doc_ids().cloned()) as Box<dyn Iterator<Item = u32>>
let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(doc_id_map) = doc_id_map {
Box::new(doc_id_map.iter_old_doc_ids())
} else {
Box::new(self.doc_index.iter().enumerate().map(|el| el.0 as u32))
as Box<dyn Iterator<Item = u32>>
let max_doc = self.doc_index.len() as DocId;
Box::new(0..max_doc)
};
doc_id_iter.map(move |doc_id| self.get_values_for_doc_id(doc_id))
}
@@ -155,7 +154,7 @@ impl MultiValuedFastFieldWriter {
}
{
// writing the values themselves.
let mut value_serializer: DynamicFastFieldSerializer<'_, _>;
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
match mapping_opt {
Some(mapping) => {
value_serializer = serializer.new_u64_fast_field_with_idx(
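
The doc-id iterator change above relies on boxing both branches so the `if let` arms can return different concrete iterator types. A self-contained sketch of that pattern, with a slice standing in for the DocIdMapping:

fn doc_id_iter(doc_id_map: Option<&[u32]>, max_doc: u32) -> Box<dyn Iterator<Item = u32> + '_> {
    if let Some(mapping) = doc_id_map {
        Box::new(mapping.iter().copied()) // remapped (e.g. index-sorted) order
    } else {
        Box::new(0..max_doc) // natural doc-id order
    }
}

fn main() {
    assert_eq!(doc_id_iter(None, 3).collect::<Vec<_>>(), vec![0, 1, 2]);
    assert_eq!(
        doc_id_iter(Some(&[2, 0, 1]), 3).collect::<Vec<_>>(),
        vec![2, 0, 1]
    );
}
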
@@ -1,6 +1,5 @@
use super::FastValue;
use crate::common::BinarySerializable;
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::{Directory, RamDirectory, WritePtr};
@@ -8,11 +7,18 @@ use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
use crate::schema::FAST;
use crate::DocId;
use common::BinarySerializable;
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
use fastfield_codecs::FastFieldCodecReader;
use fastfield_codecs::FastFieldCodecSerializer;
use std::collections::HashMap;
use std::marker::PhantomData;
use std::path::Path;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitUnpacker;

/// FastFieldReader is the trait to access fast field data.
pub trait FastFieldReader<Item: FastValue>: Clone {
@@ -38,13 +44,13 @@ pub trait FastFieldReader<Item: FastValue>: Clone {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]);
fn get_range(&self, start: u64, output: &mut [Item]);

/// Returns the minimum value for this fast field.
///
/// The max value does not take into account possible
/// deleted documents, and should be considered an upper bound
/// of the actual maximum value.
/// The min value does not take into account possible
/// deleted documents, and should be considered a lower bound
/// of the actual minimum value.
fn min_value(&self) -> Item;

/// Returns the maximum value for this fast field.
@@ -61,15 +67,48 @@ pub trait FastFieldReader<Item: FastValue>: Clone {
///
pub enum DynamicFastFieldReader<Item: FastValue> {
/// Bitpacked compressed fastfield data.
Bitpacked(BitpackedFastFieldReader<Item>),
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
/// Linear interpolated values + bitpacked
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
/// Blockwise linear interpolated values + bitpacked
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
}

impl<Item: FastValue> DynamicFastFieldReader<Item> {
/// Returns the correct reader wrapped in the `DynamicFastFieldReader` enum for the data.
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
Ok(DynamicFastFieldReader::Bitpacked(
BitpackedFastFieldReader::open(file)?,
))
let mut bytes = file.read_bytes()?;
let id = bytes.read_u8();

let reader = match id {
BitpackedFastFieldSerializer::ID => {
DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::<
Item,
BitpackedReader,
>::open_from_bytes(bytes)?)
}
LinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
Item,
LinearInterpolFastFieldReader,
>::open_from_bytes(bytes)?)
}
MultiLinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
Item,
MultiLinearInterpolFastFieldReader,
>::open_from_bytes(
bytes
)?)
}
_ => {
panic!(
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
id
)
}
};
Ok(reader)
}
}

@@ -77,57 +116,67 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
fn get(&self, doc: DocId) -> Item {
match self {
Self::Bitpacked(reader) => reader.get(doc),
Self::LinearInterpol(reader) => reader.get(doc),
Self::MultiLinearInterpol(reader) => reader.get(doc),
}
}
fn get_range(&self, start: DocId, output: &mut [Item]) {
fn get_range(&self, start: u64, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::LinearInterpol(reader) => reader.get_range(start, output),
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
}
}
fn min_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.min_value(),
Self::LinearInterpol(reader) => reader.min_value(),
Self::MultiLinearInterpol(reader) => reader.min_value(),
}
}
fn max_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.max_value(),
Self::LinearInterpol(reader) => reader.max_value(),
Self::MultiLinearInterpol(reader) => reader.max_value(),
}
}
}

/// Trait for accessing a fastfield.
/// Wrapper for accessing a fastfield.
///
/// Holds the data and the codec to read the data.
///
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedFastFieldReader<Item: FastValue> {
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
reader: CodecReader,
bytes: OwnedBytes,
bit_unpacker: BitUnpacker,
min_value_u64: u64,
max_value_u64: u64,
_phantom: PhantomData<Item>,
}

impl<Item: FastValue> BitpackedFastFieldReader<Item> {
impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item, C> {
/// Opens a fast field given a file.
pub fn open(file: FileSlice) -> crate::Result<Self> {
let mut bytes = file.read_bytes()?;
let min_value = u64::deserialize(&mut bytes)?;
let amplitude = u64::deserialize(&mut bytes)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedFastFieldReader {
let id = u8::deserialize(&mut bytes)?;
assert_eq!(
BitpackedFastFieldSerializer::ID,
id,
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with different id"
);
Self::open_from_bytes(bytes)
}
/// Opens a fast field given the bytes.
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = C::open_from_bytes(bytes.as_slice())?;
Ok(FastFieldReaderCodecWrapper {
reader,
bytes,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
_phantom: PhantomData,
})
}
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes))
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
}

/// Internally `multivalued` also uses SingleValue fast fields.
@@ -149,7 +198,9 @@ impl<Item: FastValue> BitpackedFastFieldReader<Item> {
}
}

impl<Item: FastValue> FastFieldReader<Item> for BitpackedFastFieldReader<Item> {
impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
for FastFieldReaderCodecWrapper<Item, C>
{
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
@@ -175,8 +226,8 @@ impl<Item: FastValue> FastFieldReader<Item> for BitpackedFastFieldReader<Item> {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(u64::from(start), output);
fn get_range(&self, start: u64, output: &mut [Item]) {
self.get_range_u64(start, output);
}

/// Returns the minimum value for this fast field.
@@ -185,7 +236,7 @@ impl<Item: FastValue> FastFieldReader<Item> for BitpackedFastFieldReader<Item> {
/// deleted documents, and should be considered an upper bound
/// of the actual maximum value.
fn min_value(&self) -> Item {
Item::from_u64(self.min_value_u64)
Item::from_u64(self.reader.min_value())
}

/// Returns the maximum value for this fast field.
@@ -194,12 +245,14 @@ impl<Item: FastValue> FastFieldReader<Item> for BitpackedFastFieldReader<Item> {
/// deleted documents, and should be considered an upper bound
/// of the actual maximum value.
fn max_value(&self) -> Item {
Item::from_u64(self.max_value_u64)
Item::from_u64(self.reader.max_value())
}
}

impl<Item: FastValue> From<Vec<Item>> for BitpackedFastFieldReader<Item> {
fn from(vals: Vec<Item>) -> BitpackedFastFieldReader<Item> {
pub(crate) type BitpackedFastFieldReader<Item> = FastFieldReaderCodecWrapper<Item, BitpackedReader>;

impl<Item: FastValue> From<Vec<Item>> for DynamicFastFieldReader<Item> {
fn from(vals: Vec<Item>) -> DynamicFastFieldReader<Item> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
@@ -231,6 +284,6 @@ impl<Item: FastValue> From<Vec<Item>> for BitpackedFastFieldReader<Item> {
let field_file = composite_file
.open_read(field)
.expect("File component not found");
BitpackedFastFieldReader::open(field_file).unwrap()
DynamicFastFieldReader::open(field_file).unwrap()
}
}
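
A sketch of the one-byte codec-id dispatch that `DynamicFastFieldReader::open` performs above. Only the bitpacked id (1) is confirmed by the assert message in this diff; the other ids below are placeholders:

const BITPACKED_ID: u8 = 1; // confirmed by the "(id=1)" assert above
const LINEAR_INTERPOL_ID: u8 = 2; // placeholder id
const MULTI_LINEAR_INTERPOL_ID: u8 = 3; // placeholder id

enum Codec {
    Bitpacked,
    LinearInterpol,
    MultiLinearInterpol,
}

fn open(bytes: &[u8]) -> Result<(Codec, &[u8]), String> {
    // The first byte identifies the codec; the remainder is codec payload.
    let (&id, payload) = bytes.split_first().ok_or("empty fast field data")?;
    let codec = match id {
        BITPACKED_ID => Codec::Bitpacked,
        LINEAR_INTERPOL_ID => Codec::LinearInterpol,
        MULTI_LINEAR_INTERPOL_ID => Codec::MultiLinearInterpol,
        _ => return Err(format!("unknown fastfield id {}", id)),
    };
    Ok((codec, payload))
}

fn main() {
    assert!(open(&[1, 0xde, 0xad]).is_ok());
    assert!(open(&[42]).is_err());
}
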
@@ -1,4 +1,4 @@
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};
@@ -8,7 +8,6 @@ use crate::space_usage::PerFieldSpaceUsage;
use crate::TantivyError;

use super::reader::DynamicFastFieldReader;
use super::FastFieldReader;

/// Provides access to all of the BitpackedFastFieldReader.
///
@@ -41,7 +40,7 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality
FieldType::Date(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::Date, cardinality)),
FieldType::HierarchicalFacet(_) => Some((FastType::U64, Cardinality::MultiValues)),
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
_ => None,
}
}
@@ -100,23 +99,27 @@ impl FastFieldReaders {
Ok(())
}

pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
&self,
field: Field,
index: usize,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, index)?;
DynamicFastFieldReader::open(fast_field_slice)
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, 0)?;
DynamicFastFieldReader::open(fast_field_slice)
self.typed_fast_field_reader_with_idx(field, 0)
}

pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let fast_field_slice_idx = self.fast_field_data(field, 0)?;
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
let idx_reader = BitpackedFastFieldReader::open(fast_field_slice_idx)?;
let vals_reader: BitpackedFastFieldReader<TFastValue> =
BitpackedFastFieldReader::open(fast_field_slice_vals)?;
let idx_reader = self.typed_fast_field_reader(field)?;
let vals_reader = self.typed_fast_field_reader_with_idx(field, 1)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}

@@ -139,7 +142,7 @@ impl FastFieldReaders {
/// Returns the `i64` fast field reader associated to `field`.
///
/// If `field` is not a i64 fast field, this method returns an Error.
pub fn i64(&self, field: Field) -> crate::Result<impl FastFieldReader<i64>> {
pub fn i64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<i64>> {
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -147,7 +150,7 @@ impl FastFieldReaders {
/// Returns the `date` fast field reader associated to `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: Field) -> crate::Result<impl FastFieldReader<crate::DateTime>> {
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<crate::DateTime>> {
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -155,7 +158,7 @@ impl FastFieldReaders {
/// Returns the `f64` fast field reader associated to `field`.
///
/// If `field` is not a f64 fast field, this method returns an Error.
pub fn f64(&self, field: Field) -> crate::Result<impl FastFieldReader<f64>> {
pub fn f64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<f64>> {
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
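
A sketch of the layout `typed_fast_field_multi_reader` assumes above: one logical field owns two composite entries, the offsets at idx 0 and the values at idx 1 (a HashMap and dummy bytes stand in for the CompositeFile; field id 7 is arbitrary):

use std::collections::HashMap;

fn main() {
    // (field id, component idx) -> serialized component bytes
    let mut composite: HashMap<(u32, usize), Vec<u8>> = HashMap::new();
    composite.insert((7, 0), vec![0, 2, 2, 3]); // offsets component (idx 0)
    composite.insert((7, 1), vec![1, 3, 4]); // values component (idx 1)

    let idx_data = composite.get(&(7, 0)).expect("offsets component");
    let vals_data = composite.get(&(7, 1)).expect("values component");
    assert_eq!(idx_data.len(), 4);
    assert_eq!(vals_data.len(), 3);
}
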
@@ -1,256 +0,0 @@
use crate::common::BinarySerializable;
use crate::common::CompositeWrite;
use crate::common::CountingWriter;
use crate::directory::WritePtr;
use crate::schema::Field;
use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;

/// `CompositeFastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
/// Fast fields have different encodings like bit-packing.
///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
/// The serializer expects to receive the following calls.
///
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `close()`
pub struct CompositeFastFieldSerializer {
composite_write: CompositeWrite<WritePtr>,
}

impl CompositeFastFieldSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
// just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write);
Ok(CompositeFastFieldSerializer { composite_write })
}

/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<DynamicFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
}

/// Start serializing a new u64 fast field
pub fn new_u64_fast_field_with_idx(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
idx: usize,
) -> io::Result<DynamicFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
DynamicFastFieldSerializer::open(field_write, min_value, max_value)
}

/// Start serializing a new [u8] fast field
pub fn new_bytes_fast_field_with_idx(
&mut self,
field: Field,
idx: usize,
) -> FastBytesFieldSerializer<'_, CountingWriter<WritePtr>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
FastBytesFieldSerializer { write: field_write }
}

/// Closes the serializer
///
/// After this call the data must be persistently saved on disk.
pub fn close(self) -> io::Result<()> {
self.composite_write.close()
}
}

#[derive(Debug, Clone)]
pub struct EstimationStats {
min_value: u64,
max_value: u64,
}
/// The FastFieldSerializer trait is the common interface
/// implemented by every fastfield serializer variant.
///
/// `DynamicFastFieldSerializer` is the enum wrapping all variants.
/// It is used to create a serializer instance.
pub trait FastFieldSerializer {
/// add value to serializer
fn add_val(&mut self, val: u64) -> io::Result<()>;
/// finish serializing a field.
fn close_field(self) -> io::Result<()>;
}

/// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldSerializerEstimate {
/// returns an estimate of the compression ratio.
fn estimate(
/*fastfield_accessor: impl FastFieldReader<u64>,*/ stats: EstimationStats,
) -> (f32, &'static str);
/// the unique name of the compressor
fn name() -> &'static str;
}

pub enum DynamicFastFieldSerializer<'a, W: Write> {
Bitpacked(BitpackedFastFieldSerializer<'a, W>),
}

impl<'a, W: Write> DynamicFastFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encodes the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// the minimum number of bits required to encode
/// values.
pub fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<DynamicFastFieldSerializer<'a, W>> {
let stats = EstimationStats {
min_value,
max_value,
};
let (_ratio, name) = (
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(stats),
BitpackedFastFieldSerializer::<Vec<u8>>::name(),
);
Self::open_from_name(write, min_value, max_value, name)
}

/// Creates a new fast field serializer.
///
/// The serializer in fact encodes the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// the minimum number of bits required to encode
/// values.
pub fn open_from_name(
write: &'a mut W,
min_value: u64,
max_value: u64,
name: &str,
) -> io::Result<DynamicFastFieldSerializer<'a, W>> {
// Weirdly the W generic on BitpackedFastFieldSerializer needs to be set,
// although name() doesn't use it
let variant = if name == BitpackedFastFieldSerializer::<Vec<u8>>::name() {
DynamicFastFieldSerializer::Bitpacked(BitpackedFastFieldSerializer::open(
write, min_value, max_value,
)?)
} else {
panic!("unknown fastfield serializer {}", name);
};

Ok(variant)
}
}
impl<'a, W: Write> FastFieldSerializer for DynamicFastFieldSerializer<'a, W> {
fn add_val(&mut self, val: u64) -> io::Result<()> {
match self {
Self::Bitpacked(serializer) => serializer.add_val(val),
}
}
fn close_field(self) -> io::Result<()> {
match self {
Self::Bitpacked(serializer) => serializer.close_field(),
}
}
}

pub struct BitpackedFastFieldSerializer<'a, W: Write> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
num_bits: u8,
}

impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encodes the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// the minimum number of bits required to encode
/// values.
fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
assert!(min_value <= max_value);
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
let num_bits = compute_num_bits(amplitude);
let bit_packer = BitPacker::new();
Ok(BitpackedFastFieldSerializer {
bit_packer,
write,
min_value,
num_bits,
})
}
}

impl<'a, W: 'a + Write> FastFieldSerializer for BitpackedFastFieldSerializer<'a, W> {
/// Pushes a new value to the currently open u64 fast field.
fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
Ok(())
}
fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)
}
}

impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
fn estimate(
/*_fastfield_accessor: impl FastFieldReader<u64>, */ stats: EstimationStats,
) -> (f32, &'static str) {
let amplitude = stats.max_value - stats.min_value;
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
let name = Self::name();
(ratio, name)
}
fn name() -> &'static str {
"Bitpacked"
}
}

pub struct FastBytesFieldSerializer<'a, W: Write> {
write: &'a mut W,
}

impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
self.write.write_all(vals)
}

pub fn flush(&mut self) -> io::Result<()> {
self.write.flush()
}
}
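
A worked sketch of the estimate above: the bitpacked codec stores `(val - min_value)` in `compute_num_bits(amplitude)` bits, so the estimated ratio is `num_bits / 64`. The helper below is a plausible stand-in for tantivy_bitpacker's compute_num_bits, not its actual implementation:

fn compute_num_bits(amplitude: u64) -> u8 {
    // Smallest bit width able to represent `amplitude` (0 needs 0 bits).
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    // Values in 1000..=1255 have amplitude 255 -> 8 bits per value.
    let (min_value, max_value) = (1000u64, 1255u64);
    let num_bits = compute_num_bits(max_value - min_value);
    assert_eq!(num_bits, 8);
    let ratio = num_bits as f32 / 64.0;
    assert_eq!(ratio, 0.125); // 8x smaller than raw u64s
}
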
src/fastfield/serializer/mod.rs (new file, 216 lines)
@@ -0,0 +1,216 @@
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::schema::Field;
use common::BinarySerializable;
use common::CountingWriter;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
pub use fastfield_codecs::FastFieldCodecSerializer;
pub use fastfield_codecs::FastFieldDataAccess;
pub use fastfield_codecs::FastFieldStats;
use std::io::{self, Write};

/// `CompositeFastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
/// Fast fields have different encodings like bit-packing.
///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
/// The serializer expects to receive the following calls.
///
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `close()`
pub struct CompositeFastFieldSerializer {
composite_write: CompositeWrite<WritePtr>,
}

// use this once explicit_generic_args_with_impl_trait is merged and stabilized
// https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
stats: FastFieldStats,
fastfield_accessor: &A,
estimations: &mut Vec<(f32, &str, u8)>,
) {
if !T::is_applicable(fastfield_accessor, stats.clone()) {
return;
}
let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID);
estimations.push((ratio, name, id));
}

impl CompositeFastFieldSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
// just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write);
Ok(CompositeFastFieldSerializer { composite_write })
}

/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
pub fn create_auto_detect_u64_fast_field(
&mut self,
field: Field,
stats: FastFieldStats,
fastfield_accessor: impl FastFieldDataAccess,
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
) -> io::Result<()> {
self.create_auto_detect_u64_fast_field_with_idx(
field,
stats,
fastfield_accessor,
data_iter_1,
data_iter_2,
0,
)
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
pub fn create_auto_detect_u64_fast_field_with_idx(
&mut self,
field: Field,
stats: FastFieldStats,
fastfield_accessor: impl FastFieldDataAccess,
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);

let mut estimations = vec![];

codec_estimation::<BitpackedFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{
warn!(
"broken estimation for fast field codec {}",
broken_estimation.1
);
}
// removing nan values for codecs with broken calculations, and max values which disables codecs
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
let (_ratio, name, id) = estimations[0];
debug!(
"choosing fast field codec {} for field_id {:?}",
name, field
); // todo print actual field name
id.serialize(field_write)?;
match name {
BitpackedFastFieldSerializer::NAME => {
BitpackedFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
LinearInterpolFastFieldSerializer::NAME => {
LinearInterpolFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
MultiLinearInterpolFastFieldSerializer::NAME => {
MultiLinearInterpolFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
_ => {
panic!("unknown fastfield serializer {}", name)
}
};
field_write.flush()?;

Ok(())
}

/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
}

/// Start serializing a new u64 fast field
pub fn new_u64_fast_field_with_idx(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
idx: usize,
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
// Prepend codec id to field data for compatibility with DynamicFastFieldReader.
let id = BitpackedFastFieldSerializer::ID;
id.serialize(field_write)?;
BitpackedFastFieldSerializerLegacy::open(field_write, min_value, max_value)
}

/// Start serializing a new [u8] fast field
pub fn new_bytes_fast_field_with_idx(
&mut self,
field: Field,
idx: usize,
) -> FastBytesFieldSerializer<'_, CountingWriter<WritePtr>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
FastBytesFieldSerializer { write: field_write }
}

/// Closes the serializer
///
/// After this call the data must be persistently saved on disk.
pub fn close(self) -> io::Result<()> {
self.composite_write.close()
}
}

pub struct FastBytesFieldSerializer<'a, W: Write> {
write: &'a mut W,
}

impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
self.write.write_all(vals)
}

pub fn flush(&mut self) -> io::Result<()> {
self.write.flush()
}
}
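
A sketch of the selection logic in `create_auto_detect_u64_fast_field_with_idx` above: each applicable codec reports an estimated ratio, NaN and f32::MAX sentinels are dropped, and the smallest ratio wins. The ratio numbers and ids below are illustrative:

fn main() {
    let mut estimations: Vec<(f32, &str, u8)> = vec![
        (0.5, "Bitpacked", 1),
        (f32::NAN, "LinearInterpol", 2), // broken estimation, will be dropped
        (0.12, "MultiLinearInterpol", 3),
    ];
    // Drop NaNs and f32::MAX sentinels, then pick the lowest ratio.
    estimations.retain(|e| !e.0.is_nan() && e.0 != f32::MAX);
    estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
    let (_ratio, name, id) = estimations[0];
    assert_eq!((name, id), ("MultiLinearInterpol", 3));
}
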
@@ -1,11 +1,12 @@
use super::multivalued::MultiValuedFastFieldWriter;
use crate::common;
use crate::fastfield::serializer::FastFieldSerializer;
use super::serializer::FastFieldStats;
use super::FastFieldDataAccess;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
use crate::termdict::TermOrdinal;
use common;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
@@ -53,7 +54,7 @@ impl FastFieldsWriter {
None => {}
}
}
FieldType::HierarchicalFacet(_) => {
FieldType::Facet(_) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
multi_values_writers.push(fast_field_writer);
}
@@ -265,9 +266,9 @@ impl IntFastFieldWriter {
self.add_val(val);
}

/// Extract the stored data
pub(crate) fn get_data(&self) -> Vec<u64> {
self.vals.iter().collect::<Vec<u64>>()
/// get iterator over the data
pub(crate) fn iter(&self) -> impl Iterator<Item = u64> + '_ {
self.vals.iter()
}

/// Push the fast fields value to the `FastFieldWriter`.
@@ -281,17 +282,59 @@ impl IntFastFieldWriter {
} else {
(self.val_min, self.val_max)
};
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
if let Some(doc_id_map) = doc_id_map {
for doc_id in doc_id_map.iter_old_doc_ids() {
single_field_serializer.add_val(self.vals.get(*doc_id as usize))?;
}
} else {
for val in self.vals.iter() {
single_field_serializer.add_val(val)?;
}
let fastfield_accessor = WriterFastFieldAccessProvider {
doc_id_map,
vals: &self.vals,
};
let stats = FastFieldStats {
min_value: min,
max_value: max,
num_vals: self.val_count as u64,
};

single_field_serializer.close_field()
if let Some(doc_id_map) = doc_id_map {
let iter = doc_id_map
.iter_old_doc_ids()
.map(|doc_id| self.vals.get(doc_id as usize));
serializer.create_auto_detect_u64_fast_field(
self.field,
stats,
fastfield_accessor,
iter.clone(),
iter,
)?;
} else {
serializer.create_auto_detect_u64_fast_field(
self.field,
stats,
fastfield_accessor,
self.vals.iter(),
self.vals.iter(),
)?;
};
Ok(())
}
}

#[derive(Clone)]
struct WriterFastFieldAccessProvider<'map, 'bitp> {
doc_id_map: Option<&'map DocIdMapping>,
vals: &'bitp BlockedBitpacker,
}
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated to the given doc.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the index.
fn get_val(&self, doc: u64) -> u64 {
if let Some(doc_id_map) = self.doc_id_map {
self.vals
.get(doc_id_map.get_old_doc_id(doc as u32) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
} else {
self.vals.get(doc as usize)
}
}
}
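
A sketch of the optional doc-id remapping in `get_val` above: with a doc-id map, a new doc id is translated back to its old position before the value lookup. Slices stand in for the DocIdMapping and BlockedBitpacker:

struct AccessProvider<'a> {
    doc_id_map: Option<&'a [u32]>, // new doc id -> old doc id
    vals: &'a [u64],
}

impl<'a> AccessProvider<'a> {
    fn get_val(&self, doc: u64) -> u64 {
        if let Some(map) = self.doc_id_map {
            self.vals[map[doc as usize] as usize]
        } else {
            self.vals[doc as usize]
        }
    }
}

fn main() {
    let vals = [10, 20, 30];
    let sorted = AccessProvider { doc_id_map: Some(&[2, 0, 1]), vals: &vals };
    assert_eq!(sorted.get_val(0), 30); // new doc 0 was old doc 2
    let unsorted = AccessProvider { doc_id_map: None, vals: &vals };
    assert_eq!(unsorted.get_val(0), 10);
}
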
@@ -26,3 +26,137 @@ pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter;

use self::code::{fieldnorm_to_id, id_to_fieldnorm};

#[cfg(test)]
mod tests {
use crate::directory::CompositeFile;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fieldnorm::FieldNormReader;
use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter;
use crate::query::Query;
use crate::query::TermQuery;
use crate::schema::IndexRecordOption;
use crate::schema::TextFieldIndexing;
use crate::schema::TextOptions;
use crate::schema::TEXT;
use crate::Index;
use crate::Term;
use crate::TERMINATED;
use once_cell::sync::Lazy;
use std::path::Path;

use crate::schema::{Field, Schema, STORED};

pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("field", STORED);
schema_builder.add_text_field("txt_field", TEXT);
schema_builder.add_text_field(
"str_field",
TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::Basic)
.set_fieldnorms(false),
),
);
schema_builder.build()
});

pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
pub static TXT_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap());
pub static STR_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("str_field").unwrap());

#[test]
#[should_panic(expected = "Cannot register a given fieldnorm twice")]
pub fn test_should_panic_when_recording_fieldnorm_twice_for_same_doc() {
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(0u32, *TXT_FIELD, 5);
fieldnorm_writers.record(0u32, *TXT_FIELD, 3);
}

#[test]
pub fn test_fieldnorm() -> crate::Result<()> {
let path = Path::new("test");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let serializer = FieldNormsSerializer::from_write(write)?;
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
fieldnorm_writers.serialize(serializer, None)?;
}
let file = directory.open_read(&path)?;
{
let fields_composite = CompositeFile::open(&file)?;
assert!(fields_composite.open_read(*FIELD).is_none());
assert!(fields_composite.open_read(*STR_FIELD).is_none());
let data = fields_composite.open_read(*TXT_FIELD).unwrap();
let fieldnorm_reader = FieldNormReader::open(data)?;
assert_eq!(fieldnorm_reader.fieldnorm(0u32), 0u32);
assert_eq!(fieldnorm_reader.fieldnorm(1u32), 0u32);
assert_eq!(fieldnorm_reader.fieldnorm(2u32), 5u32);
assert_eq!(fieldnorm_reader.fieldnorm(3u32), 3u32);
}
Ok(())
}

#[test]
fn test_fieldnorm_disabled() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_fieldnorms(false));
let text = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(text=>"hello"))?;
writer.add_document(doc!(text=>"hello hello hello"))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let query = TermQuery::new(
Term::from_field_text(text, "hello"),
IndexRecordOption::WithFreqs,
);
let weight = query.weight(&*searcher, true)?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(scorer.doc(), 0);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), 1);
assert_eq!(scorer.doc(), 1);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), TERMINATED);
Ok(())
}

#[test]
fn test_fieldnorm_enabled() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default()
.set_indexing_options(TextFieldIndexing::default().set_fieldnorms(true));
let text = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(text=>"hello"))?;
writer.add_document(doc!(text=>"hello hello hello"))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let query = TermQuery::new(
Term::from_field_text(text, "hello"),
IndexRecordOption::WithFreqs,
);
let weight = query.weight(&*searcher, true)?;
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
assert_eq!(scorer.doc(), 0);
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
assert_eq!(scorer.advance(), 1);
assert_eq!(scorer.doc(), 1);
assert!((scorer.score() - 0.15136132).abs() < 0.001f32);
assert_eq!(scorer.advance(), TERMINATED);
Ok(())
}
}
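
A sketch of the one-byte fieldnorm encoding these tests exercise: the exact norm is mapped to a u8 id through a lossy, monotonic table. This 4-entry table is illustrative; the real table has 256 entries:

const NORM_TABLE: [u32; 4] = [0, 1, 3, 10];

fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
    // Largest id whose decoded value does not exceed the fieldnorm.
    match NORM_TABLE.binary_search(&fieldnorm) {
        Ok(idx) => idx as u8,
        Err(idx) => (idx - 1) as u8,
    }
}

fn id_to_fieldnorm(id: u8) -> u32 {
    NORM_TABLE[id as usize]
}

fn main() {
    assert_eq!(fieldnorm_to_id(3), 2);
    assert_eq!(fieldnorm_to_id(5), 2); // lossy: 5 rounds down to 3
    assert_eq!(id_to_fieldnorm(2), 3);
}
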
@@ -1,5 +1,5 @@
use super::{fieldnorm_to_id, id_to_fieldnorm};
use crate::common::CompositeFile;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::schema::Field;

@@ -1,4 +1,4 @@
use crate::common::CompositeWrite;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::schema::Field;
use std::io;

@@ -4,6 +4,7 @@ use super::fieldnorm_to_id;
use super::FieldNormsSerializer;
use crate::schema::Field;
use crate::schema::Schema;
use std::cmp::Ordering;
use std::{io, iter};

/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
@@ -12,8 +13,7 @@ use std::{io, iter};
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
/// byte per document per field.
pub struct FieldNormsWriter {
fields: Vec<Field>,
fieldnorms_buffer: Vec<Vec<u8>>,
fieldnorms_buffers: Vec<Option<Vec<u8>>>,
}

impl FieldNormsWriter {
@@ -23,7 +23,7 @@ impl FieldNormsWriter {
schema
.fields()
.filter_map(|(field, field_entry)| {
if field_entry.is_indexed() {
if field_entry.is_indexed() && field_entry.has_fieldnorms() {
Some(field)
} else {
None
@@ -35,25 +35,20 @@ impl FieldNormsWriter {
/// Initialize with state for tracking the field norm fields
/// specified in the schema.
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
let max_field = fields
.iter()
.map(Field::field_id)
.max()
.map(|max_field_id| max_field_id as usize + 1)
.unwrap_or(0);
FieldNormsWriter {
fields,
fieldnorms_buffer: iter::repeat_with(Vec::new)
.take(max_field)
.collect::<Vec<_>>(),
let mut fieldnorms_buffers: Vec<Option<Vec<u8>>> = iter::repeat_with(|| None)
.take(schema.num_fields())
.collect();
for field in FieldNormsWriter::fields_with_fieldnorm(schema) {
fieldnorms_buffers[field.field_id() as usize] = Some(Vec::with_capacity(1_000));
}
FieldNormsWriter { fieldnorms_buffers }
}

/// The memory used, including children.
pub fn mem_usage(&self) -> usize {
self.fieldnorms_buffer
self.fieldnorms_buffers
.iter()
.flatten()
.map(|buf| buf.capacity())
.sum()
}
@@ -62,8 +57,10 @@ impl FieldNormsWriter {
///
/// Will extend with 0-bytes for documents that have not been seen.
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
for field in self.fields.iter() {
self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
for fieldnorms_buffer_opt in self.fieldnorms_buffers.iter_mut() {
if let Some(fieldnorms_buffer) = fieldnorms_buffer_opt.as_mut() {
fieldnorms_buffer.resize(max_doc as usize, 0u8);
}
}
}

@@ -76,14 +73,23 @@ impl FieldNormsWriter {
/// * field - the field being set
/// * fieldnorm - the number of terms present in document `doc` in field `field`
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
assert!(
fieldnorm_buffer.len() <= doc as usize,
"Cannot register a given fieldnorm twice"
);
// we fill intermediary `DocId` as having a fieldnorm of 0.
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
if let Some(fieldnorm_buffer) = self
.fieldnorms_buffers
.get_mut(field.field_id() as usize)
.and_then(Option::as_mut)
{
match fieldnorm_buffer.len().cmp(&(doc as usize)) {
Ordering::Less => {
// we fill intermediary `DocId` as having a fieldnorm of 0.
fieldnorm_buffer.resize(doc as usize, 0u8);
}
Ordering::Equal => {}
Ordering::Greater => {
panic!("Cannot register a given fieldnorm twice")
}
}
fieldnorm_buffer.push(fieldnorm_to_id(fieldnorm));
}
}

/// Serialize the seen fieldnorm values to the serializer for all fields.
@@ -92,17 +98,18 @@ impl FieldNormsWriter {
mut fieldnorms_serializer: FieldNormsSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
for &field in self.fields.iter() {
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map(
|(field_id, fieldnorms_buffer_opt)| {
fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
(Field::from_field_id(field_id as u32), fieldnorms_buffer)
})
},
) {
if let Some(doc_id_map) = doc_id_map {
let mut mapped_fieldnorm_values = vec![];
mapped_fieldnorm_values.resize(fieldnorm_values.len(), 0u8);
for (new_doc_id, old_doc_id) in doc_id_map.iter_old_doc_ids().enumerate() {
mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[*old_doc_id as usize];
}
fieldnorms_serializer.serialize_field(field, &mapped_fieldnorm_values)?;
let remapped_fieldnorm_buffer = doc_id_map.remap(fieldnorms_buffer);
fieldnorms_serializer.serialize_field(field, &remapped_fieldnorm_buffer)?;
} else {
fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
}
}
fieldnorms_serializer.close()?;
|
||||
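The hand-rolled remap loop gives way to a single `doc_id_map.remap(fieldnorms_buffer)` call. Assuming the mapping stores, for each new doc id, the old doc id it came from (as `iter_old_doc_ids` in the removed code suggests), `remap` presumably boils down to a gather. The free function below is a hypothetical equivalent, not tantivy's actual signature:

```rust
// Hypothetical equivalent of `DocIdMapping::remap` over a fieldnorm buffer:
// `new_to_old[new_doc_id]` is the old doc id that moved to `new_doc_id`.
fn remap(new_to_old: &[u32], fieldnorms: &[u8]) -> Vec<u8> {
    new_to_old
        .iter()
        .map(|&old_doc_id| fieldnorms[old_doc_id as usize])
        .collect()
}

fn main() {
    // Three docs reordered as 2, 0, 1: the fieldnorm bytes follow their docs.
    assert_eq!(remap(&[2, 0, 1], &[10, 11, 12]), vec![12, 10, 11]);
}
```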
|
@@ -1,4 +1,8 @@
+use crate::schema;
 use crate::Index;
+use crate::IndexSettings;
+use crate::IndexSortByField;
+use crate::Order;
 use crate::Searcher;
 use crate::{doc, schema::*};
 use rand::thread_rng;
@@ -35,17 +39,17 @@ fn test_functional_store() -> crate::Result<()> {
     let mut doc_set: Vec<u64> = Vec::new();

     let mut doc_id = 0u64;
-    for iteration in 0..500 {
+    for iteration in 0..get_num_iterations() {
         dbg!(iteration);
         let num_docs: usize = rng.gen_range(0..4);
-        if doc_set.len() >= 1 {
+        if !doc_set.is_empty() {
             let doc_to_remove_id = rng.gen_range(0..doc_set.len());
             let removed_doc_id = doc_set.swap_remove(doc_to_remove_id);
             index_writer.delete_term(Term::from_field_u64(id_field, removed_doc_id));
         }
         for _ in 0..num_docs {
             doc_set.push(doc_id);
-            index_writer.add_document(doc!(id_field=>doc_id));
+            index_writer.add_document(doc!(id_field=>doc_id))?;
             doc_id += 1;
         }
         index_writer.commit()?;
@@ -56,16 +60,37 @@ fn test_functional_store() -> crate::Result<()> {
     Ok(())
 }

+fn get_num_iterations() -> usize {
+    std::env::var("NUM_FUNCTIONAL_TEST_ITERATIONS")
+        .map(|str| str.parse().unwrap())
+        .unwrap_or(2000)
+}
 #[test]
 #[ignore]
-fn test_functional_indexing() -> crate::Result<()> {
+fn test_functional_indexing_sorted() -> crate::Result<()> {
     let mut schema_builder = Schema::builder();

-    let id_field = schema_builder.add_u64_field("id", INDEXED);
+    let id_field = schema_builder.add_u64_field("id", INDEXED | FAST);
     let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
+    let text_field_options = TextOptions::default()
+        .set_indexing_options(
+            TextFieldIndexing::default()
+                .set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
+        )
+        .set_stored();
+    let text_field = schema_builder.add_text_field("text_field", text_field_options);
     let schema = schema_builder.build();

-    let index = Index::create_from_tempdir(schema)?;
+    let mut index_builder = Index::builder().schema(schema);
+    index_builder = index_builder.settings(IndexSettings {
+        sort_by_field: Some(IndexSortByField {
+            field: "id".to_string(),
+            order: Order::Desc,
+        }),
+        ..Default::default()
+    });
+    let index = index_builder.create_from_tempdir().unwrap();

     let reader = index.reader()?;

     let mut rng = thread_rng();
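The new `get_num_iterations` helper reads `NUM_FUNCTIONAL_TEST_ITERATIONS` from the environment and defaults to 2000, so CI can dial the functional tests up or down without recompiling. As written, a set-but-unparsable value panics via `unwrap`; a more forgiving variant (my suggestion, not part of the patch) would fall back to the default instead:

```rust
// Variant of `get_num_iterations` that falls back to the default rather than
// panicking when the variable is set but not a valid number.
fn get_num_iterations() -> usize {
    std::env::var("NUM_FUNCTIONAL_TEST_ITERATIONS")
        .ok()
        .and_then(|s| s.parse().ok())
        .unwrap_or(2000)
}

fn main() {
    println!("running {} iterations", get_num_iterations());
}
```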
@@ -75,7 +100,7 @@ fn test_functional_indexing() -> crate::Result<()> {
     let mut committed_docs: HashSet<u64> = HashSet::new();
     let mut uncommitted_docs: HashSet<u64> = HashSet::new();

-    for _ in 0..200 {
+    for _ in 0..get_num_iterations() {
         let random_val = rng.gen_range(0..20);
         if random_val == 0 {
             index_writer.commit()?;
@@ -88,19 +113,95 @@ fn test_functional_indexing() -> crate::Result<()> {
                 &searcher,
                 &committed_docs.iter().cloned().collect::<Vec<u64>>(),
             )?;
-        } else {
-            if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
-                let doc_id_term = Term::from_field_u64(id_field, random_val);
-                index_writer.delete_term(doc_id_term);
-            } else {
-                uncommitted_docs.insert(random_val);
-                let mut doc = Document::new();
-                doc.add_u64(id_field, random_val);
-                for i in 1u64..10u64 {
-                    doc.add_u64(multiples_field, random_val * i);
-                }
-                index_writer.add_document(doc);
-            }
+        } else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
+            let doc_id_term = Term::from_field_u64(id_field, random_val);
+            index_writer.delete_term(doc_id_term);
+        } else {
+            uncommitted_docs.insert(random_val);
+            let mut doc = Document::new();
+            doc.add_u64(id_field, random_val);
+            for i in 1u64..10u64 {
+                doc.add_u64(multiples_field, random_val * i);
+            }
+            doc.add_text(text_field, get_text());
+            index_writer.add_document(doc)?;
+        }
     }
     Ok(())
 }
|
+const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
+                     do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
+                     Ut enim ad minim veniam, quis nostrud exercitation ullamco \
+                     laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
+                     dolor in reprehenderit in voluptate velit esse cillum dolore eu \
+                     fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
+                     proident, sunt in culpa qui officia deserunt mollit anim id est \
+                     laborum.";
+fn get_text() -> String {
+    use rand::seq::SliceRandom;
+    let mut rng = thread_rng();
+    let tokens: Vec<_> = LOREM.split(' ').collect();
+    let random_val = rng.gen_range(0..20);
+
+    (0..random_val)
+        .map(|_| tokens.choose(&mut rng).unwrap())
+        .cloned()
+        .collect::<Vec<_>>()
+        .join(" ")
+}
|
+#[test]
+#[ignore]
+fn test_functional_indexing_unsorted() -> crate::Result<()> {
+    let mut schema_builder = Schema::builder();
+
+    let id_field = schema_builder.add_u64_field("id", INDEXED);
+    let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
+    let text_field_options = TextOptions::default()
+        .set_indexing_options(
+            TextFieldIndexing::default()
+                .set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
+        )
+        .set_stored();
+    let text_field = schema_builder.add_text_field("text_field", text_field_options);
+    let schema = schema_builder.build();
+
+    let index = Index::create_from_tempdir(schema)?;
+    let reader = index.reader()?;
+
+    let mut rng = thread_rng();
+
+    let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
+
+    let mut committed_docs: HashSet<u64> = HashSet::new();
+    let mut uncommitted_docs: HashSet<u64> = HashSet::new();
+
+    for _ in 0..get_num_iterations() {
+        let random_val = rng.gen_range(0..20);
+        if random_val == 0 {
+            index_writer.commit()?;
+            committed_docs.extend(&uncommitted_docs);
+            uncommitted_docs.clear();
+            reader.reload()?;
+            let searcher = reader.searcher();
+            // check that everything is correct.
+            check_index_content(
+                &searcher,
+                &committed_docs.iter().cloned().collect::<Vec<u64>>(),
+            )?;
+        } else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
+            let doc_id_term = Term::from_field_u64(id_field, random_val);
+            index_writer.delete_term(doc_id_term);
+        } else {
+            uncommitted_docs.insert(random_val);
+            let mut doc = Document::new();
+            doc.add_u64(id_field, random_val);
+            for i in 1u64..10u64 {
+                doc.add_u64(multiples_field, random_val * i);
+            }
+            doc.add_text(text_field, get_text());
+            index_writer.add_document(doc)?;
+        }
+    }
+    Ok(())
+}
@@ -1,6 +1,6 @@
 use super::operation::DeleteOperation;
 use crate::Opstamp;
-use std::mem;

 use std::ops::DerefMut;
 use std::sync::{Arc, RwLock, Weak};
@@ -105,7 +105,7 @@ impl DeleteQueue {
             return None;
         }

-        let delete_operations = mem::replace(&mut self_wlock.writer, vec![]);
+        let delete_operations = std::mem::take(&mut self_wlock.writer);

         let new_block = Arc::new(Block {
             operations: Arc::from(delete_operations.into_boxed_slice()),
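`std::mem::take` (stable since Rust 1.40) swaps `Default::default()` into place and returns the previous value, so it replaces the older `mem::replace(&mut ..., vec![])` idiom and lets the `use std::mem;` import go. The two idioms side by side:

```rust
fn main() {
    let mut writer: Vec<u32> = vec![1, 2, 3];

    // Old idiom: explicitly swap in an empty vec.
    let drained = std::mem::replace(&mut writer, vec![]);
    assert_eq!(drained, vec![1, 2, 3]);
    assert!(writer.is_empty());

    writer.extend([4, 5]);

    // New idiom: `take` swaps in `Vec::default()`, i.e. an empty vec.
    let drained = std::mem::take(&mut writer);
    assert_eq!(drained, vec![4, 5]);
    assert!(writer.is_empty());
}
```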
@@ -286,7 +286,7 @@ mod tests {
             operations_it.advance();
         }
         {
-            let mut operations_it = snapshot.clone();
+            let mut operations_it = snapshot;
             assert_eq!(operations_it.get().unwrap().opstamp, 1);
             operations_it.advance();
             assert_eq!(operations_it.get().unwrap().opstamp, 2);
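The last hunk drops a `.clone()` on the final use of `snapshot` (the pattern clippy flags as `redundant_clone`): when a value is not needed afterwards, moving it is equivalent and skips an allocation. A tiny illustration:

```rust
fn main() {
    let snapshot = vec![1, 2, 3];
    // `snapshot.clone().into_iter()` would allocate a second vec; since
    // `snapshot` is never used again below, a plain move does the same job.
    let mut operations_it = snapshot.into_iter();
    assert_eq!(operations_it.next(), Some(1));
}
```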
Some files were not shown because too many files have changed in this diff.