Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-04 08:12:54 +00:00)

Compare commits: fix-issue-...revert-122 (113 commits)
Commits in this comparison (SHA1, newest first):

bd083159ac, dde49ac8e2, c3cc93406d, bd0f9211da, c503c6e4fa, 02174d26af, cf92be3bd6, 72cef12db1, bbc0a2e233, 4fd1a6c84b, c83d99c414, eacf510175, 8802d125f8, 33301a3eb4, 7234bef0eb, fcff91559b, b75d4e59d1, c6b5ab1dbe, c12e07f0ce, 8b877a4c26, 7dc0dc1c9b, 0462754673, 5916ceda73, 70283dc6c8, dbaf4f3623, 4808648322, 54afb9b34a, d336c8b938, 980d1b2796, 6317982876, e2fbbc08ca, 99cd25beae, 737ecc7015, 09668459c8, e5fd30f438, c412a46105, 3a78402496, d18ac136c0, b5b1244857, 27acfa4dea, 02cffa4dea, b52abbc771, 894c61867f, 352e0cc58d, ffe4446d90, 4d05b26e7a, 0855649986, d828e58903, aa0396fe27, 8d8315f8d0, 078c0a2e2e, f21e8dd875, 74e36c7e97, f27ae04282, 0ce49c9dd4, fe8e58e078, efc0d8341b, 22bcc83d10, 5ee5037934, c217bfed1e, c27ccd3e24, 367f5da782, b256df6599, d7a6a409a1, a1f5cead96, 37c5fe3c86, 4583fa270b, beb3a5bd73, 93cbd52bf0, c22177a005, 4da71273e1, 2c78b31aab, 4ae1d87632, 46b86a7976, 3bc177e69d, 319609e9c1, 9d87b89718, dd81e38e53, 9f32b22602, 096ce7488e, a1782dd172, 000d76b11a, abd29f6646, b4ecf0ab2f, 798f7dbf67, 06a2e47c8d, e0b83eb291, 13401f46ea, 1a45b030dc, 62052bcc2d, 3265f7bec3, ee0881712a, 483e0336b6, 3e8f267e33, 3b247fd968, 750f6e6479, 5b475e6603, 0ca7f73dc5, 47ed18845e, dc141cdb29, f6cf6e889b, f379a80233, 4a320fd1ff, 85d23e8e3b, 022ab9d298, 605e8603dc, 70f160b329, 6d265e6bed, fdc512391b, 108714c934, 44e8cf98a5, f0ee69d9e9, b8a10c8406
.github/dependabot.yml (vendored, 7 additions)

```diff
@@ -6,3 +6,10 @@ updates:
       interval: daily
       time: "20:00"
     open-pull-requests-limit: 10
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: daily
+      time: "20:00"
+    open-pull-requests-limit: 10
```
.github/workflows/coverage.yml (vendored, 34 changes)

```diff
@@ -1,27 +1,25 @@
-name: coverage
+name: Coverage

 on:
   push:
     branches: [ main ]
   pull_request:
     branches: [ main ]

 jobs:
-  test:
-    name: coverage
-    runs-on: ubuntu-latest
-    container:
-      image: xd009642/tarpaulin:develop-nightly
-      options: --security-opt seccomp=unconfined
+  coverage:
+    runs-on: ubuntu-latest
     steps:
-      - name: Checkout repository
-        uses: actions/checkout@v2
-
-      - name: Generate code coverage
-        run: |
-          cargo +nightly tarpaulin --verbose --all-features --workspace --timeout 120 --out Xml
-
-      - name: Upload to codecov.io
-        uses: codecov/codecov-action@v1
+      - uses: actions/checkout@v2
+      - name: Install Rust
+        run: rustup toolchain install nightly --component llvm-tools-preview
+      - name: Install cargo-llvm-cov
+        run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
+      - name: Generate code coverage
+        run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v2
        with:
-          # token: ${{secrets.CODECOV_TOKEN}} # not required for public repos
-          fail_ci_if_error: true
+          token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
+          files: lcov.info
+          fail_ci_if_error: true
```
.github/workflows/long_running.yml (vendored, new file, 24 lines)

```yaml
name: Rust

on:
  push:
    branches: [ main ]

env:
  CARGO_TERM_COLOR: always
  NUM_FUNCTIONAL_TEST_ITERATIONS: 20000

jobs:
  functional_test_unsorted:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Run indexing_unsorted
        run: cargo test indexing_unsorted -- --ignored
  functional_test_sorted:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Run indexing_sorted
        run: cargo test indexing_sorted -- --ignored
```
.github/workflows/test.yml (vendored, 2 changes)

```diff
@@ -10,7 +10,7 @@ env:
   CARGO_TERM_COLOR: always

 jobs:
-  build:
+  test:

     runs-on: ubuntu-latest
```
.gitignore (vendored, 1 addition)

```diff
@@ -1,4 +1,5 @@
 tantivy.iml
+.cargo
 proptest-regressions
 *.swp
 target
```
.travis.yml (deleted, 92 lines)

```diff
@@ -1,92 +0,0 @@
-# Based on the "trust" template v0.1.2
-# https://github.com/japaric/trust/tree/v0.1.2
-
-dist: trusty
-language: rust
-services: docker
-sudo: required
-
-env:
-  global:
-    - CRATE_NAME=tantivy
-    - TRAVIS_CARGO_NIGHTLY_FEATURE=""
-    # - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
-
-addons:
-  apt:
-    sources:
-      - ubuntu-toolchain-r-test
-      - kalakris-cmake
-    packages:
-      - gcc-4.8
-      - g++-4.8
-      - libcurl4-openssl-dev
-      - libelf-dev
-      - libdw-dev
-      - binutils-dev
-      - cmake
-
-matrix:
-  include:
-    # Android
-    - env: TARGET=aarch64-linux-android DISABLE_TESTS=1
-    #- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
-    #- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
-    #- env: TARGET=i686-linux-android DISABLE_TESTS=1
-    #- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
-
-    # Linux
-    #- env: TARGET=aarch64-unknown-linux-gnu
-    #- env: TARGET=i686-unknown-linux-gnu
-    - env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 #UPLOAD_DOCS=1
-    # - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
-
-    # OSX
-    #- env: TARGET=x86_64-apple-darwin
-    #  os: osx
-
-before_install:
-  - set -e
-  - rustup self update
-  - rustup component add rustfmt
-
-install:
-  - sh ci/install.sh
-  - source ~/.cargo/env || true
-  - env | grep "TRAVIS"
-
-before_script:
-  - export PATH=$HOME/.cargo/bin:$PATH
-  - cargo install cargo-update || echo "cargo-update already installed"
-  - cargo install cargo-travis || echo "cargo-travis already installed"
-
-script:
-  - bash ci/script.sh
-  - cargo fmt --all -- --check
-
-before_deploy:
-  - sh ci/before_deploy.sh
-
-after_success:
-  # Needs GH_TOKEN env var to be set in travis settings
-  - if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
-  - if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
-
-#cache: cargo
-#before_cache:
-#  # Travis can't cache files that are not readable by "others"
-#  - chmod -R a+r $HOME/.cargo
-#  - find ./target/debug -type f -maxdepth 1 -delete
-#  - rm -f ./target/.rustc_info.json
-#  - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
-#  - rm -r target/debug/examples/
-#  - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
-
-#branches:
-#  only:
-#    # release tags
-#    - /^v\d+\.\d+\.\d+.*$/
-#    - master
-
-notifications:
-  email:
-    on_success: never
```
CHANGELOG.md (20 changes)

```diff
@@ -1,3 +1,21 @@
+Tantivy 0.17
+================================
+- Change to a non-strict schema: fields present in the data but not defined in the schema are now ignored. Previously this returned an error. #1211
+
+Tantivy 0.16.2
+================================
+- Bugfix in FuzzyTermQuery. (transposition_cost_one was not doing anything.)
+
+Tantivy 0.16.1
+========================
+- Major bugfix on multivalued fastfield. #1151
+- Demux operation. (@PSeitz)
+
+Tantivy 0.16.0
+=========================
+- Bugfix in the filesum check. (@evanxg852000) #1127
+- Bugfix in positions when the index is sorted by a field. (@appaquet) #1125
+
 Tantivy 0.15.3
 =========================
 - Major bugfix. Deleting documents was broken when the index was sorted by a field. (@appaquet, @fulmicoton) #1101

@@ -104,7 +122,7 @@ Tantivy 0.12.0
 ## How to update?

 Crates relying on a custom tokenizer, or registering a tokenizer in the manager, will require some
-minor changes. Check https://github.com/tantivy-search/tantivy/blob/main/examples/custom_tokenizer.rs
+minor changes. Check https://github.com/quickwit-inc/tantivy/blob/main/examples/custom_tokenizer.rs
 for a code sample.

 Tantivy 0.11.3
```
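The headline 0.17 change, the non-strict schema, is easiest to see with a document that carries an undeclared field. A minimal sketch, assuming the `Schema::parse_document` API of the 0.16/0.17 line; the field names are illustrative:

```rust
use tantivy::schema::{Schema, TEXT};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();

    // "body" is not declared in the schema. With the non-strict schema
    // (#1211) the unknown field is silently dropped; before 0.17,
    // parsing a document with an unknown field returned an error.
    let doc = schema
        .parse_document(r#"{"title": "Frankenstein", "body": "ignored"}"#)
        .expect("unknown fields are ignored as of 0.17");
    assert!(doc.get_first(title).is_some());
}
```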
Cargo.toml (28 changes)

```diff
@@ -1,13 +1,13 @@
 [package]
 name = "tantivy"
-version = "0.16.0-dev"
+version = "0.17.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
 description = """Search engine library"""
 documentation = "https://docs.rs/tantivy/"
-homepage = "https://github.com/tantivy-search/tantivy"
-repository = "https://github.com/tantivy-search/tantivy"
+homepage = "https://github.com/quickwit-inc/tantivy"
+repository = "https://github.com/quickwit-inc/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2018"
@@ -19,8 +19,8 @@ crc32fast = "1.2.1"
 once_cell = "1.7.2"
 regex = { version = "1.5.4", default-features = false, features = ["std"] }
 tantivy-fst = "0.3"
-memmap2 = { version = "0.3", optional = true }
-lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
+memmap2 = { version = "0.5", optional = true }
+lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true }
 brotli = { version = "3.3", optional = true }
 snap = { version = "1.0.5", optional = true }
 tempfile = { version = "3.2", optional = true }
@@ -31,13 +31,13 @@ num_cpus = "1.13"
 fs2 = { version = "0.4.3", optional = true }
 levenshtein_automata = "0.2"
 uuid = { version = "0.8.2", features = ["v4", "serde"] }
-crossbeam = "0.8"
+crossbeam = "0.8.1"
 futures = { version = "0.3.15", features = ["thread-pool"] }
 tantivy-query-grammar = { version = "0.15.0", path = "./query-grammar" }
 tantivy-bitpacker = { version = "0.1", path = "./bitpacker" }
-common = { version = "0.1", path = "./common" }
+common = { version = "0.1", path = "./common/", package = "tantivy-common" }
 fastfield_codecs = { version = "0.1", path = "./fastfield_codecs", default-features = false }
-ownedbytes = { version = "0.1", path = "./ownedbytes" }
+ownedbytes = { version = "0.2", path = "./ownedbytes" }
 stable_deref_trait = "1.2"
 rust-stemmers = "1.2"
 downcast-rs = "1.2"
@@ -46,15 +46,15 @@ census = "0.4"
 fnv = "1.0.7"
 thiserror = "1.0.24"
 htmlescape = "0.3.1"
-fail = "0.4"
+fail = "0.5"
 murmurhash32 = "0.2"
 chrono = "0.4.19"
 smallvec = "1.6.1"
 rayon = "1.5"
-lru = "0.6.5"
+lru = "0.7.0"
 fastdivide = "0.3"
 itertools = "0.10.0"
-measure_time = "0.7.0"
+measure_time = "0.8.0"

 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"
@@ -64,10 +64,12 @@ rand = "0.8.3"
 maplit = "1.0.2"
 matches = "0.1.8"
 proptest = "1.0"
-criterion = "0.3.4"
+criterion = "0.3.5"
 test-env-log = "0.2.7"
+env_logger = "0.9.0"

 [dev-dependencies.fail]
-version = "0.4"
+version = "0.5"
 features = ["failpoints"]

 [profile.release]
```
README.md (18 changes)

```diff
@@ -1,9 +1,9 @@
-[](https://travis-ci.org/tantivy-search/tantivy)
-[](https://codecov.io/gh/tantivy-search/tantivy)
-[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 [](https://docs.rs/crate/tantivy/)
+[](https://github.com/quickwit-inc/tantivy/actions/workflows/test.yml)
+[](https://codecov.io/gh/quickwit-inc/tantivy)
+[](https://discord.gg/MT27AG5EVE)
 [](https://opensource.org/licenses/MIT)
 [](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/main)
 [](https://crates.io/crates/tantivy)

@@ -17,9 +17,6 @@
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)

-[](https://www.patreon.com/fulmicoton)
-
 **Tantivy** is a **full text search engine library** written in Rust.

 It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
@@ -78,13 +75,12 @@ It walks you through getting a wikipedia search engine up and running in a few minutes.

 There are many ways to support this project.

-- Use Tantivy and tell us about your experience on [Gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
+- Use Tantivy and tell us about your experience on [Discord](https://discord.gg/MT27AG5EVE) or by email (paul.masurel@gmail.com)
 - Report bugs
 - Write a blog post
 - Help with documentation by asking questions or submitting PRs
-- Contribute code (you can join [our Gitter](https://gitter.im/tantivy-search/tantivy))
+- Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE))
 - Talk about Tantivy around you

 # Contributing code

@@ -96,7 +92,7 @@ Tantivy compiles on stable Rust but requires `Rust >= 1.27`.
 To check out and run tests, you can simply run:

 ```bash
-git clone https://github.com/tantivy-search/tantivy.git
+git clone https://github.com/quickwit-inc/tantivy.git
 cd tantivy
 cargo build
 ```
```
bitpacker/Cargo.toml

```diff
@@ -1,12 +1,12 @@
 [package]
 name = "tantivy-bitpacker"
-version = "0.1.0"
+version = "0.1.1"
 edition = "2018"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = []
 description = """Tantivy-sub crate: bitpacking"""
-repository = "https://github.com/tantivy-search/tantivy"
+repository = "https://github.com/quickwit-inc/tantivy"
 keywords = []
```
bitpacker/src/lib.rs

```diff
@@ -50,3 +50,32 @@ where
     }
     None
 }
+
+#[test]
+fn test_compute_num_bits() {
+    assert_eq!(compute_num_bits(1), 1u8);
+    assert_eq!(compute_num_bits(0), 0u8);
+    assert_eq!(compute_num_bits(2), 2u8);
+    assert_eq!(compute_num_bits(3), 2u8);
+    assert_eq!(compute_num_bits(4), 3u8);
+    assert_eq!(compute_num_bits(255), 8u8);
+    assert_eq!(compute_num_bits(256), 9u8);
+    assert_eq!(compute_num_bits(5_000_000_000), 33u8);
+}
+
+#[test]
+fn test_minmax_empty() {
+    let vals: Vec<u32> = vec![];
+    assert_eq!(minmax(vals.into_iter()), None);
+}
+
+#[test]
+fn test_minmax_one() {
+    assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
+}
+
+#[test]
+fn test_minmax_two() {
+    assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
+    assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
+}
```
common/Cargo.toml

```diff
@@ -1,5 +1,5 @@
 [package]
-name = "common"
+name = "tantivy-common"
 version = "0.1.0"
 authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
 license = "MIT"
@@ -10,3 +10,8 @@ description = "common traits and utility functions used by multiple tantivy subcrates"

 [dependencies]
 byteorder = "1.4.3"
+ownedbytes = { version = "0.2", path = "../ownedbytes" }
+
+[dev-dependencies]
+proptest = "1.0.0"
+rand = "0.8.4"
```
common/src/bitset.rs (new file, 744 lines)

```rust
use ownedbytes::OwnedBytes;
use std::convert::TryInto;
use std::io::Write;
use std::u64;
use std::{fmt, io};

#[derive(Clone, Copy, Eq, PartialEq)]
pub struct TinySet(u64);

impl fmt::Debug for TinySet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.into_iter().collect::<Vec<u32>>().fmt(f)
    }
}

pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator {
    type Item = u32;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.0.pop_lowest()
    }
}

impl IntoIterator for TinySet {
    type Item = u32;
    type IntoIter = TinySetIterator;
    fn into_iter(self) -> Self::IntoIter {
        TinySetIterator(self)
    }
}

impl TinySet {
    pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
        writer.write_all(self.0.to_le_bytes().as_ref())
    }

    pub fn into_bytes(self) -> [u8; 8] {
        self.0.to_le_bytes()
    }

    #[inline]
    pub fn deserialize(data: [u8; 8]) -> Self {
        let val: u64 = u64::from_le_bytes(data);
        TinySet(val)
    }

    /// Returns an empty `TinySet`.
    #[inline]
    pub fn empty() -> TinySet {
        TinySet(0u64)
    }

    /// Returns a full `TinySet`.
    #[inline]
    pub fn full() -> TinySet {
        TinySet::empty().complement()
    }

    pub fn clear(&mut self) {
        self.0 = 0u64;
    }

    /// Returns the complement of the set in `[0, 64[`.
    ///
    /// Careful about making this function public, as it would break the
    /// padding handling in the last bucket.
    #[inline]
    fn complement(self) -> TinySet {
        TinySet(!self.0)
    }

    /// Returns true iff the `TinySet` contains the element `el`.
    #[inline]
    pub fn contains(self, el: u32) -> bool {
        !self.intersect(TinySet::singleton(el)).is_empty()
    }

    /// Returns the number of elements in the `TinySet`.
    #[inline]
    pub fn len(self) -> u32 {
        self.0.count_ones()
    }

    /// Returns the intersection of `self` and `other`.
    #[inline]
    pub fn intersect(self, other: TinySet) -> TinySet {
        TinySet(self.0 & other.0)
    }

    /// Creates a new `TinySet` containing only one element
    /// within `[0; 64[`.
    #[inline]
    pub fn singleton(el: u32) -> TinySet {
        TinySet(1u64 << u64::from(el))
    }

    /// Inserts a new element within [0..64).
    #[inline]
    pub fn insert(self, el: u32) -> TinySet {
        self.union(TinySet::singleton(el))
    }

    /// Removes an element within [0..64).
    #[inline]
    pub fn remove(self, el: u32) -> TinySet {
        self.intersect(TinySet::singleton(el).complement())
    }

    /// Inserts a new element within [0..64).
    ///
    /// Returns true if the set changed.
    #[inline]
    pub fn insert_mut(&mut self, el: u32) -> bool {
        let old = *self;
        *self = old.insert(el);
        old != *self
    }

    /// Removes an element within [0..64).
    ///
    /// Returns true if the set changed.
    #[inline]
    pub fn remove_mut(&mut self, el: u32) -> bool {
        let old = *self;
        *self = old.remove(el);
        old != *self
    }

    /// Returns the union of two tinysets.
    #[inline]
    pub fn union(self, other: TinySet) -> TinySet {
        TinySet(self.0 | other.0)
    }

    /// Returns true iff the `TinySet` is empty.
    #[inline]
    pub fn is_empty(self) -> bool {
        self.0 == 0u64
    }

    /// Returns the lowest element in the `TinySet`
    /// and removes it.
    #[inline]
    pub fn pop_lowest(&mut self) -> Option<u32> {
        if self.is_empty() {
            None
        } else {
            let lowest = self.0.trailing_zeros() as u32;
            self.0 ^= TinySet::singleton(lowest).0;
            Some(lowest)
        }
    }

    /// Returns a `TinySet` that contains all values up
    /// to the limit, excluded.
    ///
    /// The limit is assumed to be strictly lower than 64.
    pub fn range_lower(upper_bound: u32) -> TinySet {
        TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
    }

    /// Returns a `TinySet` that contains all values greater than
    /// or equal to the given limit, included (and up to 63).
    ///
    /// The limit is assumed to be strictly lower than 64.
    pub fn range_greater_or_equal(from_included: u32) -> TinySet {
        TinySet::range_lower(from_included).complement()
    }
}

#[derive(Clone)]
pub struct BitSet {
    tinysets: Box<[TinySet]>,
    len: u64,
    max_value: u32,
}

fn num_buckets(max_val: u32) -> u32 {
    (max_val + 63u32) / 64u32
}

impl BitSet {
    /// Serializes a `BitSet`.
    pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
        writer.write_all(self.max_value.to_le_bytes().as_ref())?;
        for tinyset in self.tinysets.iter().cloned() {
            writer.write_all(&tinyset.into_bytes())?;
        }
        writer.flush()?;
        Ok(())
    }

    /// Creates a new `BitSet` that may contain elements
    /// within `[0, max_val)`.
    pub fn with_max_value(max_value: u32) -> BitSet {
        let num_buckets = num_buckets(max_value);
        let tinybitsets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
        BitSet {
            tinysets: tinybitsets,
            len: 0,
            max_value,
        }
    }

    /// Creates a new `BitSet` that may contain elements
    /// within `[0, max_val)`. Initially all values are set.
    pub fn with_max_value_and_full(max_value: u32) -> BitSet {
        let num_buckets = num_buckets(max_value);
        let mut tinybitsets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();

        // Fix padding
        let lower = max_value % 64u32;
        if lower != 0 {
            tinybitsets[tinybitsets.len() - 1] = TinySet::range_lower(lower);
        }
        BitSet {
            tinysets: tinybitsets,
            len: max_value as u64,
            max_value,
        }
    }

    /// Removes all elements from the `BitSet`.
    pub fn clear(&mut self) {
        for tinyset in self.tinysets.iter_mut() {
            *tinyset = TinySet::empty();
        }
    }

    /// Intersects with a serialized bitset.
    pub fn intersect_update(&mut self, other: &ReadOnlyBitSet) {
        self.intersect_update_with_iter(other.iter_tinysets());
    }

    /// Intersects with an iterator of tinysets.
    fn intersect_update_with_iter(&mut self, other: impl Iterator<Item = TinySet>) {
        self.len = 0;
        for (left, right) in self.tinysets.iter_mut().zip(other) {
            *left = left.intersect(right);
            self.len += left.len() as u64;
        }
    }

    /// Returns the number of elements in the `BitSet`.
    #[inline]
    pub fn len(&self) -> usize {
        self.len as usize
    }

    /// Inserts an element in the `BitSet`.
    #[inline]
    pub fn insert(&mut self, el: u32) {
        // we do not check saturated els.
        let higher = el / 64u32;
        let lower = el % 64u32;
        self.len += if self.tinysets[higher as usize].insert_mut(lower) {
            1
        } else {
            0
        };
    }

    /// Removes an element from the `BitSet`.
    #[inline]
    pub fn remove(&mut self, el: u32) {
        // we do not check saturated els.
        let higher = el / 64u32;
        let lower = el % 64u32;
        self.len -= if self.tinysets[higher as usize].remove_mut(lower) {
            1
        } else {
            0
        };
    }

    /// Returns true iff the element is in the `BitSet`.
    #[inline]
    pub fn contains(&self, el: u32) -> bool {
        self.tinyset(el / 64u32).contains(el % 64)
    }

    /// Returns the first non-empty `TinySet` associated with a bucket equal
    /// to or greater than `bucket`.
    ///
    /// Reminder: the tiny set with the bucket `bucket` represents the
    /// elements from `bucket * 64` to `(bucket+1) * 64`.
    pub fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
        self.tinysets[bucket as usize..]
            .iter()
            .cloned()
            .position(|tinyset| !tinyset.is_empty())
            .map(|delta_bucket| bucket + delta_bucket as u32)
    }

    #[inline]
    pub fn max_value(&self) -> u32 {
        self.max_value
    }

    /// Returns the tiny bitset representing the
    /// set restricted to the number range from
    /// `bucket * 64` to `(bucket + 1) * 64`.
    pub fn tinyset(&self, bucket: u32) -> TinySet {
        self.tinysets[bucket as usize]
    }
}

/// Serialized BitSet.
#[derive(Clone)]
pub struct ReadOnlyBitSet {
    data: OwnedBytes,
    max_value: u32,
}

pub fn intersect_bitsets(left: &ReadOnlyBitSet, other: &ReadOnlyBitSet) -> ReadOnlyBitSet {
    assert_eq!(left.max_value(), other.max_value());
    assert_eq!(left.data.len(), other.data.len());
    let union_tinyset_it = left
        .iter_tinysets()
        .zip(other.iter_tinysets())
        .map(|(left_tinyset, right_tinyset)| left_tinyset.intersect(right_tinyset));
    let mut output_dataset: Vec<u8> = Vec::with_capacity(left.data.len());
    for tinyset in union_tinyset_it {
        output_dataset.extend_from_slice(&tinyset.into_bytes());
    }
    ReadOnlyBitSet {
        data: OwnedBytes::new(output_dataset),
        max_value: left.max_value(),
    }
}

impl ReadOnlyBitSet {
    pub fn open(data: OwnedBytes) -> Self {
        let (max_value_data, data) = data.split(4);
        assert_eq!(data.len() % 8, 0);
        let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap());
        ReadOnlyBitSet { data, max_value }
    }

    /// Number of elements in the bitset.
    #[inline]
    pub fn len(&self) -> usize {
        self.iter_tinysets()
            .map(|tinyset| tinyset.len() as usize)
            .sum()
    }

    /// Iterates over the tinysets on the fly from the serialized data.
    #[inline]
    fn iter_tinysets(&self) -> impl Iterator<Item = TinySet> + '_ {
        self.data.chunks_exact(8).map(move |chunk| {
            let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap());
            tinyset
        })
    }

    /// Iterates over the positions of the elements.
    #[inline]
    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
        self.iter_tinysets()
            .enumerate()
            .flat_map(move |(chunk_num, tinyset)| {
                let chunk_base_val = chunk_num as u32 * 64;
                tinyset
                    .into_iter()
                    .map(move |val| val + chunk_base_val)
                    .take_while(move |doc| *doc < self.max_value)
            })
    }

    /// Returns true iff the element is in the `BitSet`.
    #[inline]
    pub fn contains(&self, el: u32) -> bool {
        let byte_offset = el / 8u32;
        let b: u8 = self.data[byte_offset as usize];
        let shift = (el % 8) as u8;
        b & (1u8 << shift) != 0
    }

    /// Maximum value the bitset may contain.
    /// (Note this is not the maximum value contained in the set.)
    ///
    /// A bitset has an intrinsic capacity.
    /// It only stores elements within [0..max_value).
    #[inline]
    pub fn max_value(&self) -> u32 {
        self.max_value
    }

    /// Number of bytes used in the bitset representation.
    pub fn num_bytes(&self) -> usize {
        self.data.len()
    }
}

impl<'a> From<&'a BitSet> for ReadOnlyBitSet {
    fn from(bitset: &'a BitSet) -> ReadOnlyBitSet {
        let mut buffer = Vec::with_capacity(bitset.tinysets.len() * 8 + 4);
        bitset
            .serialize(&mut buffer)
            .expect("serializing into a buffer should never fail");
        ReadOnlyBitSet::open(OwnedBytes::new(buffer))
    }
}

#[cfg(test)]
mod tests {

    use super::BitSet;
    use super::ReadOnlyBitSet;
    use super::TinySet;
    use ownedbytes::OwnedBytes;
    use rand::distributions::Bernoulli;
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};
    use std::collections::HashSet;

    #[test]
    fn test_read_serialized_bitset_full_multi() {
        for i in 0..1000 {
            let bitset = BitSet::with_max_value_and_full(i);
            let mut out = vec![];
            bitset.serialize(&mut out).unwrap();

            let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
            assert_eq!(bitset.len() as usize, i as usize);
        }
    }

    #[test]
    fn test_read_serialized_bitset_full_block() {
        let bitset = BitSet::with_max_value_and_full(64);
        let mut out = vec![];
        bitset.serialize(&mut out).unwrap();

        let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
        assert_eq!(bitset.len() as usize, 64 as usize);
    }

    #[test]
    fn test_read_serialized_bitset_full() {
        let mut bitset = BitSet::with_max_value_and_full(5);
        bitset.remove(3);
        let mut out = vec![];
        bitset.serialize(&mut out).unwrap();

        let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
        assert_eq!(bitset.len(), 4);
    }

    #[test]
    fn test_bitset_intersect() {
        let bitset_serialized = {
            let mut bitset = BitSet::with_max_value_and_full(5);
            bitset.remove(1);
            bitset.remove(3);
            let mut out = vec![];
            bitset.serialize(&mut out).unwrap();

            ReadOnlyBitSet::open(OwnedBytes::new(out))
        };

        let mut bitset = BitSet::with_max_value_and_full(5);
        bitset.remove(1);
        bitset.intersect_update(&bitset_serialized);

        assert!(bitset.contains(0));
        assert!(!bitset.contains(1));
        assert!(bitset.contains(2));
        assert!(!bitset.contains(3));
        assert!(bitset.contains(4));

        bitset.intersect_update_with_iter(vec![TinySet::singleton(0)].into_iter());

        assert!(bitset.contains(0));
        assert!(!bitset.contains(1));
        assert!(!bitset.contains(2));
        assert!(!bitset.contains(3));
        assert!(!bitset.contains(4));
        assert_eq!(bitset.len(), 1);

        bitset.intersect_update_with_iter(vec![TinySet::singleton(1)].into_iter());
        assert!(!bitset.contains(0));
        assert!(!bitset.contains(1));
        assert!(!bitset.contains(2));
        assert!(!bitset.contains(3));
        assert!(!bitset.contains(4));
        assert_eq!(bitset.len(), 0);
    }

    #[test]
    fn test_read_serialized_bitset_empty() {
        let mut bitset = BitSet::with_max_value(5);
        bitset.insert(3);
        let mut out = vec![];
        bitset.serialize(&mut out).unwrap();

        let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
        assert_eq!(bitset.len(), 1);

        {
            let bitset = BitSet::with_max_value(5);
            let mut out = vec![];
            bitset.serialize(&mut out).unwrap();
            let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
            assert_eq!(bitset.len(), 0);
        }
    }

    #[test]
    fn test_tiny_set_remove() {
        {
            let mut u = TinySet::empty().insert(63u32).insert(5).remove(63u32);
            assert_eq!(u.pop_lowest(), Some(5u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty()
                .insert(63u32)
                .insert(1)
                .insert(5)
                .remove(63u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert_eq!(u.pop_lowest(), Some(5u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty().insert(1).remove(63u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty().insert(1).remove(1u32);
            assert!(u.pop_lowest().is_none());
        }
    }

    #[test]
    fn test_tiny_set() {
        assert!(TinySet::empty().is_empty());
        {
            let mut u = TinySet::empty().insert(1u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none())
        }
        {
            let mut u = TinySet::empty().insert(1u32).insert(1u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none())
        }
        {
            let mut u = TinySet::empty().insert(2u32);
            assert_eq!(u.pop_lowest(), Some(2u32));
            u.insert_mut(1u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty().insert(63u32);
            assert_eq!(u.pop_lowest(), Some(63u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty().insert(63u32).insert(5);
            assert_eq!(u.pop_lowest(), Some(5u32));
            assert_eq!(u.pop_lowest(), Some(63u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let original = TinySet::empty().insert(63u32).insert(5);
            let after_serialize_deserialize = TinySet::deserialize(original.into_bytes());
            assert_eq!(original, after_serialize_deserialize);
        }
    }

    #[test]
    fn test_bitset() {
        let test_against_hashset = |els: &[u32], max_value: u32| {
            let mut hashset: HashSet<u32> = HashSet::new();
            let mut bitset = BitSet::with_max_value(max_value);
            for &el in els {
                assert!(el < max_value);
                hashset.insert(el);
                bitset.insert(el);
            }
            for el in 0..max_value {
                assert_eq!(hashset.contains(&el), bitset.contains(el));
            }
            assert_eq!(bitset.max_value(), max_value);

            // test deser
            let mut data = vec![];
            bitset.serialize(&mut data).unwrap();
            let ro_bitset = ReadOnlyBitSet::open(OwnedBytes::new(data));
            for el in 0..max_value {
                assert_eq!(hashset.contains(&el), ro_bitset.contains(el));
            }
            assert_eq!(ro_bitset.max_value(), max_value);
            assert_eq!(ro_bitset.len(), els.len());
        };

        test_against_hashset(&[], 0);
        test_against_hashset(&[], 1);
        test_against_hashset(&[0u32], 1);
        test_against_hashset(&[0u32], 100);
        test_against_hashset(&[1u32, 2u32], 4);
        test_against_hashset(&[99u32], 100);
        test_against_hashset(&[63u32], 64);
        test_against_hashset(&[62u32, 63u32], 64);
    }

    #[test]
    fn test_bitset_num_buckets() {
        use super::num_buckets;
        assert_eq!(num_buckets(0u32), 0);
        assert_eq!(num_buckets(1u32), 1);
        assert_eq!(num_buckets(64u32), 1);
        assert_eq!(num_buckets(65u32), 2);
        assert_eq!(num_buckets(128u32), 2);
        assert_eq!(num_buckets(129u32), 3);
    }

    #[test]
    fn test_tinyset_range() {
        assert_eq!(
            TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
            [0, 1, 2]
        );
        assert!(TinySet::range_lower(0).is_empty());
        assert_eq!(
            TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
            (0u32..63u32).collect::<Vec<_>>()
        );
        assert_eq!(
            TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
            [0]
        );
        assert_eq!(
            TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
            [0, 1]
        );
        assert_eq!(
            TinySet::range_greater_or_equal(3)
                .into_iter()
                .collect::<Vec<u32>>(),
            (3u32..64u32).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_bitset_len() {
        let mut bitset = BitSet::with_max_value(1_000);
        assert_eq!(bitset.len(), 0);
        bitset.insert(3u32);
        assert_eq!(bitset.len(), 1);
        bitset.insert(103u32);
        assert_eq!(bitset.len(), 2);
        bitset.insert(3u32);
        assert_eq!(bitset.len(), 2);
        bitset.insert(103u32);
        assert_eq!(bitset.len(), 2);
        bitset.insert(104u32);
        assert_eq!(bitset.len(), 3);
        bitset.remove(105u32);
        assert_eq!(bitset.len(), 3);
        bitset.remove(104u32);
        assert_eq!(bitset.len(), 2);
        bitset.remove(3u32);
        assert_eq!(bitset.len(), 1);
        bitset.remove(103u32);
        assert_eq!(bitset.len(), 0);
    }

    pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
        StdRng::from_seed([seed_val; 32])
            .sample_iter(&Bernoulli::new(ratio).unwrap())
            .take(n as usize)
            .enumerate()
            .filter_map(|(val, keep)| if keep { Some(val as u32) } else { None })
            .collect()
    }

    pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
        sample_with_seed(n, ratio, 4)
    }

    #[test]
    fn test_bitset_clear() {
        let mut bitset = BitSet::with_max_value(1_000);
        let els = sample(1_000, 0.01f64);
        for &el in &els {
            bitset.insert(el);
        }
        assert!(els.iter().all(|el| bitset.contains(*el)));
        bitset.clear();
        for el in 0u32..1000u32 {
            assert!(!bitset.contains(el));
        }
    }
}

#[cfg(all(test, feature = "unstable"))]
mod bench {

    use super::BitSet;
    use super::TinySet;
    use test;

    #[bench]
    fn bench_tinyset_pop(b: &mut test::Bencher) {
        b.iter(|| {
            let mut tinyset = TinySet::singleton(test::black_box(31u32));
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
        });
    }

    #[bench]
    fn bench_tinyset_sum(b: &mut test::Bencher) {
        let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
        b.iter(|| {
            assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
        });
    }

    #[bench]
    fn bench_tinyarr_sum(b: &mut test::Bencher) {
        let v = [10u32, 14u32, 21u32];
        b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
    }

    #[bench]
    fn bench_bitset_initialize(b: &mut test::Bencher) {
        b.iter(|| BitSet::with_max_value(1_000_000));
    }
}
```
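The pieces above compose into a short round trip: build a `BitSet`, serialize it, and reopen it as a `ReadOnlyBitSet`. A minimal sketch in the style of the tests above, using only APIs defined in this file (the bucket comment mirrors the `el / 64` / `el % 64` split in `insert`):

```rust
#[test]
fn bitset_round_trip_sketch() {
    // Build a mutable bitset, serialize it, then reopen it read-only.
    let mut bitset = BitSet::with_max_value(128);
    bitset.insert(3);
    bitset.insert(70); // lands in the second 64-bit bucket: 70 / 64 = 1, 70 % 64 = 6
    let mut buffer = Vec::new();
    bitset.serialize(&mut buffer).unwrap();

    let read_only = ReadOnlyBitSet::open(OwnedBytes::new(buffer));
    assert_eq!(read_only.len(), 2);
    assert!(read_only.contains(70));
    assert_eq!(read_only.iter().collect::<Vec<u32>>(), vec![3, 70]);
}
```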
common/src/lib.rs

@@ -1,9 +1,169 @@ (resulting file)

```rust
#![allow(clippy::len_without_is_empty)]

use std::ops::Deref;

pub use byteorder::LittleEndian as Endianness;

mod bitset;
mod serialize;
mod vint;
mod writer;

pub use bitset::*;
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};

/// Has length trait
pub trait HasLen {
    /// Returns the length.
    fn len(&self) -> usize;

    /// Returns true iff empty.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

impl<T: Deref<Target = [u8]>> HasLen for T {
    fn len(&self) -> usize {
        self.deref().len()
    }
}

const HIGHEST_BIT: u64 = 1 << 63;

/// Maps an `i64` to a `u64`.
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// This is more suited than simply casting (`val as u64`)
/// because of bitpacking.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting negative values, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline]
pub fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ HIGHEST_BIT
}

/// Reverses the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_i64(val: u64) -> i64 {
    (val ^ HIGHEST_BIT) as i64
}

/// Maps an `f64` to a `u64`.
///
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` in a monotonic manner, so that the bytes' lexical order is preserved.
///
/// This is more suited than simply casting (`val as u64`),
/// which would truncate the result.
///
/// # Reference
///
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
/// explains the mapping in a clear manner.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline]
pub fn f64_to_u64(val: f64) -> u64 {
    let bits = val.to_bits();
    if val.is_sign_positive() {
        bits ^ HIGHEST_BIT
    } else {
        !bits
    }
}

/// Reverses the mapping given by [`f64_to_u64`](./fn.f64_to_u64.html).
#[inline]
pub fn u64_to_f64(val: u64) -> f64 {
    f64::from_bits(if val & HIGHEST_BIT != 0 {
        val ^ HIGHEST_BIT
    } else {
        !val
    })
}

#[cfg(test)]
pub mod test {

    use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
    use super::{BinarySerializable, FixedSize};
    use proptest::prelude::*;
    use std::f64;

    fn test_i64_converter_helper(val: i64) {
        assert_eq!(u64_to_i64(i64_to_u64(val)), val);
    }

    fn test_f64_converter_helper(val: f64) {
        assert_eq!(u64_to_f64(f64_to_u64(val)), val);
    }

    pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
        let mut buffer = Vec::new();
        O::default().serialize(&mut buffer).unwrap();
        assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
    }

    proptest! {
        #[test]
        fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
            let left_u64 = f64_to_u64(left);
            let right_u64 = f64_to_u64(right);
            assert_eq!(left_u64 < right_u64, left < right);
        }
    }

    #[test]
    fn test_i64_converter() {
        assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
        assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
        test_i64_converter_helper(0i64);
        test_i64_converter_helper(i64::min_value());
        test_i64_converter_helper(i64::max_value());
        for i in -1000i64..1000i64 {
            test_i64_converter_helper(i);
        }
    }

    #[test]
    fn test_f64_converter() {
        test_f64_converter_helper(f64::INFINITY);
        test_f64_converter_helper(f64::NEG_INFINITY);
        test_f64_converter_helper(0.0);
        test_f64_converter_helper(-0.0);
        test_f64_converter_helper(1.0);
        test_f64_converter_helper(-1.0);
    }

    #[test]
    fn test_f64_order() {
        assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
            .contains(&f64_to_u64(f64::NAN))); // nan is not a number
        assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); // same exponent, different mantissa
        assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); // same mantissa, different exponent
        assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); // different exponent and mantissa
        assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
        assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
        assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
        assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
    }
}
```
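A worked example of the sign-bit XOR trick; the values follow directly from the definition of `i64_to_u64` in the listing above, assumed to be in scope:

```rust
fn main() {
    // Flipping the sign bit shifts the i64 range onto the u64 range
    // while preserving order: -2^63 -> 0, -1 -> 2^63 - 1, 0 -> 2^63.
    assert_eq!(i64_to_u64(i64::MIN), 0u64);
    assert_eq!(i64_to_u64(-1), (1u64 << 63) - 1);
    assert_eq!(i64_to_u64(0), 1u64 << 63);
    assert_eq!(i64_to_u64(i64::MAX), u64::MAX);
    // Consecutive inputs stay consecutive, so comparisons survive the mapping,
    // and small magnitudes bitpack into few bits after delta encoding.
    assert!(i64_to_u64(-5) < i64_to_u64(5));
}
```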
doc/src/SUMMARY.md

```diff
@@ -7,6 +7,7 @@
 - [Segments](./basis.md)
 - [Defining your schema](./schema.md)
 - [Faceting](./facetting.md)
+- [Index Sorting](./index_sorting.md)
 - [Innerworkings](./innerworkings.md)
 - [Inverted index](./inverted_index.md)
 - [Best practice](./inverted_index.md)
```
doc/src/index_sorting.md (new file, 61 lines)

- [Index Sorting](#index-sorting)
  + [Why Sorting](#why-sorting)
    * [Compression](#compression)
    * [Top-N Optimization](#top-n-optimization)
    * [Pruning](#pruning)
    * [Other](#other)
  + [Usage](#usage)

# Index Sorting

Tantivy allows you to sort the index according to a property.

## Why Sorting

Presorting an index has several advantages:

###### Compression

Sorted data is easier to compress. E.g. the number sequence [5, 2, 3, 1, 4] would be sorted to [1, 2, 3, 4, 5]. Applying delta encoding, the unsorted list becomes [5, -3, 1, -2, 3], while the sorted one becomes [1, 1, 1, 1, 1].
The compression ratio improves mainly on the fast field of the sorted property; everything else is likely unaffected. A sketch of the effect follows below.
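A minimal sketch of why sorting helps; plain `i64` deltas stand in here for tantivy's actual bitpacked fast-field codec:

```rust
/// Delta-encodes a sequence: each output value is the difference to its
/// predecessor. Sorted input yields small, uniform deltas that bitpack
/// into very few bits per value.
fn delta_encode(vals: &[i64]) -> Vec<i64> {
    let mut prev = 0i64;
    vals.iter()
        .map(|&v| {
            let delta = v - prev;
            prev = v;
            delta
        })
        .collect()
}

fn main() {
    let unsorted = [5i64, 2, 3, 1, 4];
    let mut sorted = unsorted;
    sorted.sort_unstable();
    assert_eq!(delta_encode(&unsorted), vec![5, -3, 1, -2, 3]);
    assert_eq!(delta_encode(&sorted), vec![1, 1, 1, 1, 1]);
}
```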
###### Top-N Optimization

When the data is presorted by a field and a search query requests the top N documents sorted by that same field, we can leverage the natural order of the documents.
E.g. if the data is sorted by timestamp and we want the top N newest docs containing a term, we can simply exploit the docid order.

Note: Tantivy 0.16 does not do this optimization yet.

###### Pruning

Let's say we want all documents matching the filter `>= 2010-08-11`. When the data is sorted, we can make a lookup in the fast field to find the docid range and use that range as the filter.

Note: Tantivy 0.16 does not do this optimization yet.

###### Other?

In principle, many algorithms could exploit the monotonically increasing nature of a sorted field (aggregations maybe?).

## Usage

Index sorting is configured by setting [`sort_by_field`](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to an `IndexBuilder`. As of tantivy 0.16, only fast fields are allowed as the sort field.

```
let settings = IndexSettings {
    sort_by_field: Some(IndexSortByField {
        field: "intval".to_string(),
        order: Order::Desc,
    }),
    ..Default::default()
};
let mut index_builder = Index::builder().schema(schema);
index_builder = index_builder.settings(settings);
let index = index_builder.create_in_ram().unwrap();
```

## Implementation details

Sorting an index is applied in the serialization step. In general there are two serialization steps: [finishing a single segment](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).

In both cases we generate a docid mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fastfields, posting list, normfield, facets); a sketch of such a remap follows below.
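A minimal sketch of applying such a docid mapping to one component. The names `new_to_old` and `remap_fast_field` are illustrative, not tantivy's actual internals, which stream the remapped values during serialization instead of materializing them:

```rust
/// Given the sort order as a mapping `new_docid -> old_docid`, rewrite a
/// fast-field column so its values appear in sorted docid order.
fn remap_fast_field(old_values: &[u64], new_to_old: &[u32]) -> Vec<u64> {
    new_to_old
        .iter()
        .map(|&old_docid| old_values[old_docid as usize])
        .collect()
}

fn main() {
    // Values for old docids 0..5; sorting them descending yields the
    // new->old docid mapping [1, 4, 2, 0, 3].
    let old_values = [10u64, 50, 30, 5, 40];
    let new_to_old = [1u32, 4, 2, 0, 3];
    assert_eq!(remap_fast_field(&old_values, &new_to_old), vec![50, 40, 30, 10, 5]);
}
```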
examples/basic_search.rs

```diff
@@ -96,7 +96,7 @@ fn main() -> tantivy::Result<()> {
 );

 // ... and add it to the `IndexWriter`.
-index_writer.add_document(old_man_doc);
+index_writer.add_document(old_man_doc)?;

 // For convenience, tantivy also comes with a macro to
 // reduce the boilerplate above.
@@ -110,7 +110,7 @@ fn main() -> tantivy::Result<()> {
     fresh and green with every spring, carrying in their lower leaf junctures the \
     debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
     limbs and branches that arch over the pool"
-));
+))?;

 // Multivalued fields just need to be repeated.
 index_writer.add_document(doc!(
@@ -120,7 +120,7 @@ fn main() -> tantivy::Result<()> {
     enterprise which you have regarded with such evil forebodings. I arrived here \
     yesterday, and my first task is to assure my dear sister of my welfare and \
     increasing confidence in the success of my undertaking."
-));
+))?;

 // This is an example, so we will only index 3 documents
 // here. You can check out tantivy's tutorial to index
```
examples/custom_collector.rs

```diff
@@ -86,12 +86,10 @@ impl Collector for StatsCollector {

 fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
     let mut stats = Stats::default();
-    for segment_stats_opt in segment_stats {
-        if let Some(segment_stats) = segment_stats_opt {
-            stats.count += segment_stats.count;
-            stats.sum += segment_stats.sum;
-            stats.squared_sum += segment_stats.squared_sum;
-        }
+    for segment_stats in segment_stats.into_iter().flatten() {
+        stats.count += segment_stats.count;
+        stats.sum += segment_stats.sum;
+        stats.squared_sum += segment_stats.squared_sum;
     }
     Ok(stats.non_zero_count())
 }
```
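The refactor above leans on `Iterator::flatten` from the standard library: iterating a `Vec<Option<T>>` with `.into_iter().flatten()` visits only the `Some` values, which replaces the explicit `if let` nesting. A standalone illustration:

```rust
fn main() {
    // One optional partial result per segment; absent segments contribute nothing.
    let per_segment: Vec<Option<u64>> = vec![Some(3), None, Some(4)];
    // flatten() skips the `None`s, so only present values are summed.
    let total: u64 = per_segment.into_iter().flatten().sum();
    assert_eq!(total, 7);
}
```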
```diff
@@ -147,23 +145,23 @@ fn main() -> tantivy::Result<()> {
     product_description => "While it is ok for short distance travel, this broom \
     was designed for quidditch. It will up your game.",
     price => 30_200u64
-));
+))?;
 index_writer.add_document(doc!(
     product_name => "Turbulobroom",
     product_description => "You might have heard of this broom before: it is the sponsor of the Wales team. \
     You'll enjoy its sharp turns, and rapid acceleration",
     price => 29_240u64
-));
+))?;
 index_writer.add_document(doc!(
     product_name => "Broomio",
     product_description => "Great value for the price. This broom is a market favorite",
     price => 21_240u64
-));
+))?;
 index_writer.add_document(doc!(
     product_name => "Whack a Mole",
     product_description => "Prime quality bat.",
     price => 5_200u64
-));
+))?;
 index_writer.commit()?;

 let reader = index.reader()?;
```
```diff
@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
     title => "The Old Man and the Sea",
     body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
     he had gone eighty-four days now without taking a fish."
-));
+))?;
 index_writer.add_document(doc!(
     title => "Of Mice and Men",
     body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
@@ -79,14 +79,14 @@ fn main() -> tantivy::Result<()> {
     fresh and green with every spring, carrying in their lower leaf junctures the
     debris of the winter’s flooding; and sycamores with mottled, white, recumbent
     limbs and branches that arch over the pool"#
-));
+))?;
 index_writer.add_document(doc!(
     title => "Frankenstein",
     body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
     enterprise which you have regarded with such evil forebodings. I arrived here
     yesterday, and my first task is to assure my dear sister of my welfare and
     increasing confidence in the success of my undertaking."#
-));
+))?;
 index_writer.commit()?;

 let reader = index.reader()?;
```
```diff
@@ -76,15 +76,15 @@ fn main() -> tantivy::Result<()> {
 index_writer.add_document(doc!(
     isbn => "978-0099908401",
     title => "The old Man and the see"
-));
+))?;
 index_writer.add_document(doc!(
     isbn => "978-0140177398",
     title => "Of Mice and Men",
-));
+))?;
 index_writer.add_document(doc!(
     title => "Frankentein", //< Oops there is a typo here.
     isbn => "978-9176370711",
-));
+))?;
 index_writer.commit()?;
 let reader = index.reader()?;
@@ -122,7 +122,7 @@ fn main() -> tantivy::Result<()> {
 index_writer.add_document(doc!(
     title => "Frankenstein",
     isbn => "978-9176370711",
-));
+))?;

 // You are guaranteed that your clients will only observe your index in
 // the state it was in after a commit.
```
```diff
@@ -35,35 +35,35 @@ fn main() -> tantivy::Result<()> {
 index_writer.add_document(doc!(
     name => "Cat",
     classification => Facet::from("/Felidae/Felinae/Felis")
-));
+))?;
 index_writer.add_document(doc!(
     name => "Canada lynx",
     classification => Facet::from("/Felidae/Felinae/Lynx")
-));
+))?;
 index_writer.add_document(doc!(
     name => "Cheetah",
     classification => Facet::from("/Felidae/Felinae/Acinonyx")
-));
+))?;
 index_writer.add_document(doc!(
     name => "Tiger",
     classification => Facet::from("/Felidae/Pantherinae/Panthera")
-));
+))?;
 index_writer.add_document(doc!(
     name => "Lion",
     classification => Facet::from("/Felidae/Pantherinae/Panthera")
-));
+))?;
 index_writer.add_document(doc!(
     name => "Jaguar",
     classification => Facet::from("/Felidae/Pantherinae/Panthera")
-));
+))?;
 index_writer.add_document(doc!(
     name => "Sunda clouded leopard",
     classification => Facet::from("/Felidae/Pantherinae/Neofelis")
-));
+))?;
 index_writer.add_document(doc!(
     name => "Fossa",
     classification => Facet::from("/Eupleridae/Cryptoprocta")
-));
+))?;
 index_writer.commit()?;

 let reader = index.reader()?;
```
```diff
@@ -20,14 +20,14 @@ fn main() -> tantivy::Result<()> {
     title => "Fried egg",
     ingredient => Facet::from("/ingredient/egg"),
     ingredient => Facet::from("/ingredient/oil"),
-));
+))?;
 index_writer.add_document(doc!(
     title => "Scrambled egg",
     ingredient => Facet::from("/ingredient/egg"),
     ingredient => Facet::from("/ingredient/butter"),
     ingredient => Facet::from("/ingredient/milk"),
     ingredient => Facet::from("/ingredient/salt"),
-));
+))?;
 index_writer.add_document(doc!(
     title => "Egg rolls",
     ingredient => Facet::from("/ingredient/egg"),
@@ -36,7 +36,7 @@ fn main() -> tantivy::Result<()> {
     ingredient => Facet::from("/ingredient/oil"),
     ingredient => Facet::from("/ingredient/tortilla-wrap"),
     ingredient => Facet::from("/ingredient/mushroom"),
-));
+))?;
 index_writer.commit()?;

 let reader = index.reader()?;
```
@@ -7,7 +7,7 @@ use tantivy::query::RangeQuery;
|
||||
use tantivy::schema::{Schema, INDEXED};
|
||||
use tantivy::{doc, Index, Result};
|
||||
|
||||
fn run() -> Result<()> {
|
||||
fn main() -> Result<()> {
|
||||
// For the sake of simplicity, this schema will only have 1 field
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -19,7 +19,7 @@ fn run() -> Result<()> {
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
|
||||
for year in 1950u64..2019u64 {
|
||||
index_writer.add_document(doc!(year_field => year));
|
||||
index_writer.add_document(doc!(year_field => year))?;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
// The index will be a range of years
|
||||
@@ -33,7 +33,3 @@ fn run() -> Result<()> {
|
||||
assert_eq!(num_60s_books, 10);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() {
|
||||
run().unwrap()
|
||||
}
|
||||
|
||||
@@ -25,9 +25,9 @@ fn main() -> tantivy::Result<()> {
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
|
||||
index_writer.add_document(doc!(title => "The Old Man and the Sea"));
|
||||
index_writer.add_document(doc!(title => "Of Mice and Men"));
|
||||
index_writer.add_document(doc!(title => "The modern Promotheus"));
|
||||
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
|
||||
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
|
||||
index_writer.add_document(doc!(title => "The modern Promotheus"))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
|
||||
@@ -29,7 +29,7 @@ use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tantivy::schema::{Schema, STORED, TEXT};
|
||||
use tantivy::{doc, Index, IndexWriter, Opstamp};
|
||||
use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
@@ -59,10 +59,11 @@ fn main() -> tantivy::Result<()> {
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
))?;
|
||||
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(20));
|
||||
}
|
||||
Result::<(), TantivyError>::Ok(())
|
||||
});
|
||||
|
||||
// # Second indexing thread.
|
||||
@@ -78,11 +79,12 @@ fn main() -> tantivy::Result<()> {
|
||||
index_writer_rlock.add_document(doc!(
|
||||
title => "Manufacturing consent",
|
||||
body => "Some great book description..."
|
||||
))
|
||||
))?
|
||||
};
|
||||
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(10));
|
||||
}
|
||||
Result::<(), TantivyError>::Ok(())
|
||||
});
|
||||
|
||||
// # In the main thread, we commit 10 times, once every 500ms.
|
||||
@@ -90,7 +92,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let opstamp: Opstamp = {
|
||||
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
|
||||
let mut index_writer_wlock = index_writer.write().unwrap();
|
||||
index_writer_wlock.commit().unwrap()
|
||||
index_writer_wlock.commit()?
|
||||
};
|
||||
println!("committed with opstamp {}", opstamp);
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
|
||||
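The hunks above hinge on `add_document` now returning its opstamp wrapped in a `Result`, so an indexing thread can propagate failures with `?` and hand a `Result<(), TantivyError>` back through `join()`. The following is a minimal sketch of that shared-writer pattern, not code from the repository; the 50 MB heap budget and the `title` field are assumptions borrowed from the examples:

use std::sync::{Arc, RwLock};
use std::thread;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, TantivyError};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    // The writer is shared behind a RwLock: adding documents only needs
    // the read lock, while commit (below) takes the write lock.
    let index_writer = Arc::new(RwLock::new(index.writer(50_000_000)?));

    let writer = index_writer.clone();
    let indexing_thread = thread::spawn(move || {
        for i in 0..10 {
            // `?` propagates an indexing error out of the closure.
            let opstamp = writer
                .read()
                .unwrap()
                .add_document(doc!(title => format!("document {}", i)))?;
            println!("added doc {} - opstamp {}", i, opstamp);
        }
        // The closure's return type makes the thread's outcome a Result,
        // mirroring the `Result::<(), TantivyError>::Ok(())` lines above.
        Result::<(), TantivyError>::Ok(())
    });

    indexing_thread.join().unwrap()?;
    index_writer.write().unwrap().commit()?;
    Ok(())
}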
@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
let old_man_doc = doc!(title => title_tok, body => body_tok);

// ... now let's just add it to the IndexWriter
-index_writer.add_document(old_man_doc);
+index_writer.add_document(old_man_doc)?;

// Pretokenized text can also be fed as JSON
let short_man_json = r#"{
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {

let short_man_doc = schema.parse_document(short_man_json)?;

-index_writer.add_document(short_man_doc);
+index_writer.add_document(short_man_doc)?;

// Let's commit changes
index_writer.commit()?;
@@ -106,9 +106,7 @@ fn main() -> tantivy::Result<()> {
IndexRecordOption::Basic,
);

-let (top_docs, count) = searcher
-.search(&query, &(TopDocs::with_limit(2), Count))
-.unwrap();
+let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;

assert_eq!(count, 2);

@@ -129,9 +127,7 @@ fn main() -> tantivy::Result<()> {
IndexRecordOption::Basic,
);

-let (_top_docs, count) = searcher
-.search(&query, &(TopDocs::with_limit(2), Count))
-.unwrap();
+let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;

assert_eq!(count, 0);


@@ -40,7 +40,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
-));
+))?;
// ...
index_writer.commit()?;


@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
-));
+))?;

index_writer.add_document(doc!(
title => "Of Mice and Men",
@@ -80,7 +80,7 @@ fn main() -> tantivy::Result<()> {
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
-));
+))?;

index_writer.add_document(doc!(
title => "Frankenstein",
@@ -88,7 +88,7 @@ fn main() -> tantivy::Result<()> {
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
-));
+))?;

index_writer.commit()?;


@@ -9,8 +9,8 @@ description = "Fast field codecs used by tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
-common = { path = "../common/" }
-tantivy-bitpacker = { path = "../bitpacker/" }
+common = { version = "0.1", path = "../common/", package = "tantivy-common" }
+tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
prettytable-rs = {version="0.8.0", optional= true}
rand = {version="0.8.3", optional= true}


@@ -118,7 +118,7 @@ mod tests {
);
}
}
-let actual_compression = data.len() as f32 / out.len() as f32;
+let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
(estimation, actual_compression)
}
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {

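For context on the corrected line above: each `u64` value occupies eight raw bytes, so the fixed formula reports compressed size over uncompressed size (a value below 1.0, smaller is better) instead of the inverted ratio. A hedged, standalone restatement; the helper name is ours, not the crate's:

// Compression ratio as the fixed test helper computes it:
// compressed bytes divided by the raw payload (8 bytes per u64 value).
fn compression_ratio(num_values: usize, compressed_bytes: usize) -> f32 {
    compressed_bytes as f32 / (num_values as f32 * 8.0)
}

// e.g. ~6_000 values occupy 48_000 raw bytes; a 300-byte encoding gives
// 300.0 / 48_000.0 = 0.00625, consistent with assertions like `< 0.01` below.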
@@ -239,11 +239,21 @@ mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;

-fn create_and_validate(data: &[u64], name: &str) {
+fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<
LinearInterpolFastFieldSerializer,
LinearInterpolFastFieldReader,
->(data, name);
+>(data, name)
}

+#[test]
+fn test_compression() {
+let data = (10..=6_000_u64).collect::<Vec<_>>();
+let (estimate, actual_compression) =
+create_and_validate(&data, "simple monotonically large");
+
+assert!(actual_compression < 0.01);
+assert!(estimate < 0.01);
+}
+
#[test]

@@ -57,7 +57,7 @@ struct Function {
impl Function {
fn calc_slope(&mut self) {
let num_vals = self.end_pos - self.start_pos;
-get_slope(self.value_start_pos, self.value_end_pos, num_vals);
+self.slope = get_slope(self.value_start_pos, self.value_end_pos, num_vals);
}
// split the interpolation into two function, change self and return the second split
fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function {
@@ -378,11 +378,22 @@ mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;

-fn create_and_validate(data: &[u64], name: &str) {
+fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<
MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldReader,
->(data, name);
+>(data, name)
}

+#[test]
+fn test_compression() {
+let data = (10..=6_000_u64).collect::<Vec<_>>();
+let (estimate, actual_compression) =
+create_and_validate(&data, "simple monotonically large");
+assert!(actual_compression < 0.2);
+assert!(estimate < 0.20);
+assert!(estimate > 0.15);
+assert!(actual_compression > 0.01);
+}
+
#[test]
@@ -414,9 +425,11 @@ mod tests {
fn rand() {
for _ in 0..10 {
let mut data = (5_000..20_000)
-.map(|_| rand::random::<u64>() as u64)
+.map(|_| rand::random::<u32>() as u64)
.collect::<Vec<_>>();
-create_and_validate(&data, "random");
+let (estimate, actual_compression) = create_and_validate(&data, "random");
+dbg!(estimate);
+dbg!(actual_compression);

data.reverse();
create_and_validate(&data, "random");

@@ -1,9 +1,10 @@
[package]
+authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
-version = "0.1.0"
+version = "0.2.0"
edition = "2018"
description = "Expose data as static slice"
license = "MIT"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]

@@ -76,6 +76,19 @@ impl OwnedBytes {
(left, right)
}

+/// Splits the right part of the `OwnedBytes` at the given offset.
+///
+/// `self` is truncated to `split_len`, left with the remaining bytes.
+pub fn split_off(&mut self, split_len: usize) -> OwnedBytes {
+let right_box_stable_deref = self.box_stable_deref.clone();
+let right_piece = OwnedBytes {
+data: &self.data[split_len..],
+box_stable_deref: right_box_stable_deref,
+};
+self.data = &self.data[..split_len];
+right_piece
+}
+
/// Returns true iff this `OwnedBytes` is empty.
#[inline]
pub fn is_empty(&self) -> bool {
@@ -124,6 +137,35 @@ impl fmt::Debug for OwnedBytes {
}
}

+impl PartialEq for OwnedBytes {
+fn eq(&self, other: &OwnedBytes) -> bool {
+self.as_slice() == other.as_slice()
+}
+}
+
+impl Eq for OwnedBytes {}
+
+impl PartialEq<[u8]> for OwnedBytes {
+fn eq(&self, other: &[u8]) -> bool {
+self.as_slice() == other
+}
+}
+
+impl PartialEq<str> for OwnedBytes {
+fn eq(&self, other: &str) -> bool {
+self.as_slice() == other.as_bytes()
+}
+}
+
+impl<'a, T: ?Sized> PartialEq<&'a T> for OwnedBytes
+where
+OwnedBytes: PartialEq<T>,
+{
+fn eq(&self, other: &&'a T) -> bool {
+*self == **other
+}
+}
+
impl Deref for OwnedBytes {
type Target = [u8];

@@ -287,4 +329,14 @@ mod tests {
assert_eq!(right.as_slice(), b"");
}
}
+
+#[test]
+fn test_split_off() {
+let mut data = OwnedBytes::new(b"abcdef".as_ref());
+assert_eq!(data, "abcdef");
+assert_eq!(data.split_off(2), "cdef");
+assert_eq!(data, "ab");
+assert_eq!(data.split_off(1), "b");
+assert_eq!(data, "a");
+}
}

@@ -5,9 +5,9 @@ authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Search engine library"""
-documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
-homepage = "https://github.com/tantivy-search/tantivy"
-repository = "https://github.com/tantivy-search/tantivy"
+documentation = "https://quickwit-inc.github.io/tantivy/tantivy/index.html"
+homepage = "https://github.com/quickwit-inc/tantivy"
+repository = "https://github.com/quickwit-inc/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2018"

@@ -20,10 +20,10 @@ use crate::SegmentReader;
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer(3_000_000).unwrap();
-/// index_writer.add_document(doc!(title => "The Name of the Wind"));
-/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
-/// index_writer.add_document(doc!(title => "A Dairy Cow"));
-/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
+/// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap();
+/// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap();
+/// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap();
+/// index_writer.add_document(doc!(title => "The Diary of a Young Girl")).unwrap();
/// assert!(index_writer.commit().is_ok());
///
/// let reader = index.reader().unwrap();

@@ -103,23 +103,23 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// title => "The Name of the Wind",
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/fiction/fantasy")
-/// ));
+/// ))?;
/// index_writer.add_document(doc!(
/// title => "Dune",
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/fiction/sci-fi")
-/// ));
+/// ))?;
/// index_writer.add_document(doc!(
/// title => "La Vénus d'Ille",
/// facet => Facet::from("/lang/fr"),
/// facet => Facet::from("/category/fiction/fantasy"),
/// facet => Facet::from("/category/fiction/horror")
-/// ));
+/// ))?;
/// index_writer.add_document(doc!(
/// title => "The Diary of a Young Girl",
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/biography")
-/// ));
+/// ))?;
/// index_writer.commit()?;
/// }
/// let reader = index.reader()?;
@@ -470,13 +470,13 @@ mod tests {
use std::iter;

#[test]
-fn test_facet_collector_drilldown() {
+fn test_facet_collector_drilldown() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

-let mut index_writer = index.writer_for_tests().unwrap();
+let mut index_writer = index.writer_for_tests()?;
let num_facets: usize = 3 * 4 * 5;
let facets: Vec<Facet> = (0..num_facets)
.map(|mut n| {
@@ -491,14 +491,14 @@ mod tests {
for i in 0..num_facets * 10 {
let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());
-index_writer.add_document(doc);
+index_writer.add_document(doc)?;
}
-index_writer.commit().unwrap();
-let reader = index.reader().unwrap();
+index_writer.commit()?;
+let reader = index.reader()?;
let searcher = reader.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top1"));
-let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
+let counts = searcher.search(&AllQuery, &facet_collector)?;

{
let facets: Vec<(String, u64)> = counts
@@ -518,6 +518,7 @@ mod tests {
.collect::<Vec<_>>()
);
}
+Ok(())
}

#[test]
@@ -530,27 +531,28 @@ mod tests {
}

#[test]
-fn test_doc_unsorted_multifacet() {
+fn test_doc_unsorted_multifacet() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let facet_field = schema_builder.add_facet_field("facets", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
-let mut index_writer = index.writer_for_tests().unwrap();
+let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
-));
-index_writer.commit().unwrap();
-let reader = index.reader().unwrap();
+))?;
+index_writer.commit()?;
+let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/subjects");
-let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
+let counts = searcher.search(&AllQuery, &facet_collector)?;
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
assert_eq!(facets[0].1, 1);
+Ok(())
}

#[test]
@@ -562,16 +564,16 @@ mod tests {
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/A").unwrap(),
-));
+))?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/B").unwrap(),
-));
+))?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/C/A").unwrap(),
-));
+))?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/D/C/A").unwrap(),
-));
+))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -637,7 +639,7 @@ mod tests {

let mut index_writer = index.writer_for_tests().unwrap();
for doc in docs {
-index_writer.add_document(doc);
+index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
@@ -677,7 +679,7 @@ mod tests {

let mut index_writer = index.writer_for_tests()?;
for doc in docs {
-index_writer.add_document(doc);
+index_writer.add_document(doc)?;
}
index_writer.commit()?;


@@ -25,34 +25,37 @@ use crate::{Score, SegmentReader, TantivyError};
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
+/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
-/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
-/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64));
-/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64));
-/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64));
-/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64));
-/// assert!(index_writer.commit().is_ok());
+/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
+/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64))?;
+/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64))?;
+/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64))?;
+/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64))?;
+/// index_writer.commit()?;
///
-/// let reader = index.reader().unwrap();
+/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
-/// let query = query_parser.parse_query("diary").unwrap();
+/// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
-/// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
+/// let top_docs = searcher.search(&query, &no_filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
-/// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
+/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
///
/// assert_eq!(filtered_top_docs.len(), 0);
+/// # Ok(())
+/// # }
/// ```
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
where

@@ -226,10 +226,10 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
-writer.add_document(doc!(val_field=>12i64));
-writer.add_document(doc!(val_field=>-30i64));
-writer.add_document(doc!(val_field=>-12i64));
-writer.add_document(doc!(val_field=>-10i64));
+writer.add_document(doc!(val_field=>12i64))?;
+writer.add_document(doc!(val_field=>-30i64))?;
+writer.add_document(doc!(val_field=>-12i64))?;
+writer.add_document(doc!(val_field=>-10i64))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -247,13 +247,13 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
-writer.add_document(doc!(val_field=>12i64));
+writer.add_document(doc!(val_field=>12i64))?;
writer.commit()?;
-writer.add_document(doc!(val_field=>-30i64));
+writer.add_document(doc!(val_field=>-30i64))?;
writer.commit()?;
-writer.add_document(doc!(val_field=>-12i64));
+writer.add_document(doc!(val_field=>-12i64))?;
writer.commit()?;
-writer.add_document(doc!(val_field=>-10i64));
+writer.add_document(doc!(val_field=>-10i64))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -271,9 +271,9 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
-writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)));
-writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)));
-writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)));
+writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)))?;
+writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)))?;
+writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();

@@ -48,10 +48,10 @@ use tantivy::collector::{Count, TopDocs};
# let mut index_writer = index.writer(3_000_000)?;
# index_writer.add_document(doc!(
# title => "The Name of the Wind",
-# ));
+# ))?;
# index_writer.add_document(doc!(
# title => "The Diary of Muadib",
-# ));
+# ))?;
# index_writer.commit()?;
# let reader = index.reader()?;
# let searcher = reader.searcher();
@@ -178,9 +178,9 @@ pub trait Collector: Sync + Send {
) -> crate::Result<<Self::Child as SegmentCollector>::Fruit> {
let mut segment_collector = self.for_segment(segment_ord as u32, reader)?;

-if let Some(delete_bitset) = reader.delete_bitset() {
+if let Some(alive_bitset) = reader.alive_bitset() {
weight.for_each(reader, &mut |doc, score| {
-if delete_bitset.is_alive(doc) {
+if alive_bitset.is_alive(doc) {
segment_collector.collect(doc, score);
}
})?;

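The rename above (`delete_bitset` to `alive_bitset`) keeps the same control flow: a segment exposes a bitset only when it contains deletes, and a document is collected only while it is still alive. A standalone sketch of that gate using toy types of our own, not tantivy's:

// Minimal stand-in for tantivy's alive bitset (hypothetical, for illustration).
struct AliveBitSet(Vec<bool>);

impl AliveBitSet {
    fn is_alive(&self, doc: u32) -> bool {
        self.0[doc as usize]
    }
}

fn collect_docs(
    docs: &[(u32, f32)],
    alive: Option<&AliveBitSet>,
    mut collect: impl FnMut(u32, f32),
) {
    match alive {
        // Segment has deletes: check each doc before collecting it.
        Some(bitset) => {
            for &(doc, score) in docs {
                if bitset.is_alive(doc) {
                    collect(doc, score);
                }
            }
        }
        // No deletes in this segment: collect without the per-doc check.
        None => {
            for &(doc, score) in docs {
                collect(doc, score);
            }
        }
    }
}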
@@ -112,19 +112,19 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, Index};
///
+/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
+/// let mut index_writer = index.writer(3_000_000)?;
+/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
+/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
+/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
+/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
+/// index_writer.commit()?;
///
-/// let mut index_writer = index.writer(3_000_000).unwrap();
-/// index_writer.add_document(doc!(title => "The Name of the Wind"));
-/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
-/// index_writer.add_document(doc!(title => "A Dairy Cow"));
-/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
-/// assert!(index_writer.commit().is_ok());
-///
-/// let reader = index.reader().unwrap();
+/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let mut collectors = MultiCollector::new();
@@ -139,6 +139,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
///
/// assert_eq!(count, 2);
/// assert_eq!(top_docs.len(), 2);
+/// # Ok(())
+/// # }
/// ```
#[allow(clippy::type_complexity)]
#[derive(Default)]
@@ -252,24 +254,24 @@ mod tests {
use crate::Term;

#[test]
-fn test_multi_collector() {
+fn test_multi_collector() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();

let index = Index::create_in_ram(schema);
{
-let mut index_writer = index.writer_for_tests().unwrap();
-index_writer.add_document(doc!(text=>"abc"));
-index_writer.add_document(doc!(text=>"abc abc abc"));
-index_writer.add_document(doc!(text=>"abc abc"));
-index_writer.commit().unwrap();
-index_writer.add_document(doc!(text=>""));
-index_writer.add_document(doc!(text=>"abc abc abc abc"));
-index_writer.add_document(doc!(text=>"abc"));
-index_writer.commit().unwrap();
+let mut index_writer = index.writer_for_tests()?;
+index_writer.add_document(doc!(text=>"abc"))?;
+index_writer.add_document(doc!(text=>"abc abc abc"))?;
+index_writer.add_document(doc!(text=>"abc abc"))?;
+index_writer.commit()?;
+index_writer.add_document(doc!(text=>""))?;
+index_writer.add_document(doc!(text=>"abc abc abc abc"))?;
+index_writer.add_document(doc!(text=>"abc"))?;
+index_writer.commit()?;
}
-let searcher = index.reader().unwrap().searcher();
+let searcher = index.reader()?.searcher();
let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic);

@@ -280,5 +282,6 @@ mod tests {

assert_eq!(count_handler.extract(&mut multifruits), 5);
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
+Ok(())
}
}

@@ -25,7 +25,7 @@ pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
};

#[test]
-pub fn test_filter_collector() {
+pub fn test_filter_collector() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let price = schema_builder.add_u64_field("price", FAST);
@@ -33,25 +33,25 @@ pub fn test_filter_collector() {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

-let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
-index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()));
-index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()));
-index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()));
-index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()));
-index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()));
-assert!(index_writer.commit().is_ok());
+let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
+index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()))?;
+index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()))?;
+index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()))?;
+index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()))?;
+index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()))?;
+index_writer.commit()?;

-let reader = index.reader().unwrap();
+let reader = index.reader()?;
let searcher = reader.searcher();

let query_parser = QueryParser::for_index(&index, vec![title]);
-let query = query_parser.parse_query("diary").unwrap();
+let query = query_parser.parse_query("diary")?;
let filter_some_collector = FilterCollector::new(
price,
&|value: u64| value > 20_120u64,
TopDocs::with_limit(2),
);
-let top_docs = searcher.search(&query, &filter_some_collector).unwrap();
+let top_docs = searcher.search(&query, &filter_some_collector)?;

assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
@@ -67,9 +67,10 @@ pub fn test_filter_collector() {
}

let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
-let filtered_date_docs = searcher.search(&query, &filter_dates_collector).unwrap();
+let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;

assert_eq!(filtered_date_docs.len(), 2);
+Ok(())
}

/// Stores all of the doc ids.
@@ -274,8 +275,8 @@ fn make_test_searcher() -> crate::Result<crate::LeasedItem<Searcher>> {
let schema = Schema::builder().build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(Document::default());
-index_writer.add_document(Document::default());
+index_writer.add_document(Document::default())?;
+index_writer.add_document(Document::default())?;
index_writer.commit()?;
Ok(index.reader()?.searcher())
}

@@ -70,9 +70,7 @@ where
/// # Panics
/// The method panics if limit is 0
pub fn with_limit(limit: usize) -> TopCollector<T> {
-if limit < 1 {
-panic!("Limit must be strictly greater than 0.");
-}
+assert!(limit >= 1, "Limit must be strictly greater than 0.");
Self {
limit,
offset: 0,

@@ -94,27 +94,30 @@ where
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index};
///
+/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
-/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
-/// index_writer.add_document(doc!(title => "The Name of the Wind"));
-/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
-/// index_writer.add_document(doc!(title => "A Dairy Cow"));
-/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
-/// assert!(index_writer.commit().is_ok());
+/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
+/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
+/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
+/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
+/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
+/// index_writer.commit()?;
///
-/// let reader = index.reader().unwrap();
+/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
-/// let query = query_parser.parse_query("diary").unwrap();
-/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
+/// let query = query_parser.parse_query("diary")?;
+/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
///
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
+/// # Ok(())
+/// # }
/// ```
pub struct TopDocs(TopCollector<Score>);

@@ -180,29 +183,32 @@ impl TopDocs {
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{doc, DocAddress, Index};
///
+/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
-/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
-/// index_writer.add_document(doc!(title => "The Name of the Wind"));
-/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
-/// index_writer.add_document(doc!(title => "A Dairy Cow"));
-/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
-/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"));
-/// assert!(index_writer.commit().is_ok());
+/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
+/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
+/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
+/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
+/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
+/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"))?;
+/// index_writer.commit()?;
///
-/// let reader = index.reader().unwrap();
+/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
-/// let query = query_parser.parse_query("diary").unwrap();
-/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
+/// let query = query_parser.parse_query("diary")?;
+/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1))?;
///
/// assert_eq!(top_docs.len(), 2);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 4));
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
+/// Ok(())
/// # }
/// ```
pub fn and_offset(self, offset: usize) -> TopDocs {
TopDocs(self.0.and_offset(offset))
@@ -234,11 +240,11 @@ impl TopDocs {
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
-/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
-/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
-/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
-/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
-/// # assert!(index_writer.commit().is_ok());
+/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64))?;
+/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64))?;
+/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64))?;
+/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64))?;
+/// # index_writer.commit()?;
/// # let reader = index.reader()?;
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
@@ -316,9 +322,9 @@ impl TopDocs {
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
-/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64));
-/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64));
-/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64));
+/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64))?;
+/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64))?;
+/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64))?;
/// # assert!(index_writer.commit().is_ok());
/// # let reader = index.reader()?;
/// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?;
@@ -417,9 +423,9 @@ impl TopDocs {
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// let product_name = index.schema().get_field("product_name").unwrap();
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
-/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
-/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64));
-/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64));
+/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64))?;
+/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64))?;
+/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64))?;
/// index_writer.commit()?;
/// Ok(index)
/// }
@@ -527,9 +533,9 @@ impl TopDocs {
/// #
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// let boosted: Field = index.schema().get_field("boosted").unwrap();
-/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
-/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64));
-/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64));
+/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64))?;
+/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64))?;
+/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64))?;
/// # index_writer.commit()?;
/// // ...
/// # let user_query = "diary";
@@ -629,10 +635,10 @@ impl Collector for TopDocs {
let heap_len = self.0.limit + self.0.offset;
let mut heap: BinaryHeap<ComparableDoc<Score, DocId>> = BinaryHeap::with_capacity(heap_len);

-if let Some(delete_bitset) = reader.delete_bitset() {
+if let Some(alive_bitset) = reader.alive_bitset() {
let mut threshold = Score::MIN;
weight.for_each_pruning(threshold, reader, &mut |doc, score| {
-if delete_bitset.is_deleted(doc) {
+if alive_bitset.is_deleted(doc) {
return threshold;
}
let heap_item = ComparableDoc {
@@ -713,20 +719,18 @@ mod tests {
use crate::Score;
use crate::{DocAddress, DocId, SegmentReader};

-fn make_index() -> Index {
+fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
-{
-// writing the segment
-let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
-index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
-index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
-index_writer.add_document(doc!(text_field=>"I like Droopy"));
-assert!(index_writer.commit().is_ok());
-}
-index
+// writing the segment
+let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
+index_writer.add_document(doc!(text_field=>"Hello happy tax payer."))?;
+index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"))?;
+index_writer.add_document(doc!(text_field=>"I like Droopy"))?;
+index_writer.commit()?;
+Ok(index)
}

fn assert_results_equals(results: &[(Score, DocAddress)], expected: &[(Score, DocAddress)]) {
@@ -737,17 +741,15 @@ mod tests {
}

#[test]
-fn test_top_collector_not_at_capacity_without_offset() {
-let index = make_index();
+fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> {
+let index = make_index()?;
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
-let text_query = query_parser.parse_query("droopy tax").unwrap();
+let text_query = query_parser.parse_query("droopy tax")?;
let score_docs: Vec<(Score, DocAddress)> = index
-.reader()
-.unwrap()
+.reader()?
.searcher()
-.search(&text_query, &TopDocs::with_limit(4))
-.unwrap();
+.search(&text_query, &TopDocs::with_limit(4))?;
assert_results_equals(
&score_docs,
&[
@@ -756,11 +758,12 @@ mod tests {
(0.48527452, DocAddress::new(0, 0)),
],
);
+Ok(())
}

#[test]
fn test_top_collector_not_at_capacity_with_offset() {
-let index = make_index();
+let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -775,7 +778,7 @@ mod tests {

#[test]
fn test_top_collector_at_capacity() {
-let index = make_index();
+let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -796,7 +799,7 @@ mod tests {

#[test]
fn test_top_collector_at_capacity_with_offset() {
-let index = make_index();
+let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
@@ -817,7 +820,7 @@ mod tests {

#[test]
fn test_top_collector_stable_sorting() {
-let index = make_index();
+let index = make_index().unwrap();

// using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher();
@@ -848,29 +851,35 @@ mod tests {
const SIZE: &str = "size";

#[test]
-fn test_top_field_collector_not_at_capacity() {
+fn test_top_field_collector_not_at_capacity() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field(TITLE, TEXT);
let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build();
let (index, query) = index("beer", title, schema, |index_writer| {
-index_writer.add_document(doc!(
-title => "bottle of beer",
-size => 12u64,
-));
-index_writer.add_document(doc!(
-title => "growler of beer",
-size => 64u64,
-));
-index_writer.add_document(doc!(
-title => "pint of beer",
-size => 16u64,
-));
+index_writer
+.add_document(doc!(
+title => "bottle of beer",
+size => 12u64,
+))
+.unwrap();
+index_writer
+.add_document(doc!(
+title => "growler of beer",
+size => 64u64,
+))
+.unwrap();
+index_writer
+.add_document(doc!(
+title => "pint of beer",
+size => 16u64,
+))
+.unwrap();
});
-let searcher = index.reader().unwrap().searcher();
+let searcher = index.reader()?.searcher();

let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
-let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
+let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
assert_eq!(
&top_docs[..],
&[
@@ -879,6 +888,7 @@ mod tests {
(12, DocAddress::new(0, 0))
]
);
+Ok(())
}

#[test]
@@ -894,12 +904,12 @@ mod tests {
index_writer.add_document(doc!(
name => "Paul Robeson",
birthday => pr_birthday
-));
+))?;
let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?;
index_writer.add_document(doc!(
name => "Minnie Riperton",
birthday => mr_birthday
-));
+))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
@@ -926,11 +936,11 @@ mod tests {
index_writer.add_document(doc!(
city => "georgetown",
altitude => -1i64,
-));
+))?;
index_writer.add_document(doc!(
city => "tokyo",
altitude => 40i64,
-));
+))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
@@ -956,11 +966,11 @@ mod tests {
index_writer.add_document(doc!(
city => "georgetown",
altitude => -1.0f64,
-));
+))?;
index_writer.add_document(doc!(
city => "tokyo",
altitude => 40f64,
-));
+))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
@@ -983,10 +993,12 @@ mod tests {
let size = schema_builder.add_u64_field(SIZE, FAST);
let schema = schema_builder.build();
let (index, _) = index("beer", title, schema, |index_writer| {
-index_writer.add_document(doc!(
-title => "bottle of beer",
-size => 12u64,
-));
+index_writer
+.add_document(doc!(
+title => "bottle of beer",
+size => 12u64,
+))
+.unwrap();
});
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
@@ -1003,7 +1015,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(size=>1u64));
+index_writer.add_document(doc!(size=>1u64))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
@@ -1020,7 +1032,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(size=>1u64));
+index_writer.add_document(doc!(size=>1u64))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
@@ -1033,30 +1045,26 @@ mod tests {
}

#[test]
-fn test_tweak_score_top_collector_with_offset() {
-let index = make_index();
+fn test_tweak_score_top_collector_with_offset() -> crate::Result<()> {
+let index = make_index()?;
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
-let text_query = query_parser.parse_query("droopy tax").unwrap();
+let text_query = query_parser.parse_query("droopy tax")?;
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
);
-let score_docs: Vec<(u32, DocAddress)> = index
-.reader()
-.unwrap()
-.searcher()
-.search(&text_query, &collector)
-.unwrap();
-
+let score_docs: Vec<(u32, DocAddress)> =
+index.reader()?.searcher().search(&text_query, &collector)?;
assert_eq!(
score_docs,
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
);
+Ok(())
}

#[test]
fn test_custom_score_top_collector_with_offset() {
-let index = make_index();
+let index = make_index().unwrap();
let field = index.schema().get_field("text").unwrap();
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();

@@ -1,396 +0,0 @@
use std::fmt;
use std::u64;

#[derive(Clone, Copy, Eq, PartialEq)]
pub(crate) struct TinySet(u64);

impl fmt::Debug for TinySet {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.into_iter().collect::<Vec<u32>>().fmt(f)
    }
}

pub struct TinySetIterator(TinySet);
impl Iterator for TinySetIterator {
    type Item = u32;

    fn next(&mut self) -> Option<Self::Item> {
        self.0.pop_lowest()
    }
}

impl IntoIterator for TinySet {
    type Item = u32;
    type IntoIter = TinySetIterator;
    fn into_iter(self) -> Self::IntoIter {
        TinySetIterator(self)
    }
}

impl TinySet {
    /// Returns an empty `TinySet`.
    pub fn empty() -> TinySet {
        TinySet(0u64)
    }

    pub fn clear(&mut self) {
        self.0 = 0u64;
    }

    /// Returns the complement of the set in `[0, 64[`.
    fn complement(self) -> TinySet {
        TinySet(!self.0)
    }

    /// Returns true iff the `TinySet` contains the element `el`.
    pub fn contains(self, el: u32) -> bool {
        !self.intersect(TinySet::singleton(el)).is_empty()
    }

    /// Returns the number of elements in the `TinySet`.
    pub fn len(self) -> u32 {
        self.0.count_ones()
    }

    /// Returns the intersection of `self` and `other`.
    pub fn intersect(self, other: TinySet) -> TinySet {
        TinySet(self.0 & other.0)
    }

    /// Creates a new `TinySet` containing only one element
    /// within `[0; 64[`.
    #[inline]
    pub fn singleton(el: u32) -> TinySet {
        TinySet(1u64 << u64::from(el))
    }

    /// Inserts a new element within `[0..64[`.
    #[inline]
    pub fn insert(self, el: u32) -> TinySet {
        self.union(TinySet::singleton(el))
    }

    /// Inserts a new element within `[0..64[` in place,
    /// and returns true iff the set was modified.
    #[inline]
    pub fn insert_mut(&mut self, el: u32) -> bool {
        let old = *self;
        *self = old.insert(el);
        old != *self
    }

    /// Returns the union of two tinysets.
    #[inline]
    pub fn union(self, other: TinySet) -> TinySet {
        TinySet(self.0 | other.0)
    }

    /// Returns true iff the `TinySet` is empty.
    #[inline]
    pub fn is_empty(self) -> bool {
        self.0 == 0u64
    }

    /// Returns the lowest element in the `TinySet`
    /// and removes it.
    #[inline]
    pub fn pop_lowest(&mut self) -> Option<u32> {
        if self.is_empty() {
            None
        } else {
            let lowest = self.0.trailing_zeros() as u32;
            self.0 ^= TinySet::singleton(lowest).0;
            Some(lowest)
        }
    }

    /// Returns a `TinySet` that contains all values up
    /// to `upper_bound`, excluded.
    ///
    /// The limit is assumed to be strictly lower than 64.
    pub fn range_lower(upper_bound: u32) -> TinySet {
        TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
    }

    /// Returns a `TinySet` that contains all values greater
    /// than or equal to the given limit, up to 63 included.
    ///
    /// The limit is assumed to be strictly lower than 64.
    pub fn range_greater_or_equal(from_included: u32) -> TinySet {
        TinySet::range_lower(from_included).complement()
    }
}
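
A quick illustration of how a `TinySet` is typically consumed, assuming only the API above (the demo function is hypothetical):

fn tinyset_demo() {
    let set = TinySet::empty().insert(3).insert(40).insert(7);
    assert_eq!(set.len(), 3);
    assert!(set.contains(40));
    // iteration pops the lowest bit first, so elements come out in increasing order
    assert_eq!(set.into_iter().collect::<Vec<u32>>(), vec![3, 7, 40]);
}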

#[derive(Clone)]
pub struct BitSet {
    tinysets: Box<[TinySet]>,
    len: usize,
    max_value: u32,
}

fn num_buckets(max_val: u32) -> u32 {
    (max_val + 63u32) / 64u32
}

impl BitSet {
    /// Creates a new `BitSet` that may contain elements
    /// within `[0, max_value[`.
    pub fn with_max_value(max_value: u32) -> BitSet {
        let num_buckets = num_buckets(max_value);
        let tinysets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
        BitSet {
            tinysets,
            len: 0,
            max_value,
        }
    }

    /// Removes all elements from the `BitSet`.
    pub fn clear(&mut self) {
        for tinyset in self.tinysets.iter_mut() {
            *tinyset = TinySet::empty();
        }
    }

    /// Returns the number of elements in the `BitSet`.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Inserts an element in the `BitSet`.
    pub fn insert(&mut self, el: u32) {
        // we do not check saturated els.
        let higher = el / 64u32;
        let lower = el % 64u32;
        self.len += if self.tinysets[higher as usize].insert_mut(lower) {
            1
        } else {
            0
        };
    }

    /// Returns true iff the element is in the `BitSet`.
    pub fn contains(&self, el: u32) -> bool {
        self.tinyset(el / 64u32).contains(el % 64)
    }

    /// Returns the first non-empty `TinySet` associated with a bucket equal
    /// to or greater than `bucket`.
    ///
    /// Reminder: the tiny set with the bucket `bucket` represents the
    /// elements from `bucket * 64` to `(bucket + 1) * 64`.
    pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
        self.tinysets[bucket as usize..]
            .iter()
            .cloned()
            .position(|tinyset| !tinyset.is_empty())
            .map(|delta_bucket| bucket + delta_bucket as u32)
    }

    pub fn max_value(&self) -> u32 {
        self.max_value
    }

    /// Returns the tiny bitset representing the
    /// set restricted to the number range from
    /// `bucket * 64` to `(bucket + 1) * 64`.
    pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
        self.tinysets[bucket as usize]
    }
}
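
The `BitSet` simply fans elements out over one `TinySet` per 64-value bucket. A short sketch of the bucket arithmetic, assuming the API above:

fn bitset_demo() {
    let mut bitset = BitSet::with_max_value(200);
    bitset.insert(5); // bucket 0, bit 5
    bitset.insert(70); // bucket 1, bit 6 (70 / 64 == 1, 70 % 64 == 6)
    bitset.insert(70); // duplicate inserts do not change len
    assert_eq!(bitset.len(), 2);
    assert!(bitset.contains(70) && !bitset.contains(71));
    assert_eq!(bitset.first_non_empty_bucket(1), Some(1));
}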

#[cfg(test)]
mod tests {

    use super::BitSet;
    use super::TinySet;
    use crate::docset::{DocSet, TERMINATED};
    use crate::query::BitSetDocSet;
    use crate::tests;
    use crate::tests::generate_nonunique_unsorted;
    use std::collections::BTreeSet;
    use std::collections::HashSet;

    #[test]
    fn test_tiny_set() {
        assert!(TinySet::empty().is_empty());
        {
            let mut u = TinySet::empty().insert(1u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none())
        }
        {
            let mut u = TinySet::empty().insert(1u32).insert(1u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none())
        }
        {
            let mut u = TinySet::empty().insert(2u32);
            assert_eq!(u.pop_lowest(), Some(2u32));
            u.insert_mut(1u32);
            assert_eq!(u.pop_lowest(), Some(1u32));
            assert!(u.pop_lowest().is_none());
        }
        {
            let mut u = TinySet::empty().insert(63u32);
            assert_eq!(u.pop_lowest(), Some(63u32));
            assert!(u.pop_lowest().is_none());
        }
    }

    #[test]
    fn test_bitset() {
        let test_against_hashset = |els: &[u32], max_value: u32| {
            let mut hashset: HashSet<u32> = HashSet::new();
            let mut bitset = BitSet::with_max_value(max_value);
            for &el in els {
                assert!(el < max_value);
                hashset.insert(el);
                bitset.insert(el);
            }
            for el in 0..max_value {
                assert_eq!(hashset.contains(&el), bitset.contains(el));
            }
            assert_eq!(bitset.max_value(), max_value);
        };

        test_against_hashset(&[], 0);
        test_against_hashset(&[], 1);
        test_against_hashset(&[0u32], 1);
        test_against_hashset(&[0u32], 100);
        test_against_hashset(&[1u32, 2u32], 4);
        test_against_hashset(&[99u32], 100);
        test_against_hashset(&[63u32], 64);
        test_against_hashset(&[62u32, 63u32], 64);
    }

    #[test]
    fn test_bitset_large() {
        let arr = generate_nonunique_unsorted(100_000, 5_000);
        let mut btreeset: BTreeSet<u32> = BTreeSet::new();
        let mut bitset = BitSet::with_max_value(100_000);
        for el in arr {
            btreeset.insert(el);
            bitset.insert(el);
        }
        for i in 0..100_000 {
            assert_eq!(btreeset.contains(&i), bitset.contains(i));
        }
        assert_eq!(btreeset.len(), bitset.len());
        let mut bitset_docset = BitSetDocSet::from(bitset);
        let mut remaining = true;
        for el in btreeset.into_iter() {
            assert!(remaining);
            assert_eq!(bitset_docset.doc(), el);
            remaining = bitset_docset.advance() != TERMINATED;
        }
        assert!(!remaining);
    }

    #[test]
    fn test_bitset_num_buckets() {
        use super::num_buckets;
        assert_eq!(num_buckets(0u32), 0);
        assert_eq!(num_buckets(1u32), 1);
        assert_eq!(num_buckets(64u32), 1);
        assert_eq!(num_buckets(65u32), 2);
        assert_eq!(num_buckets(128u32), 2);
        assert_eq!(num_buckets(129u32), 3);
    }

    #[test]
    fn test_tinyset_range() {
        assert_eq!(
            TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
            [0, 1, 2]
        );
        assert!(TinySet::range_lower(0).is_empty());
        assert_eq!(
            TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
            (0u32..63u32).collect::<Vec<_>>()
        );
        assert_eq!(
            TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
            [0]
        );
        assert_eq!(
            TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
            [0, 1]
        );
        assert_eq!(
            TinySet::range_greater_or_equal(3)
                .into_iter()
                .collect::<Vec<u32>>(),
            (3u32..64u32).collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_bitset_len() {
        let mut bitset = BitSet::with_max_value(1_000);
        assert_eq!(bitset.len(), 0);
        bitset.insert(3u32);
        assert_eq!(bitset.len(), 1);
        bitset.insert(103u32);
        assert_eq!(bitset.len(), 2);
        bitset.insert(3u32);
        assert_eq!(bitset.len(), 2);
        bitset.insert(103u32);
        assert_eq!(bitset.len(), 2);
        bitset.insert(104u32);
        assert_eq!(bitset.len(), 3);
    }

    #[test]
    fn test_bitset_clear() {
        let mut bitset = BitSet::with_max_value(1_000);
        let els = tests::sample(1_000, 0.01f64);
        for &el in &els {
            bitset.insert(el);
        }
        assert!(els.iter().all(|el| bitset.contains(*el)));
        bitset.clear();
        for el in 0u32..1000u32 {
            assert!(!bitset.contains(el));
        }
    }
}

#[cfg(all(test, feature = "unstable"))]
mod bench {

    use super::BitSet;
    use super::TinySet;
    use test;

    #[bench]
    fn bench_tinyset_pop(b: &mut test::Bencher) {
        b.iter(|| {
            let mut tinyset = TinySet::singleton(test::black_box(31u32));
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
            tinyset.pop_lowest();
        });
    }

    #[bench]
    fn bench_tinyset_sum(b: &mut test::Bencher) {
        let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
        b.iter(|| {
            assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
        });
    }

    #[bench]
    fn bench_tinyarr_sum(b: &mut test::Bencher) {
        let v = [10u32, 14u32, 21u32];
        b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
    }

    #[bench]
    fn bench_bitset_initialize(b: &mut test::Bencher) {
        b.iter(|| BitSet::with_max_value(1_000_000));
    }
}
@@ -1,203 +0,0 @@
mod bitset;
mod composite_file;

pub use self::bitset::BitSet;
pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use byteorder::LittleEndian as Endianness;
pub use common::CountingWriter;
pub use common::{
    read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
};
pub use common::{BinarySerializable, DeserializeFrom, FixedSize};

/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than `1 << 31` documents.
pub const MAX_DOC_LIMIT: u32 = 1 << 31;

/// Has length trait.
pub trait HasLen {
    /// Returns the length.
    fn len(&self) -> usize;

    /// Returns true iff empty.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

const HIGHEST_BIT: u64 = 1 << 63;

/// Maps an `i64` to a `u64`.
///
/// For simplicity, tantivy internally handles `i64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `i64` to `u64` so that
/// `-2^63 .. 2^63-1` is mapped
/// to
/// `0 .. 2^64-1`
/// in that order.
///
/// This is better suited than simply casting (`val as u64`)
/// because of bitpacking.
///
/// Imagine a list of `i64` ranging from -10 to 10.
/// When casting, the negative values are projected
/// to values over 2^63, and all values end up requiring 64 bits.
///
/// # See also
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
#[inline]
pub fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ HIGHEST_BIT
}

/// Reverses the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
#[inline]
pub fn u64_to_i64(val: u64) -> i64 {
    (val ^ HIGHEST_BIT) as i64
}
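
In short: flipping the sign bit makes unsigned comparison order match signed order. A quick sketch of the mapping at the boundaries (a hypothetical standalone test, using only the two functions above):

#[test]
fn i64_mapping_is_monotonic_at_boundaries() {
    assert_eq!(i64_to_u64(i64::MIN), u64::MIN); // -2^63 maps to 0
    assert_eq!(i64_to_u64(-1i64), (1u64 << 63) - 1);
    assert_eq!(i64_to_u64(0i64), 1u64 << 63); // 0 lands exactly in the middle
    assert_eq!(i64_to_u64(i64::MAX), u64::MAX);
    assert_eq!(u64_to_i64(i64_to_u64(-42i64)), -42i64); // the mapping round-trips
}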

/// Maps an `f64` to a `u64`.
///
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
///
/// This is better suited than simply casting (`val as u64`),
/// which would truncate the result.
///
/// # Reference
///
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
/// explains the mapping in a clear manner.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline]
pub fn f64_to_u64(val: f64) -> u64 {
    let bits = val.to_bits();
    if val.is_sign_positive() {
        bits ^ HIGHEST_BIT
    } else {
        !bits
    }
}

/// Reverses the mapping given by [`f64_to_u64`](./fn.f64_to_u64.html).
#[inline]
pub fn u64_to_f64(val: u64) -> f64 {
    f64::from_bits(if val & HIGHEST_BIT != 0 {
        val ^ HIGHEST_BIT
    } else {
        !val
    })
}
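
Positive floats get their sign bit set, negative floats get all bits flipped, so every negative value sorts below every positive one and order is preserved within each sign. A hypothetical check, using only the functions above:

#[test]
fn f64_mapping_preserves_order() {
    let vals = [f64::NEG_INFINITY, -2.5f64, -0.0f64, 0.0f64, 1.0f64, f64::INFINITY];
    let mapped: Vec<u64> = vals.iter().map(|&v| f64_to_u64(v)).collect();
    // the mapped values must be sorted exactly like the inputs
    assert!(mapped.windows(2).all(|w| w[0] <= w[1]));
    assert_eq!(u64_to_f64(f64_to_u64(-2.5f64)), -2.5f64);
}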

#[cfg(test)]
pub(crate) mod test {

    use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
    use common::{BinarySerializable, FixedSize};
    use proptest::prelude::*;
    use std::f64;
    use tantivy_bitpacker::compute_num_bits;
    pub use tantivy_bitpacker::minmax;

    fn test_i64_converter_helper(val: i64) {
        assert_eq!(u64_to_i64(i64_to_u64(val)), val);
    }

    fn test_f64_converter_helper(val: f64) {
        assert_eq!(u64_to_f64(f64_to_u64(val)), val);
    }

    pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
        let mut buffer = Vec::new();
        O::default().serialize(&mut buffer).unwrap();
        assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
    }

    proptest! {
        #[test]
        fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
            let left_u64 = f64_to_u64(left);
            let right_u64 = f64_to_u64(right);
            assert_eq!(left_u64 < right_u64, left < right);
        }
    }

    #[test]
    fn test_i64_converter() {
        assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
        assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
        test_i64_converter_helper(0i64);
        test_i64_converter_helper(i64::min_value());
        test_i64_converter_helper(i64::max_value());
        for i in -1000i64..1000i64 {
            test_i64_converter_helper(i);
        }
    }

    #[test]
    fn test_f64_converter() {
        test_f64_converter_helper(f64::INFINITY);
        test_f64_converter_helper(f64::NEG_INFINITY);
        test_f64_converter_helper(0.0);
        test_f64_converter_helper(-0.0);
        test_f64_converter_helper(1.0);
        test_f64_converter_helper(-1.0);
    }

    #[test]
    fn test_f64_order() {
        assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
            .contains(&f64_to_u64(f64::NAN))); // nan is not a number
        assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); // same exponent, different mantissa
        assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); // same mantissa, different exponent
        assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); // different exponent and mantissa
        assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
        assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
        assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
        assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
    }

    #[test]
    fn test_compute_num_bits() {
        assert_eq!(compute_num_bits(1), 1u8);
        assert_eq!(compute_num_bits(0), 0u8);
        assert_eq!(compute_num_bits(2), 2u8);
        assert_eq!(compute_num_bits(3), 2u8);
        assert_eq!(compute_num_bits(4), 3u8);
        assert_eq!(compute_num_bits(255), 8u8);
        assert_eq!(compute_num_bits(256), 9u8);
        assert_eq!(compute_num_bits(5_000_000_000), 33u8);
    }

    #[test]
    fn test_max_doc() {
        // this is the first time I write a unit test for a constant.
        assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
        assert!((super::MAX_DOC_LIMIT as i32) < 0);
    }

    #[test]
    fn test_minmax_empty() {
        let vals: Vec<u32> = vec![];
        assert_eq!(minmax(vals.into_iter()), None);
    }

    #[test]
    fn test_minmax_one() {
        assert_eq!(minmax(vec![1].into_iter()), Some((1, 1)));
    }

    #[test]
    fn test_minmax_two() {
        assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2)));
        assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2)));
    }
}

@@ -120,11 +120,11 @@ impl IndexBuilder {
    /// Creates a new index in a given filepath.
    /// The index will use the `MMapDirectory`.
    ///
    /// If a previous index was in this directory, then its meta file will be destroyed.
    /// If a previous index was in this directory, it returns an `IndexAlreadyExists` error.
    #[cfg(feature = "mmap")]
    pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
        let mmap_directory = MmapDirectory::open(directory_path)?;
        if Index::exists(&mmap_directory)? {
        let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::open(directory_path)?);
        if Index::exists(&*mmap_directory)? {
            return Err(TantivyError::IndexAlreadyExists);
        }
        self.create(mmap_directory)
@@ -139,7 +139,7 @@ impl IndexBuilder {
    /// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
    #[cfg(feature = "mmap")]
    pub fn create_from_tempdir(self) -> crate::Result<Index> {
        let mmap_directory = MmapDirectory::create_from_tempdir()?;
        let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::create_from_tempdir()?);
        self.create(mmap_directory)
    }
    fn get_expect_schema(&self) -> crate::Result<Schema> {
@@ -149,8 +149,9 @@ impl IndexBuilder {
        .ok_or(TantivyError::IndexBuilderMissingArgument("schema"))
    }
    /// Opens or creates a new index in the provided directory.
    pub fn open_or_create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
        if !Index::exists(&dir)? {
    pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
        let dir = dir.into();
        if !Index::exists(&*dir)? {
            return self.create(dir);
        }
        let index = Index::open(dir)?;
@@ -165,7 +166,8 @@ impl IndexBuilder {
    /// Creates a new index given an implementation of the trait `Directory`.
    ///
    /// If a directory previously existed, it will be erased.
    fn create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
    fn create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
        let dir = dir.into();
        let directory = ManagedDirectory::wrap(dir)?;
        save_new_metas(
            self.get_expect_schema()?,
@@ -198,7 +200,7 @@ impl Index {
    /// Examines the directory to see if it contains an index.
    ///
    /// Effectively, it only checks for the presence of the `meta.json` file.
    pub fn exists<Dir: Directory>(dir: &Dir) -> Result<bool, OpenReadError> {
    pub fn exists(dir: &dyn Directory) -> Result<bool, OpenReadError> {
        dir.exists(&META_FILEPATH)
    }

@@ -229,7 +231,8 @@ impl Index {
    /// Creates a new index using the `RamDirectory`.
    ///
    /// The index will be allocated in anonymous memory.
    /// This should only be used for unit tests.
    /// This is useful for indexing a small set of documents,
    /// for instance in unit tests or for a temporary in-memory index.
    pub fn create_in_ram(schema: Schema) -> Index {
        IndexBuilder::new().schema(schema).create_in_ram().unwrap()
    }
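
The in-RAM constructor keeps the whole pattern to a few lines. A hedged sketch (the `title` field and `TEXT` options are illustrative, not prescribed by the diff):

use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn make_in_memory_index() -> Index {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();
    // anonymous memory only: nothing is persisted to disk
    Index::create_in_ram(schema)
}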
@@ -237,7 +240,7 @@ impl Index {
    /// Creates a new index in a given filepath.
    /// The index will use the `MMapDirectory`.
    ///
    /// If a previous index was in this directory, then its meta file will be destroyed.
    /// If a previous index was in this directory, then it returns an `IndexAlreadyExists` error.
    #[cfg(feature = "mmap")]
    pub fn create_in_dir<P: AsRef<Path>>(
        directory_path: P,
@@ -249,7 +252,11 @@ impl Index {
    }

    /// Opens or creates a new index in the provided directory.
    pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
    pub fn open_or_create<T: Into<Box<dyn Directory>>>(
        dir: T,
        schema: Schema,
    ) -> crate::Result<Index> {
        let dir = dir.into();
        IndexBuilder::new().schema(schema).open_or_create(dir)
    }

@@ -269,11 +276,12 @@ impl Index {
    /// Creates a new index given an implementation of the trait `Directory`.
    ///
    /// If a directory previously existed, it will be erased.
    pub fn create<Dir: Directory>(
        dir: Dir,
    pub fn create<T: Into<Box<dyn Directory>>>(
        dir: T,
        schema: Schema,
        settings: IndexSettings,
    ) -> crate::Result<Index> {
        let dir: Box<dyn Directory> = dir.into();
        let mut builder = IndexBuilder::new().schema(schema);
        builder = builder.settings(settings);
        builder.create(dir)
@@ -364,7 +372,8 @@ impl Index {
    }

    /// Opens the index using the provided directory.
    pub fn open<D: Directory>(directory: D) -> crate::Result<Index> {
    pub fn open<T: Into<Box<dyn Directory>>>(directory: T) -> crate::Result<Index> {
        let directory = directory.into();
        let directory = ManagedDirectory::wrap(directory)?;
        let inventory = SegmentMetaInventory::default();
        let metas = load_metas(&directory, &inventory)?;
@@ -394,9 +403,7 @@ impl Index {
    ///
    /// # Errors
    /// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
    ///
    /// # Panics
    /// If the heap size per thread is too small, panics.
    /// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`.
    pub fn writer_with_num_threads(
        &self,
        num_threads: usize,
@@ -438,14 +445,13 @@ impl Index {
    /// Creates a multithreaded writer.
    ///
    /// Tantivy will automatically define the number of threads to use, but
    /// no more than [`MAX_NUM_THREAD`] threads.
    /// no more than 8 threads.
    /// `overall_heap_size_in_bytes` is the total target memory usage that will be split
    /// between a given number of threads.
    ///
    /// # Errors
    /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
    /// # Panics
    /// If the heap size per thread is too small, panics.
    /// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`.
    pub fn writer(&self, overall_heap_size_in_bytes: usize) -> crate::Result<IndexWriter> {
        let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
        let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
@@ -523,7 +529,22 @@ impl Index {

    /// Returns the set of corrupted files.
    pub fn validate_checksum(&self) -> crate::Result<HashSet<PathBuf>> {
        self.directory.list_damaged().map_err(Into::into)
        let managed_files = self.directory.list_managed_files();
        let active_segments_files: HashSet<PathBuf> = self
            .searchable_segment_metas()?
            .iter()
            .flat_map(|segment_meta| segment_meta.list_files())
            .collect();
        let active_existing_files: HashSet<&PathBuf> =
            active_segments_files.intersection(&managed_files).collect();

        let mut damaged_files = HashSet::new();
        for path in active_existing_files {
            if !self.directory.validate_checksum(path)? {
                damaged_files.insert((*path).clone());
            }
        }
        Ok(damaged_files)
    }
}
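
Since any concrete `Directory` now converts into `Box<dyn Directory>`, callers can pass a directory by value. A sketch of the calling convention (the `open_twice` wrapper is hypothetical; schema setup as in the earlier example):

use tantivy::directory::RamDirectory;
use tantivy::schema::Schema;
use tantivy::Index;

fn open_twice(schema: Schema) -> tantivy::Result<()> {
    let directory = RamDirectory::create();
    // the first call creates the index, the second opens the existing one
    let _created = Index::open_or_create(directory.clone(), schema.clone())?;
    let _reopened = Index::open_or_create(directory, schema)?;
    Ok(())
}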

@@ -561,15 +582,15 @@ mod tests {

    #[test]
    fn test_index_exists() {
        let directory = RamDirectory::create();
        assert!(!Index::exists(&directory).unwrap());
        let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
        assert!(!Index::exists(directory.as_ref()).unwrap());
        assert!(Index::create(
            directory.clone(),
            throw_away_schema(),
            IndexSettings::default()
        )
        .is_ok());
        assert!(Index::exists(&directory).unwrap());
        assert!(Index::exists(directory.as_ref()).unwrap());
    }

    #[test]
@@ -582,27 +603,27 @@ mod tests {

    #[test]
    fn open_or_create_should_open() {
        let directory = RamDirectory::create();
        let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
        assert!(Index::create(
            directory.clone(),
            throw_away_schema(),
            IndexSettings::default()
        )
        .is_ok());
        assert!(Index::exists(&directory).unwrap());
        assert!(Index::exists(directory.as_ref()).unwrap());
        assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
    }

    #[test]
    fn create_should_wipeoff_existing() {
        let directory = RamDirectory::create();
        let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
        assert!(Index::create(
            directory.clone(),
            throw_away_schema(),
            IndexSettings::default()
        )
        .is_ok());
        assert!(Index::exists(&directory).unwrap());
        assert!(Index::exists(directory.as_ref()).unwrap());
        assert!(Index::create(
            directory,
            Schema::builder().build(),
@@ -636,7 +657,7 @@ mod tests {
    }

    #[test]
    fn test_index_on_commit_reload_policy() {
    fn test_index_on_commit_reload_policy() -> crate::Result<()> {
        let schema = throw_away_schema();
        let field = schema.get_field("num_likes").unwrap();
        let index = Index::create_in_ram(schema);
@@ -646,7 +667,7 @@ mod tests {
            .try_into()
            .unwrap();
        assert_eq!(reader.searcher().num_docs(), 0);
        test_index_on_commit_reload_policy_aux(field, &index, &reader);
        test_index_on_commit_reload_policy_aux(field, &index, &reader)
    }

    #[cfg(feature = "mmap")]
@@ -658,7 +679,7 @@ mod tests {
    use tempfile::TempDir;

    #[test]
    fn test_index_on_commit_reload_policy_mmap() {
    fn test_index_on_commit_reload_policy_mmap() -> crate::Result<()> {
        let schema = throw_away_schema();
        let field = schema.get_field("num_likes").unwrap();
        let tempdir = TempDir::new().unwrap();
@@ -670,7 +691,7 @@ mod tests {
            .try_into()
            .unwrap();
        assert_eq!(reader.searcher().num_docs(), 0);
        test_index_on_commit_reload_policy_aux(field, &index, &reader);
        test_index_on_commit_reload_policy_aux(field, &index, &reader)
    }

    #[test]
@@ -685,7 +706,7 @@ mod tests {
            .reload_policy(ReloadPolicy::Manual)
            .try_into()?;
        assert_eq!(reader.searcher().num_docs(), 0);
        writer.add_document(doc!(field=>1u64));
        writer.add_document(doc!(field=>1u64))?;
        let (sender, receiver) = crossbeam::channel::unbounded();
        let _handle = index.directory_mut().watch(WatchCallback::new(move || {
            let _ = sender.send(());
@@ -699,7 +720,7 @@ mod tests {
    }

    #[test]
    fn test_index_on_commit_reload_policy_different_directories() {
    fn test_index_on_commit_reload_policy_different_directories() -> crate::Result<()> {
        let schema = throw_away_schema();
        let field = schema.get_field("num_likes").unwrap();
        let tempdir = TempDir::new().unwrap();
@@ -712,10 +733,14 @@ mod tests {
            .try_into()
            .unwrap();
        assert_eq!(reader.searcher().num_docs(), 0);
        test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
        test_index_on_commit_reload_policy_aux(field, &write_index, &reader)
        }
    }
    fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
    fn test_index_on_commit_reload_policy_aux(
        field: Field,
        index: &Index,
        reader: &IndexReader,
    ) -> crate::Result<()> {
        let mut reader_index = reader.index();
        let (sender, receiver) = crossbeam::channel::unbounded();
        let _watch_handle = reader_index
@@ -723,9 +748,9 @@ mod tests {
            .watch(WatchCallback::new(move || {
                let _ = sender.send(());
            }));
        let mut writer = index.writer_for_tests().unwrap();
        let mut writer = index.writer_for_tests()?;
        assert_eq!(reader.searcher().num_docs(), 0);
        writer.add_document(doc!(field=>1u64));
        writer.add_document(doc!(field=>1u64))?;
        writer.commit().unwrap();
        // We need a loop here because it is possible for notify to send more than
        // one modify event. It was observed on CI on macOS.
@@ -735,7 +760,7 @@ mod tests {
                break;
            }
        }
        writer.add_document(doc!(field=>2u64));
        writer.add_document(doc!(field=>2u64))?;
        writer.commit().unwrap();
        // ... Same as above
        loop {
@@ -744,37 +769,37 @@ mod tests {
                break;
            }
        }
        Ok(())
    }

    // This test will not pass on Windows, because Windows
    // prevents deleting files that are mmapped.
    #[cfg(not(target_os = "windows"))]
    #[test]
    fn garbage_collect_works_as_intended() {
    fn garbage_collect_works_as_intended() -> crate::Result<()> {
        let directory = RamDirectory::create();
        let schema = throw_away_schema();
        let field = schema.get_field("num_likes").unwrap();
        let index = Index::create(directory.clone(), schema, IndexSettings::default()).unwrap();
        let index = Index::create(directory.clone(), schema, IndexSettings::default())?;

        let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
        for i in 0u64..8_000u64 {
            writer.add_document(doc!(field => i));
            writer.add_document(doc!(field => i))?;
        }
        let (sender, receiver) = crossbeam::channel::unbounded();
        let _handle = directory.watch(WatchCallback::new(move || {
            let _ = sender.send(());
        }));
        writer.commit().unwrap();
        writer.commit()?;
        let mem_right_after_commit = directory.total_mem_usage();
        assert!(receiver.recv().is_ok());
        let reader = index
            .reader_builder()
            .reload_policy(ReloadPolicy::Manual)
            .try_into()
            .unwrap();
            .try_into()?;

        assert_eq!(reader.searcher().num_docs(), 8_000);
        writer.wait_merging_threads().unwrap();
        writer.wait_merging_threads()?;
        let mem_right_after_merge_finished = directory.total_mem_usage();

        reader.reload().unwrap();
@@ -786,5 +811,6 @@ mod tests {
            mem_right_after_merge_finished,
            mem_right_after_commit
        );
        Ok(())
    }
}

@@ -101,6 +101,7 @@ impl SegmentMeta {

    /// Returns the list of files that
    /// are required for the segment meta.
    /// Note: Some of the returned files may not exist, depending on the state of the segment.
    ///
    /// This is useful as the way tantivy removes files
    /// is by removing all files that have been created by tantivy

@@ -1,6 +1,5 @@
use std::io;

use crate::common::BinarySerializable;
use crate::directory::FileSlice;
use crate::positions::PositionReader;
use crate::postings::TermInfo;
@@ -8,6 +7,7 @@ use crate::postings::{BlockSegmentPostings, SegmentPostings};
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::termdict::TermDictionary;
use common::BinarySerializable;

/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.

@@ -88,7 +88,7 @@ impl Searcher {
        &self.segment_readers
    }

    /// Returns the segment_reader associated with the given segment_ordinal.
    /// Returns the segment_reader associated with the given segment_ord.
    pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
        &self.segment_readers[segment_ord as usize]
    }

@@ -2,8 +2,11 @@ use crate::core::InvertedIndexReader;
use crate::core::Segment;
use crate::core::SegmentComponent;
use crate::core::SegmentId;
use crate::directory::CompositeFile;
use crate::directory::FileSlice;
use crate::fastfield::DeleteBitSet;
use crate::error::DataCorruption;
use crate::fastfield::intersect_alive_bitsets;
use crate::fastfield::AliveBitSet;
use crate::fastfield::FacetReader;
use crate::fastfield::FastFieldReaders;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
@@ -14,7 +17,6 @@ use crate::space_usage::SegmentSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermDictionary;
use crate::DocId;
use crate::{common::CompositeFile, error::DataCorruption};
use fail::fail_point;
use std::fmt;
use std::sync::Arc;
@@ -46,7 +48,7 @@ pub struct SegmentReader {
    fieldnorm_readers: FieldNormReaders,

    store_file: FileSlice,
    delete_bitset_opt: Option<DeleteBitSet>,
    alive_bitset_opt: Option<AliveBitSet>,
    schema: Schema,
}

@@ -71,14 +73,12 @@ impl SegmentReader {
    /// Returns the number of documents that have been
    /// deleted in the segment.
    pub fn num_deleted_docs(&self) -> DocId {
        self.delete_bitset()
            .map(|delete_set| delete_set.num_deleted() as DocId)
            .unwrap_or(0u32)
        self.max_doc - self.num_docs
    }

    /// Returns true iff some of the documents of the segment have been deleted.
    pub fn has_deletes(&self) -> bool {
        self.delete_bitset().is_some()
        self.num_deleted_docs() > 0
    }

    /// Accessor to a segment's fast field reader given a field.
@@ -141,6 +141,14 @@ impl SegmentReader {

    /// Open a new segment for reading.
    pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
        Self::open_with_custom_alive_set(segment, None)
    }

    /// Open a new segment for reading, intersecting deletes with a custom alive set.
    pub fn open_with_custom_alive_set(
        segment: &Segment,
        custom_bitset: Option<AliveBitSet>,
    ) -> crate::Result<SegmentReader> {
        let termdict_file = segment.open_read(SegmentComponent::Terms)?;
        let termdict_composite = CompositeFile::open(&termdict_file)?;

@@ -165,29 +173,36 @@ impl SegmentReader {
        let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
        let fast_field_readers =
            Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));

        let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
        let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;

        let delete_bitset_opt = if segment.meta().has_deletes() {
            let delete_data = segment.open_read(SegmentComponent::Delete)?;
            let delete_bitset = DeleteBitSet::open(delete_data)?;
            Some(delete_bitset)
        let original_bitset = if segment.meta().has_deletes() {
            let delete_file_slice = segment.open_read(SegmentComponent::Delete)?;
            let delete_data = delete_file_slice.read_bytes()?;
            Some(AliveBitSet::open(delete_data))
        } else {
            None
        };

        let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset);

        let max_doc = segment.meta().max_doc();
        let num_docs = alive_bitset_opt
            .as_ref()
            .map(|alive_bitset| alive_bitset.num_alive_docs() as u32)
            .unwrap_or(max_doc);

        Ok(SegmentReader {
            inv_idx_reader_cache: Default::default(),
            max_doc: segment.meta().max_doc(),
            num_docs: segment.meta().num_docs(),
            num_docs,
            max_doc,
            termdict_composite,
            postings_composite,
            fast_fields_readers: fast_field_readers,
            fieldnorm_readers,
            segment_id: segment.id(),
            store_file,
            delete_bitset_opt,
            alive_bitset_opt,
            positions_composite,
            schema,
        })
@@ -273,21 +288,25 @@ impl SegmentReader {

    /// Returns the bitset representing
    /// the documents that have been deleted.
    pub fn delete_bitset(&self) -> Option<&DeleteBitSet> {
        self.delete_bitset_opt.as_ref()
    pub fn alive_bitset(&self) -> Option<&AliveBitSet> {
        self.alive_bitset_opt.as_ref()
    }

    /// Returns true iff the `doc` is marked
    /// as deleted.
    pub fn is_deleted(&self, doc: DocId) -> bool {
        self.delete_bitset()
        self.alive_bitset()
            .map(|delete_set| delete_set.is_deleted(doc))
            .unwrap_or(false)
    }

    /// Returns an iterator that will iterate over the alive document ids.
    pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
        (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
    pub fn doc_ids_alive(&self) -> Box<dyn Iterator<Item = DocId> + '_> {
        if let Some(alive_bitset) = &self.alive_bitset_opt {
            Box::new(alive_bitset.iter_alive())
        } else {
            Box::new(0u32..self.max_doc)
        }
    }

    /// Summarizes the total space usage of this segment.
@@ -300,14 +319,29 @@ impl SegmentReader {
            self.fast_fields_readers.space_usage(),
            self.fieldnorm_readers.space_usage(),
            self.get_store_reader()?.space_usage(),
            self.delete_bitset_opt
            self.alive_bitset_opt
                .as_ref()
                .map(DeleteBitSet::space_usage)
                .map(AliveBitSet::space_usage)
                .unwrap_or(0),
        ))
    }
}

fn intersect_alive_bitset(
    left_opt: Option<AliveBitSet>,
    right_opt: Option<AliveBitSet>,
) -> Option<AliveBitSet> {
    match (left_opt, right_opt) {
        (Some(left), Some(right)) => {
            assert_eq!(left.bitset().max_value(), right.bitset().max_value());
            Some(intersect_alive_bitsets(left, right))
        }
        (Some(left), None) => Some(left),
        (None, Some(right)) => Some(right),
        (None, None) => None,
    }
}
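
The helper above is an instance of a small generic pattern: combine two optional filters, applying the merge only when both are present. A standalone sketch of the same shape (hypothetical names, not tantivy API):

fn merge_opts<T>(left: Option<T>, right: Option<T>, merge: impl Fn(T, T) -> T) -> Option<T> {
    match (left, right) {
        (Some(l), Some(r)) => Some(merge(l, r)),
        // exactly one (or neither) side present: no merging needed
        (l, r) => l.or(r),
    }
}

fn main() {
    assert_eq!(merge_opts(Some(6), Some(3), |a, b| a & b), Some(2));
    assert_eq!(merge_opts(Some(6), None, |a, b| a & b), Some(6));
    assert_eq!(merge_opts::<u32>(None, None, |a, b| a & b), None);
}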

impl fmt::Debug for SegmentReader {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "SegmentReader({:?})", self.segment_id)
@@ -330,10 +364,10 @@ mod test {

    {
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(name => "tantivy"));
        index_writer.add_document(doc!(name => "horse"));
        index_writer.add_document(doc!(name => "jockey"));
        index_writer.add_document(doc!(name => "cap"));
        index_writer.add_document(doc!(name => "tantivy"))?;
        index_writer.add_document(doc!(name => "horse"))?;
        index_writer.add_document(doc!(name => "jockey"))?;
        index_writer.add_document(doc!(name => "cap"))?;
        // we should now have one segment with two docs
        index_writer.delete_term(Term::from_field_text(name, "horse"));
        index_writer.delete_term(Term::from_field_text(name, "cap"));
@@ -356,10 +390,10 @@ mod test {

    {
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(name => "tantivy"));
        index_writer.add_document(doc!(name => "horse"));
        index_writer.add_document(doc!(name => "jockey"));
        index_writer.add_document(doc!(name => "cap"));
        index_writer.add_document(doc!(name => "tantivy"))?;
        index_writer.add_document(doc!(name => "horse"))?;
        index_writer.add_document(doc!(name => "jockey"))?;
        index_writer.add_document(doc!(name => "cap"))?;
        // we should now have one segment with two docs
        index_writer.commit()?;
    }

@@ -1,18 +1,17 @@
use crate::common::BinarySerializable;
use crate::common::CountingWriter;
use crate::common::VInt;
use crate::directory::FileSlice;
use crate::directory::{TerminatingWrite, WritePtr};
use crate::schema::Field;
use crate::space_usage::FieldUsage;
use crate::space_usage::PerFieldSpaceUsage;
use common::BinarySerializable;
use common::CountingWriter;
use common::HasLen;
use common::VInt;
use std::collections::HashMap;
use std::io::{self, Read, Write};
use std::iter::ExactSizeIterator;
use std::ops::Range;

use super::HasLen;

#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
pub struct FileAddr {
    field: Field,
@@ -188,10 +187,10 @@ impl CompositeFile {
mod test {

    use super::{CompositeFile, CompositeWrite};
    use crate::common::BinarySerializable;
    use crate::common::VInt;
    use crate::directory::{Directory, RamDirectory};
    use crate::schema::Field;
    use common::BinarySerializable;
    use common::VInt;
    use std::io::Write;
    use std::path::Path;

@@ -230,3 +230,15 @@ where
        Box::new(self.clone())
    }
}

impl Clone for Box<dyn Directory> {
    fn clone(&self) -> Self {
        self.box_clone()
    }
}

impl<T: Directory + 'static> From<T> for Box<dyn Directory> {
    fn from(t: T) -> Self {
        Box::new(t)
    }
}
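
With this blanket `From` impl, any API that accepts `T: Into<Box<dyn Directory>>` can take a concrete directory directly. A small sketch of the calling convention (the `boxed_dir` wrapper is hypothetical):

use tantivy::directory::{Directory, RamDirectory};

fn boxed_dir<T: Into<Box<dyn Directory>>>(dir: T) -> Box<dyn Directory> {
    // conversion happens once at the boundary;
    // the rest of the code works with the trait object
    dir.into()
}

fn main() {
    let boxed: Box<dyn Directory> = boxed_dir(RamDirectory::create());
    // Box<dyn Directory> is itself Clone, thanks to the impl above
    let _copy = boxed.clone();
}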

@@ -1,7 +1,7 @@
use stable_deref_trait::StableDeref;

use crate::common::HasLen;
use crate::directory::OwnedBytes;
use common::HasLen;
use std::fmt;
use std::ops::Range;
use std::sync::{Arc, Weak};
@@ -32,12 +32,6 @@ impl FileHandle for &'static [u8] {
    }
}

impl<T: Deref<Target = [u8]>> HasLen for T {
    fn len(&self) -> usize {
        self.deref().len()
    }
}

impl<B> From<B> for FileSlice
where
    B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync,
@@ -178,7 +172,7 @@ impl HasLen for FileSlice {
#[cfg(test)]
mod tests {
    use super::{FileHandle, FileSlice};
    use crate::common::HasLen;
    use common::HasLen;
    use std::io;

    #[test]

@@ -1,10 +1,10 @@
use crate::directory::error::Incompatibility;
use crate::directory::FileSlice;
use crate::{
    common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen},
    directory::{AntiCallToken, TerminatingWrite},
    Version, INDEX_FORMAT_VERSION,
};
use common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen};
use crc32fast::Hasher;
use serde::{Deserialize, Serialize};
use std::io;
@@ -156,10 +156,8 @@ mod tests {

    use crate::directory::footer::Footer;
    use crate::directory::OwnedBytes;
    use crate::{
        common::BinarySerializable,
        directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice},
    };
    use crate::directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice};
    use common::BinarySerializable;
    use std::io;

    #[test]

@@ -1,4 +1,4 @@
use crate::core::{MANAGED_FILEPATH, META_FILEPATH};
use crate::core::MANAGED_FILEPATH;
use crate::directory::error::{DeleteError, LockError, OpenReadError, OpenWriteError};
use crate::directory::footer::{Footer, FooterProxy};
use crate::directory::GarbageCollectionResult;
@@ -64,7 +64,7 @@ fn save_managed_paths(

impl ManagedDirectory {
    /// Wraps a directory as a managed directory.
    pub fn wrap<Dir: Directory>(directory: Dir) -> crate::Result<ManagedDirectory> {
    pub fn wrap(directory: Box<dyn Directory>) -> crate::Result<ManagedDirectory> {
        match directory.atomic_read(&MANAGED_FILEPATH) {
            Ok(data) => {
                let managed_files_json = String::from_utf8_lossy(&data);
@@ -76,14 +76,14 @@ impl ManagedDirectory {
                    )
                })?;
                Ok(ManagedDirectory {
                    directory: Box::new(directory),
                    directory,
                    meta_informations: Arc::new(RwLock::new(MetaInformation {
                        managed_paths: managed_files,
                    })),
                })
            }
            Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
                directory: Box::new(directory),
                directory,
                meta_informations: Arc::default(),
            }),
            io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()),
@@ -248,24 +248,15 @@ impl ManagedDirectory {
        Ok(footer.crc() == crc)
    }

    /// Lists files for which the checksum does not match the content.
    pub fn list_damaged(&self) -> result::Result<HashSet<PathBuf>, OpenReadError> {
        let mut managed_paths = self
    /// Lists all managed files.
    pub fn list_managed_files(&self) -> HashSet<PathBuf> {
        let managed_paths = self
            .meta_informations
            .read()
            .expect("Managed directory rlock poisoned in list damaged.")
            .managed_paths
            .clone();

        managed_paths.remove(*META_FILEPATH);

        let mut damaged_files = HashSet::new();
        for path in managed_paths {
            if !self.validate_checksum(&path)? {
                damaged_files.insert(path);
            }
        }
        Ok(damaged_files)
        managed_paths
    }
}

@@ -336,7 +327,6 @@ mod tests_mmap_specific {

    use crate::directory::{Directory, ManagedDirectory, MmapDirectory, TerminatingWrite};
    use std::collections::HashSet;
    use std::fs::OpenOptions;
    use std::io::Write;
    use std::path::{Path, PathBuf};
    use tempfile::TempDir;
@@ -350,7 +340,7 @@ mod tests_mmap_specific {
        let test_path2: &'static Path = Path::new("some_path_for_test_2");
        {
            let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
            let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
            let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
            let write_file = managed_directory.open_write(test_path1).unwrap();
            write_file.terminate().unwrap();
            managed_directory
@@ -365,7 +355,7 @@ mod tests_mmap_specific {
        }
        {
            let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
            let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
            let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
            assert!(managed_directory.exists(test_path1).unwrap());
            assert!(!managed_directory.exists(test_path2).unwrap());
            let living_files: HashSet<PathBuf> = HashSet::new();
@@ -384,7 +374,7 @@ mod tests_mmap_specific {
        let living_files = HashSet::new();

        let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
        let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
        let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
        let mut write = managed_directory.open_write(test_path1).unwrap();
        write.write_all(&[0u8, 1u8]).unwrap();
        write.terminate().unwrap();
@@ -405,39 +395,4 @@ mod tests_mmap_specific {
        }
        assert!(!managed_directory.exists(test_path1).unwrap());
    }

    #[test]
    fn test_checksum() -> crate::Result<()> {
        let test_path1: &'static Path = Path::new("some_path_for_test");
        let test_path2: &'static Path = Path::new("other_test_path");

        let tempdir = TempDir::new().unwrap();
        let tempdir_path = PathBuf::from(tempdir.path());

        let mmap_directory = MmapDirectory::open(&tempdir_path)?;
        let managed_directory = ManagedDirectory::wrap(mmap_directory)?;
        let mut write = managed_directory.open_write(test_path1)?;
        write.write_all(&[0u8, 1u8])?;
        write.terminate()?;

        let mut write = managed_directory.open_write(test_path2)?;
        write.write_all(&[3u8, 4u8, 5u8])?;
        write.terminate()?;

        let read_file = managed_directory.open_read(test_path2)?.read_bytes()?;
        assert_eq!(read_file.as_slice(), &[3u8, 4u8, 5u8]);
        assert!(managed_directory.list_damaged().unwrap().is_empty());

        let mut corrupted_path = tempdir_path;
        corrupted_path.push(test_path2);
        let mut file = OpenOptions::new().write(true).open(&corrupted_path)?;
        file.write_all(&[255u8])?;
        file.flush()?;
        drop(file);

        let damaged = managed_directory.list_damaged()?;
        assert_eq!(damaged.len(), 1);
        assert!(damaged.contains(test_path2));
        Ok(())
    }
}

@@ -74,20 +74,12 @@ pub struct CacheInfo {
    pub mmapped: Vec<PathBuf>,
}

#[derive(Default)]
struct MmapCache {
    counters: CacheCounters,
    cache: HashMap<PathBuf, WeakArcBytes>,
}

impl Default for MmapCache {
    fn default() -> MmapCache {
        MmapCache {
            counters: CacheCounters::default(),
            cache: HashMap::new(),
        }
    }
}

impl MmapCache {
    fn get_info(&self) -> CacheInfo {
        let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
@@ -242,7 +234,7 @@ impl MmapDirectory {
    }

    let fd = open_opts.open(&self.inner.root_path)?;
    fd.sync_all()?;
    fd.sync_data()?;
    Ok(())
}

@@ -296,8 +288,7 @@ impl Write for SafeFileWriter {
    }

    fn flush(&mut self) -> io::Result<()> {
        self.0.flush()?;
        self.0.sync_all()
        Ok(())
    }
}

@@ -309,7 +300,9 @@ impl Seek for SafeFileWriter {

impl TerminatingWrite for SafeFileWriter {
    fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
        self.flush()
        self.0.flush()?;
        self.0.sync_data()?;
        Ok(())
    }
}
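
The change above moves durability out of `flush` and into `terminate_ref`: intermediate flushes become cheap, and the file is synced to disk exactly once, when the writer is finished. The shape of the pattern as a standalone sketch (the `FinalizingWriter` type is hypothetical, not the tantivy type):

use std::fs::File;
use std::io::{self, Write};

struct FinalizingWriter(File);

impl Write for FinalizingWriter {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.0.write(buf)
    }
    // flush does not hit the disk: cheap to call often
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}

impl FinalizingWriter {
    // the single point where data is made durable
    fn terminate(mut self) -> io::Result<()> {
        self.0.flush()?;
        self.0.sync_data()
    }
}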

@@ -339,6 +332,7 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
    let mut tempfile = tempfile::Builder::new().tempfile_in(&parent_path)?;
    tempfile.write_all(content)?;
    tempfile.flush()?;
    tempfile.as_file_mut().sync_data()?;
    tempfile.into_temp_path().persist(path)?;
    Ok(())
}
@@ -485,13 +479,14 @@ mod tests {
    // The following tests are specific to the MmapDirectory

    use super::*;
    use crate::indexer::LogMergePolicy;
    use crate::Index;
    use crate::ReloadPolicy;
    use crate::{common::HasLen, indexer::LogMergePolicy};
    use crate::{
        schema::{Schema, SchemaBuilder, TEXT},
        IndexSettings,
    };
    use common::HasLen;

    #[test]
    fn test_open_non_existent_path() {
@@ -581,8 +576,8 @@ mod tests {
    }

    #[test]
    fn test_mmap_released() {
        let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
    fn test_mmap_released() -> crate::Result<()> {
        let mmap_directory = MmapDirectory::create_from_tempdir()?;
        let mut schema_builder: SchemaBuilder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
@@ -591,31 +586,30 @@ mod tests {
        let index =
            Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();

        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer = index.writer_for_tests()?;
        let mut log_merge_policy = LogMergePolicy::default();
        log_merge_policy.set_min_num_segments(3);
        index_writer.set_merge_policy(Box::new(log_merge_policy));
        for _num_commits in 0..10 {
        for _ in 0..10 {
            index_writer.add_document(doc!(text_field=>"abc"));
            index_writer.add_document(doc!(text_field=>"abc"))?;
        }
        index_writer.commit().unwrap();
        index_writer.commit()?;
        }

        let reader = index
            .reader_builder()
            .reload_policy(ReloadPolicy::Manual)
            .try_into()
            .unwrap();
            .try_into()?;

        for _ in 0..4 {
            index_writer.add_document(doc!(text_field=>"abc"));
            index_writer.commit().unwrap();
            reader.reload().unwrap();
            index_writer.add_document(doc!(text_field=>"abc"))?;
            index_writer.commit()?;
            reader.reload()?;
        }
        index_writer.wait_merging_threads().unwrap();
        index_writer.wait_merging_threads()?;

        reader.reload().unwrap();
        reader.reload()?;
        let num_segments = reader.searcher().segment_readers().len();
        assert!(num_segments <= 4);
        let num_components_except_deletes_and_tempstore =
@@ -626,5 +620,6 @@ mod tests {
        );
        }
        assert!(mmap_directory.get_cache_info().mmapped.is_empty());
        Ok(())
    }
}

@@ -20,6 +20,9 @@ mod watch_event_router;
/// Errors specific to the directory module.
pub mod error;

mod composite_file;

pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::directory::DirectoryLock;
pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};

@@ -1,9 +1,10 @@
use crate::core::META_FILEPATH;
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
use crate::directory::AntiCallToken;
use crate::directory::WatchCallbackList;
use crate::directory::{Directory, FileSlice, WatchCallback, WatchHandle};
use crate::directory::{TerminatingWrite, WritePtr};
use crate::{common::HasLen, core::META_FILEPATH};
use common::HasLen;
use fail::fail_point;
use std::collections::HashMap;
use std::fmt;
@@ -17,13 +18,6 @@ use super::FileHandle;
/// Writer associated with the `RamDirectory`.
///
/// The writer just writes to a buffer.
///
/// # Panics
///
/// On drop, if the writer was left in a *dirty* state.
/// That is, if flush was not called after the last call
/// to write.
///
struct VecWriter {
    path: PathBuf,
    shared_directory: RamDirectory,
@@ -45,7 +39,7 @@ impl VecWriter {
impl Drop for VecWriter {
    fn drop(&mut self) {
        if !self.is_flushed {
            panic!(
            warn!(
                "You forgot to flush {:?} before its writer got dropped. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
                self.path
            )
@@ -220,14 +214,8 @@ impl Directory for RamDirectory {
    }

    fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> {
        fail_point!("RamDirectory::atomic_write", |msg| Err(io::Error::new(
            io::ErrorKind::Other,
            msg.unwrap_or_else(|| "Undefined".to_string())
        )));
        let path_buf = PathBuf::from(path);

        self.fs.write().unwrap().write(path_buf, data);

        if path == *META_FILEPATH {
            let _ = self.fs.write().unwrap().watch_router.broadcast();
        }

@@ -118,15 +118,6 @@ mod ram_directory_tests {
        }
    }

    #[test]
    #[should_panic]
    fn ram_directory_panics_if_flush_forgotten() {
        let test_path: &'static Path = Path::new("some_path_for_test");
        let ram_directory = RamDirectory::create();
        let mut write_file = ram_directory.open_write(test_path).unwrap();
        assert!(write_file.write_all(&[4]).is_ok());
    }

    fn test_simple(directory: &dyn Directory) -> crate::Result<()> {
        let test_path: &'static Path = Path::new("some_path_for_test");
        let mut write_file = directory.open_write(test_path)?;

@@ -1,4 +1,4 @@
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::DocId;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
@@ -85,11 +85,11 @@ pub trait DocSet: Send {
|
||||
|
||||
/// Returns the number documents matching.
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
|
||||
let mut count = 0u32;
|
||||
let mut doc = self.doc();
|
||||
while doc != TERMINATED {
|
||||
if !delete_bitset.is_deleted(doc) {
|
||||
if alive_bitset.is_alive(doc) {
|
||||
count += 1u32;
|
||||
}
|
||||
doc = self.advance();
|
||||
@@ -130,8 +130,8 @@ impl<'a> DocSet for &'a mut dyn DocSet {
|
||||
(**self).size_hint()
|
||||
}
|
||||
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
(**self).count(delete_bitset)
|
||||
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
|
||||
(**self).count(alive_bitset)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
@@ -160,9 +160,9 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.size_hint()
|
||||
}
|
||||
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count(delete_bitset)
|
||||
unboxed.count(alive_bitset)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
|
||||
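The rename is more than cosmetic: the counting loop now asks the positive question (`is_alive`) instead of negating `is_deleted`. A hedged, standalone sketch of the same pattern, with `TERMINATED` as tantivy's end-of-docset sentinel (`u32::MAX`):

// Hedged sketch of the counting pattern adopted above: walk a cursor
// of doc ids and count only those the AliveBitSet still considers alive.
use tantivy::fastfield::AliveBitSet;

const TERMINATED: u32 = u32::MAX; // end-of-docset sentinel, as in the diff

fn count_alive(mut advance: impl FnMut() -> u32, alive_bitset: &AliveBitSet) -> u32 {
    let mut count = 0u32;
    let mut doc = advance();
    while doc != TERMINATED {
        if alive_bitset.is_alive(doc) {
            count += 1u32;
        }
        doc = advance();
    }
    count
}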
224
src/fastfield/alive_bitset.rs
Normal file
@@ -0,0 +1,224 @@
use crate::space_usage::ByteCount;
use crate::DocId;
use common::intersect_bitsets;
use common::BitSet;
use common::ReadOnlyBitSet;
use ownedbytes::OwnedBytes;
use std::io;
use std::io::Write;

/// Write an alive `BitSet`,
///
/// where `alive_bitset` is the set of alive `DocId`.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io::Result<()> {
    alive_bitset.serialize(writer)?;
    Ok(())
}

/// Set of alive `DocId`s.
#[derive(Clone)]
pub struct AliveBitSet {
    num_alive_docs: usize,
    bitset: ReadOnlyBitSet,
}

/// Intersects two AliveBitSets into a new one.
/// The two bitsets need to have the same max_value.
pub fn intersect_alive_bitsets(left: AliveBitSet, right: AliveBitSet) -> AliveBitSet {
    assert_eq!(left.bitset().max_value(), right.bitset().max_value());
    let bitset = intersect_bitsets(left.bitset(), right.bitset());
    let num_alive_docs = bitset.len();
    AliveBitSet {
        num_alive_docs,
        bitset,
    }
}

impl AliveBitSet {
    #[cfg(test)]
    pub(crate) fn for_test_from_deleted_docs(deleted_docs: &[DocId], max_doc: u32) -> AliveBitSet {
        assert!(deleted_docs.iter().all(|&doc| doc < max_doc));
        let mut bitset = BitSet::with_max_value_and_full(max_doc);
        for &doc in deleted_docs {
            bitset.remove(doc);
        }
        let mut alive_bitset_buffer = Vec::new();
        write_alive_bitset(&bitset, &mut alive_bitset_buffer).unwrap();
        let alive_bitset_bytes = OwnedBytes::new(alive_bitset_buffer);
        Self::open(alive_bitset_bytes)
    }

    pub(crate) fn from_bitset(bitset: &BitSet) -> AliveBitSet {
        let readonly_bitset = ReadOnlyBitSet::from(bitset);
        AliveBitSet::from(readonly_bitset)
    }

    /// Opens an alive bitset given its serialized bytes.
    pub fn open(bytes: OwnedBytes) -> AliveBitSet {
        let bitset = ReadOnlyBitSet::open(bytes);
        AliveBitSet::from(bitset)
    }

    /// Returns true iff the document is still "alive". In other words, if it has not been deleted.
    #[inline]
    pub fn is_alive(&self, doc: DocId) -> bool {
        self.bitset.contains(doc)
    }

    /// Returns true iff the document has been marked as deleted.
    #[inline]
    pub fn is_deleted(&self, doc: DocId) -> bool {
        !self.is_alive(doc)
    }

    /// Iterate over the alive doc_ids.
    #[inline]
    pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ {
        self.bitset.iter()
    }

    /// Get the underlying bitset.
    #[inline]
    pub fn bitset(&self) -> &ReadOnlyBitSet {
        &self.bitset
    }

    /// The number of alive docs.
    pub fn num_alive_docs(&self) -> usize {
        self.num_alive_docs
    }

    /// Summarize total space usage of this bitset.
    pub fn space_usage(&self) -> ByteCount {
        self.bitset().num_bytes()
    }
}

impl From<ReadOnlyBitSet> for AliveBitSet {
    fn from(bitset: ReadOnlyBitSet) -> AliveBitSet {
        let num_alive_docs = bitset.len();
        AliveBitSet {
            num_alive_docs,
            bitset,
        }
    }
}

#[cfg(test)]
mod tests {

    use super::AliveBitSet;

    #[test]
    fn test_alive_bitset_empty() {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[], 10);
        for doc in 0..10 {
            assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
            assert!(!alive_bitset.is_deleted(doc));
        }
        assert_eq!(alive_bitset.num_alive_docs(), 10);
    }

    #[test]
    fn test_alive_bitset() {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[1, 9], 10);
        assert!(alive_bitset.is_alive(0));
        assert!(alive_bitset.is_deleted(1));
        assert!(alive_bitset.is_alive(2));
        assert!(alive_bitset.is_alive(3));
        assert!(alive_bitset.is_alive(4));
        assert!(alive_bitset.is_alive(5));
        assert!(alive_bitset.is_alive(6));
        assert!(alive_bitset.is_alive(7));
        assert!(alive_bitset.is_alive(8));
        assert!(alive_bitset.is_deleted(9));
        for doc in 0..10 {
            assert_eq!(alive_bitset.is_deleted(doc), !alive_bitset.is_alive(doc));
        }
        assert_eq!(alive_bitset.num_alive_docs(), 8);
    }

    #[test]
    fn test_alive_bitset_iter_minimal() {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[7], 8);

        let data: Vec<_> = alive_bitset.iter_alive().collect();
        assert_eq!(data, vec![0, 1, 2, 3, 4, 5, 6]);
    }

    #[test]
    fn test_alive_bitset_iter_small() {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 2, 3, 6], 7);

        let data: Vec<_> = alive_bitset.iter_alive().collect();
        assert_eq!(data, vec![1, 4, 5]);
    }
    #[test]
    fn test_alive_bitset_iter() {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000], 1001);

        let data: Vec<_> = alive_bitset.iter_alive().collect();
        assert_eq!(data, (2..=999).collect::<Vec<_>>());
    }
}

#[cfg(all(test, feature = "unstable"))]
mod bench {

    use super::AliveBitSet;
    use rand::prelude::IteratorRandom;
    use rand::thread_rng;
    use test::Bencher;

    fn get_alive() -> Vec<u32> {
        let mut data = (0..1_000_000_u32).collect::<Vec<u32>>();
        for _ in 0..(1_000_000) * 1 / 8 {
            remove_rand(&mut data);
        }
        data
    }

    fn remove_rand(raw: &mut Vec<u32>) {
        let i = (0..raw.len()).choose(&mut thread_rng()).unwrap();
        raw.remove(i);
    }

    #[bench]
    fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);

        bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
    }

    #[bench]
    fn bench_deletebitset_access(bench: &mut Bencher) {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);

        bench.iter(|| {
            (0..1_000_000_u32)
                .filter(|doc| alive_bitset.is_alive(*doc))
                .collect::<Vec<_>>()
        });
    }

    #[bench]
    fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);

        bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
    }

    #[bench]
    fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
        let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);

        bench.iter(|| {
            (0..1_000_000_u32)
                .filter(|doc| alive_bitset.is_alive(*doc))
                .collect::<Vec<_>>()
        });
    }
}
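Outside of tests, an `AliveBitSet` is produced by serializing a full `BitSet` minus the deletions and reopening the bytes, exactly as the test helper above does. A hedged end-to-end sketch (`common` and `ownedbytes` are tantivy-internal crates, so these import paths are assumptions):

// Hedged sketch: serialize an alive bitset and reopen it through the
// public surface introduced in this file.
use common::BitSet;
use ownedbytes::OwnedBytes;
use tantivy::fastfield::{write_alive_bitset, AliveBitSet};

fn roundtrip() -> std::io::Result<()> {
    let mut bitset = BitSet::with_max_value_and_full(5);
    bitset.remove(3); // mark doc 3 as deleted
    let mut buffer: Vec<u8> = Vec::new();
    write_alive_bitset(&bitset, &mut buffer)?;
    let alive = AliveBitSet::open(OwnedBytes::new(buffer));
    assert!(alive.is_alive(0));
    assert!(alive.is_deleted(3));
    assert_eq!(alive.num_alive_docs(), 4);
    assert_eq!(alive.iter_alive().collect::<Vec<_>>(), vec![0, 1, 2, 4]);
    Ok(())
}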
@@ -18,11 +18,11 @@ mod tests {
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]));
        index_writer.add_document(doc!(bytes_field=>vec![]));
        index_writer.add_document(doc!(bytes_field=>vec![255u8]));
        index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]));
        index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]));
        index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]))?;
        index_writer.add_document(doc!(bytes_field=>vec![]))?;
        index_writer.add_document(doc!(bytes_field=>vec![255u8]))?;
        index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]))?;
        index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]))?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let segment_reader = searcher.segment_reader(0);
@@ -47,7 +47,7 @@ mod tests {
        index_writer.add_document(doc!(
            field => b"tantivy".as_ref(),
            field => b"lucene".as_ref()
        ));
        ))?;
        index_writer.commit()?;
        Ok(index.reader()?.searcher())
    }

@@ -1,143 +0,0 @@
use crate::common::{BitSet, HasLen};
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::WritePtr;
use crate::space_usage::ByteCount;
use crate::DocId;
use std::io;
use std::io::Write;

/// Write a delete `BitSet`
///
/// where `delete_bitset` is the set of deleted `DocId`.
/// Warning: this function does not call terminate. The caller is in charge of
/// closing the writer properly.
pub fn write_delete_bitset(
    delete_bitset: &BitSet,
    max_doc: u32,
    writer: &mut WritePtr,
) -> io::Result<()> {
    let mut byte = 0u8;
    let mut shift = 0u8;
    for doc in 0..max_doc {
        if delete_bitset.contains(doc) {
            byte |= 1 << shift;
        }
        if shift == 7 {
            writer.write_all(&[byte])?;
            shift = 0;
            byte = 0;
        } else {
            shift += 1;
        }
    }
    if max_doc % 8 > 0 {
        writer.write_all(&[byte])?;
    }
    Ok(())
}

/// Set of deleted `DocId`s.
#[derive(Clone)]
pub struct DeleteBitSet {
    data: OwnedBytes,
    num_deleted: usize,
}

impl DeleteBitSet {
    #[cfg(test)]
    pub(crate) fn for_test(docs: &[DocId], max_doc: u32) -> DeleteBitSet {
        use crate::directory::{Directory, RamDirectory, TerminatingWrite};
        use std::path::Path;
        assert!(docs.iter().all(|&doc| doc < max_doc));
        let mut bitset = BitSet::with_max_value(max_doc);
        for &doc in docs {
            bitset.insert(doc);
        }
        let directory = RamDirectory::create();
        let path = Path::new("dummydeletebitset");
        let mut wrt = directory.open_write(path).unwrap();
        write_delete_bitset(&bitset, max_doc, &mut wrt).unwrap();
        wrt.terminate().unwrap();
        let file = directory.open_read(path).unwrap();
        Self::open(file).unwrap()
    }

    /// Opens a delete bitset given its file.
    pub fn open(file: FileSlice) -> crate::Result<DeleteBitSet> {
        let bytes = file.read_bytes()?;
        let num_deleted: usize = bytes
            .as_slice()
            .iter()
            .map(|b| b.count_ones() as usize)
            .sum();
        Ok(DeleteBitSet {
            data: bytes,
            num_deleted,
        })
    }

    /// Returns true iff the document is still "alive". In other words, if it has not been deleted.
    pub fn is_alive(&self, doc: DocId) -> bool {
        !self.is_deleted(doc)
    }

    /// Returns true iff the document has been marked as deleted.
    #[inline]
    pub fn is_deleted(&self, doc: DocId) -> bool {
        let byte_offset = doc / 8u32;
        let b: u8 = self.data.as_slice()[byte_offset as usize];
        let shift = (doc & 7u32) as u8;
        b & (1u8 << shift) != 0
    }

    /// The number of deleted docs
    pub fn num_deleted(&self) -> usize {
        self.num_deleted
    }
    /// Summarize total space usage of this bitset.
    pub fn space_usage(&self) -> ByteCount {
        self.data.len()
    }
}

impl HasLen for DeleteBitSet {
    fn len(&self) -> usize {
        self.num_deleted
    }
}

#[cfg(test)]
mod tests {
    use super::DeleteBitSet;
    use crate::common::HasLen;

    #[test]
    fn test_delete_bitset_empty() {
        let delete_bitset = DeleteBitSet::for_test(&[], 10);
        for doc in 0..10 {
            assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
        }
        assert_eq!(delete_bitset.len(), 0);
    }

    #[test]
    fn test_delete_bitset() {
        let delete_bitset = DeleteBitSet::for_test(&[1, 9], 10);
        assert!(delete_bitset.is_alive(0));
        assert!(delete_bitset.is_deleted(1));
        assert!(delete_bitset.is_alive(2));
        assert!(delete_bitset.is_alive(3));
        assert!(delete_bitset.is_alive(4));
        assert!(delete_bitset.is_alive(5));
        assert!(delete_bitset.is_alive(6));
        assert!(delete_bitset.is_alive(7));
        assert!(delete_bitset.is_alive(8));
        assert!(delete_bitset.is_deleted(9));
        for doc in 0..10 {
            assert_eq!(delete_bitset.is_deleted(doc), !delete_bitset.is_alive(doc));
        }
        assert_eq!(delete_bitset.len(), 2);
    }
}
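The removed format packed one document per bit, least-significant bit first within each byte, as the writer loop above shows. A minimal sketch of the corresponding lookup, just to make the layout explicit:

// Minimal sketch of the removed on-disk layout: doc `d` lives at bit
// (d % 8) of byte (d / 8); a set bit means "deleted".
fn is_deleted(data: &[u8], doc: u32) -> bool {
    let byte = data[(doc / 8) as usize];
    byte & (1u8 << (doc % 8)) != 0
}

fn main() {
    // Bit 1 of byte 0 marks doc 1 deleted; bit 1 of byte 1 marks doc 9.
    let data = [0b0000_0010u8, 0b0000_0010u8];
    assert!(is_deleted(&data, 1));
    assert!(is_deleted(&data, 9));
    assert!(!is_deleted(&data, 0));
}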
@@ -95,7 +95,7 @@ mod tests {
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher
@@ -118,7 +118,7 @@ mod tests {
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher
@@ -141,7 +141,7 @@ mod tests {
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher
@@ -164,7 +164,7 @@ mod tests {
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher
@@ -187,8 +187,8 @@ mod tests {
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
        index_writer.add_document(Document::default());
        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
        index_writer.add_document(Document::default())?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher
@@ -210,8 +210,8 @@ mod tests {
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(Document::default());
        index_writer.add_document(Document::default());
        index_writer.add_document(Document::default())?;
        index_writer.add_document(Document::default())?;
        index_writer.commit()?;
        let searcher = index.reader()?.searcher();
        let facet_reader = searcher

@@ -23,9 +23,10 @@ values stored.
Read access performance is comparable to that of an array lookup.
*/

pub use self::alive_bitset::intersect_alive_bitsets;
pub use self::alive_bitset::write_alive_bitset;
pub use self::alive_bitset::AliveBitSet;
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
@@ -40,14 +41,14 @@ pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::Cardinality;
use crate::schema::FieldType;
use crate::schema::Value;
use crate::DocId;
use crate::{
    chrono::{NaiveDateTime, Utc},
    schema::Type,
};
use crate::{common, DocId};

mod alive_bitset;
mod bytes;
mod delete;
mod error;
mod facet_reader;
mod multivalued;
@@ -213,8 +214,7 @@ fn value_to_u64(value: &Value) -> u64 {
mod tests {

    use super::*;
    use crate::common::CompositeFile;
    use crate::common::HasLen;
    use crate::directory::CompositeFile;
    use crate::directory::{Directory, RamDirectory, WritePtr};
    use crate::merge_policy::NoMergePolicy;
    use crate::schema::Field;
@@ -222,6 +222,7 @@ mod tests {
    use crate::schema::FAST;
    use crate::schema::{Document, IntOptions};
    use crate::{Index, SegmentId, SegmentReader};
    use common::HasLen;
    use once_cell::sync::Lazy;
    use rand::prelude::SliceRandom;
    use rand::rngs::StdRng;
@@ -496,18 +497,18 @@ mod tests {
    }

    #[test]
    fn test_merge_missing_date_fast_field() {
    fn test_merge_missing_date_fast_field() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let date_field = schema_builder.add_date_field("date", FAST);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.set_merge_policy(Box::new(NoMergePolicy));
        index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
        index_writer.commit().unwrap();
        index_writer.add_document(doc!());
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()))?;
        index_writer.commit()?;
        index_writer.add_document(doc!())?;
        index_writer.commit()?;
        let reader = index.reader()?;
        let segment_ids: Vec<SegmentId> = reader
            .searcher()
            .segment_readers()
@@ -516,10 +517,10 @@ mod tests {
            .collect();
        assert_eq!(segment_ids.len(), 2);
        let merge_future = index_writer.merge(&segment_ids[..]);
        let merge_res = futures::executor::block_on(merge_future);
        assert!(merge_res.is_ok());
        assert!(reader.reload().is_ok());
        futures::executor::block_on(merge_future)?;
        reader.reload()?;
        assert_eq!(reader.searcher().segment_readers().len(), 1);
        Ok(())
    }

    #[test]
@@ -528,7 +529,7 @@ mod tests {
    }

    #[test]
    fn test_datefastfield() {
    fn test_datefastfield() -> crate::Result<()> {
        use crate::fastfield::FastValue;
        let mut schema_builder = Schema::builder();
        let date_field = schema_builder.add_date_field("date", FAST);
@@ -538,22 +539,22 @@ mod tests {
        );
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer = index.writer_for_tests()?;
        index_writer.set_merge_policy(Box::new(NoMergePolicy));
        index_writer.add_document(doc!(
            date_field => crate::DateTime::from_u64(1i64.to_u64()),
            multi_date_field => crate::DateTime::from_u64(2i64.to_u64()),
            multi_date_field => crate::DateTime::from_u64(3i64.to_u64())
        ));
        ))?;
        index_writer.add_document(doc!(
            date_field => crate::DateTime::from_u64(4i64.to_u64())
        ));
        ))?;
        index_writer.add_document(doc!(
            multi_date_field => crate::DateTime::from_u64(5i64.to_u64()),
            multi_date_field => crate::DateTime::from_u64(6i64.to_u64())
        ));
        index_writer.commit().unwrap();
        let reader = index.reader().unwrap();
        ))?;
        index_writer.commit()?;
        let reader = index.reader()?;
        let searcher = reader.searcher();
        assert_eq!(searcher.segment_readers().len(), 1);
        let segment_reader = searcher.segment_reader(0);
@@ -580,6 +581,7 @@ mod tests {
        assert_eq!(dates[0].timestamp(), 5i64);
        assert_eq!(dates[1].timestamp(), 6i64);
        }
        Ok(())
    }
}

@@ -588,7 +590,7 @@ mod bench {
    use super::tests::FIELD;
    use super::tests::{generate_permutation, SCHEMA};
    use super::*;
    use crate::common::CompositeFile;
    use crate::directory::CompositeFile;
    use crate::directory::{Directory, RamDirectory, WritePtr};
    use crate::fastfield::FastFieldReader;
    use std::collections::HashMap;

@@ -8,17 +8,25 @@ pub use self::writer::MultiValuedFastFieldWriter;
mod tests {

    use crate::collector::TopDocs;
    use crate::indexer::NoMergePolicy;
    use crate::query::QueryParser;
    use crate::schema::Cardinality;
    use crate::schema::Facet;
    use crate::schema::IntOptions;
    use crate::schema::Schema;
    use crate::schema::INDEXED;
    use crate::Document;
    use crate::Index;
    use crate::Term;
    use chrono::Duration;
    use futures::executor::block_on;
    use proptest::prop_oneof;
    use proptest::proptest;
    use proptest::strategy::Strategy;
    use test_env_log::test;

    #[test]
    fn test_multivalued_u64() {
    fn test_multivalued_u64() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_u64_field(
            "multifield",
@@ -26,17 +34,17 @@ mod tests {
        );
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(field=>1u64, field=>3u64));
        index_writer.add_document(doc!());
        index_writer.add_document(doc!(field=>4u64));
        index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
        assert!(index_writer.commit().is_ok());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(field=>1u64, field=>3u64))?;
        index_writer.add_document(doc!())?;
        index_writer.add_document(doc!(field=>4u64))?;
        index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64))?;
        index_writer.commit()?;

        let searcher = index.reader().unwrap().searcher();
        let searcher = index.reader()?.searcher();
        let segment_reader = searcher.segment_reader(0);
        let mut vals = Vec::new();
        let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
        let multi_value_reader = segment_reader.fast_fields().u64s(field)?;
        {
            multi_value_reader.get_vals(2, &mut vals);
            assert_eq!(&vals, &[4u64]);
@@ -49,10 +57,11 @@ mod tests {
            multi_value_reader.get_vals(1, &mut vals);
            assert!(vals.is_empty());
        }
        Ok(())
    }

    #[test]
    fn test_multivalued_date() {
    fn test_multivalued_date() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let date_field = schema_builder.add_date_field(
            "multi_date_field",
@@ -65,40 +74,37 @@ mod tests {
        schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer = index.writer_for_tests()?;
        let first_time_stamp = chrono::Utc::now();
        index_writer.add_document(
            doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
        );
        index_writer.add_document(doc!(time_i=>0i64));
        )?;
        index_writer.add_document(doc!(time_i=>0i64))?;
        // add one second
        index_writer
            .add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
        index_writer.add_document(
            doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64),
        )?;
        // add another second
        let two_secs_ahead = first_time_stamp + Duration::seconds(2);
        index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
        index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64))?;
        // add three seconds
        index_writer
            .add_document(doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64));
        assert!(index_writer.commit().is_ok());
        index_writer.add_document(
            doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64),
        )?;
        index_writer.commit()?;

        let reader = index.reader().unwrap();
        let reader = index.reader()?;
        let searcher = reader.searcher();
        let reader = searcher.segment_reader(0);
        assert_eq!(reader.num_docs(), 5);

        {
            let parser = QueryParser::for_index(&index, vec![date_field]);
            let query = parser
                .parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))
                .expect("could not parse query");
            let results = searcher
                .search(&query, &TopDocs::with_limit(5))
                .expect("could not query index");

            let query = parser.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))?;
            let results = searcher.search(&query, &TopDocs::with_limit(5))?;
            assert_eq!(results.len(), 1);
            for (_score, doc_address) in results {
                let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
                let retrieved_doc = searcher.doc(doc_address)?;
                assert_eq!(
                    retrieved_doc
                        .get_first(date_field)
@@ -120,12 +126,8 @@ mod tests {

        {
            let parser = QueryParser::for_index(&index, vec![date_field]);
            let query = parser
                .parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))
                .expect("could not parse query");
            let results = searcher
                .search(&query, &TopDocs::with_limit(5))
                .expect("could not query index");
            let query = parser.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))?;
            let results = searcher.search(&query, &TopDocs::with_limit(5))?;

            assert_eq!(results.len(), 1);

@@ -157,10 +159,8 @@ mod tests {
                (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
                (first_time_stamp + Duration::seconds(3)).to_rfc3339()
            );
            let query = parser.parse_query(&range_q).expect("could not parse query");
            let results = searcher
                .search(&query, &TopDocs::with_limit(5))
                .expect("could not query index");
            let query = parser.parse_query(&range_q)?;
            let results = searcher.search(&query, &TopDocs::with_limit(5))?;

            assert_eq!(results.len(), 2);
            for (i, doc_pair) in results.iter().enumerate() {
@@ -188,16 +188,16 @@ mod tests {
                    retrieved_doc
                        .get_first(time_i)
                        .expect("cannot find value")
                        .i64_value()
                        .expect("value not of i64 type"),
                    time_i_val
                        .i64_value(),
                    Some(time_i_val)
                );
            }
        }
        Ok(())
    }

    #[test]
    fn test_multivalued_i64() {
    fn test_multivalued_i64() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_i64_field(
            "multifield",
@@ -205,14 +205,14 @@ mod tests {
        );
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(field=> 1i64, field => 3i64));
        index_writer.add_document(doc!());
        index_writer.add_document(doc!(field=> -4i64));
        index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
        assert!(index_writer.commit().is_ok());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(field=> 1i64, field => 3i64))?;
        index_writer.add_document(doc!())?;
        index_writer.add_document(doc!(field=> -4i64))?;
        index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64))?;
        index_writer.commit()?;

        let searcher = index.reader().unwrap().searcher();
        let searcher = index.reader()?.searcher();
        let segment_reader = searcher.segment_reader(0);
        let mut vals = Vec::new();
        let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
@@ -224,18 +224,125 @@ mod tests {
        assert!(vals.is_empty());
        multi_value_reader.get_vals(3, &mut vals);
        assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
        Ok(())
    }

    fn test_multivalued_no_panic(ops: &[IndexingOp]) -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_u64_field(
            "multifield",
            IntOptions::default()
                .set_fast(Cardinality::MultiValues)
                .set_indexed(),
        );
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests()?;
        index_writer.set_merge_policy(Box::new(NoMergePolicy));

        for &op in ops {
            match op {
                IndexingOp::AddDoc { id } => {
                    match id % 3 {
                        0 => {
                            index_writer.add_document(doc!())?;
                        }
                        1 => {
                            let mut doc = Document::new();
                            for _ in 0..5001 {
                                doc.add_u64(field, id as u64);
                            }
                            index_writer.add_document(doc)?;
                        }
                        _ => {
                            let mut doc = Document::new();
                            doc.add_u64(field, id as u64);
                            index_writer.add_document(doc)?;
                        }
                    };
                }
                IndexingOp::DeleteDoc { id } => {
                    index_writer.delete_term(Term::from_field_u64(field, id as u64));
                }
                IndexingOp::Commit => {
                    index_writer.commit().unwrap();
                }
                IndexingOp::Merge => {
                    let segment_ids = index.searchable_segment_ids()?;
                    if segment_ids.len() >= 2 {
                        block_on(index_writer.merge(&segment_ids))?;
                        index_writer.segment_updater().wait_merging_thread()?;
                    }
                }
            }
        }

        index_writer.commit()?;

        // Merging the segments
        {
            let segment_ids = index
                .searchable_segment_ids()
                .expect("Searchable segments failed.");
            if !segment_ids.is_empty() {
                block_on(index_writer.merge(&segment_ids)).unwrap();
                assert!(index_writer.wait_merging_threads().is_ok());
            }
        }
        Ok(())
    }

    #[derive(Debug, Clone, Copy)]
    enum IndexingOp {
        AddDoc { id: u32 },
        DeleteDoc { id: u32 },
        Commit,
        Merge,
    }

    fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
        prop_oneof![
            (0u32..10u32).prop_map(|id| IndexingOp::DeleteDoc { id }),
            (0u32..10u32).prop_map(|id| IndexingOp::AddDoc { id }),
            (0u32..2u32).prop_map(|_| IndexingOp::Commit),
            (0u32..1u32).prop_map(|_| IndexingOp::Merge),
        ]
    }

    proptest! {
        #[test]
        fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
            assert!(test_multivalued_no_panic(&ops[..]).is_ok());
        }
    }

    #[test]
    fn test_multivalued_proptest_off_by_one_bug_1151() {
        use IndexingOp::*;
        let ops = [
            AddDoc { id: 3 },
            AddDoc { id: 1 },
            AddDoc { id: 3 },
            Commit,
            Merge,
        ];

        assert!(test_multivalued_no_panic(&ops[..]).is_ok());
    }

    #[test]
    #[ignore]
    fn test_many_facets() {
    fn test_many_facets() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let field = schema_builder.add_facet_field("facetfield", INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
        let mut index_writer = index.writer_for_tests().unwrap();
        let mut index_writer = index.writer_for_tests()?;
        for i in 0..100_000 {
            index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
            index_writer
                .add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())))?;
        }
        assert!(index_writer.commit().is_ok());
        index_writer.commit()?;
        Ok(())
    }
}

|
||||
use crate::schema::{Cardinality, Facet, IntOptions, Schema, INDEXED};
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() {
|
||||
fn test_multifastfield_reader() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facets", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index
|
||||
.writer_for_tests()
|
||||
.expect("Failed to create index writer.");
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from("/category/cat2"),
|
||||
facet_field => Facet::from("/category/cat1"),
|
||||
));
|
||||
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
|
||||
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
|
||||
index_writer.commit().expect("Commit failed");
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
))?;
|
||||
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")))?;
|
||||
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
|
||||
let mut facet_reader = segment_reader.facet_reader(facet_field)?;
|
||||
|
||||
let mut facet = Facet::root();
|
||||
{
|
||||
@@ -145,10 +143,11 @@ mod tests {
|
||||
facet_reader.facet_ords(2, &mut vals);
|
||||
assert_eq!(&vals[..], &[4]);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader_min_max() {
|
||||
fn test_multifastfield_reader_min_max() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field_options = IntOptions::default()
|
||||
.set_indexed()
|
||||
@@ -163,15 +162,16 @@ mod tests {
|
||||
item_field => 2i64,
|
||||
item_field => 3i64,
|
||||
item_field => -2i64,
|
||||
));
|
||||
index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64));
|
||||
index_writer.add_document(doc!(item_field => 4i64));
|
||||
index_writer.commit().expect("Commit failed");
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
))?;
|
||||
index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64))?;
|
||||
index_writer.add_document(doc!(item_field => 4i64))?;
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let field_reader = segment_reader.fast_fields().i64s(item_field).unwrap();
|
||||
let field_reader = segment_reader.fast_fields().i64s(item_field)?;
|
||||
|
||||
assert_eq!(field_reader.min_value(), -2);
|
||||
assert_eq!(field_reader.max_value(), 6);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use super::FastValue;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
@@ -8,6 +7,7 @@ use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use crate::DocId;
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::fastfield::MultiValuedFastFieldReader;
|
||||
use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeWrite;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::directory::CompositeWrite;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
@@ -105,9 +105,7 @@ impl CompositeFastFieldSerializer {
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
if let Some(broken_estimation) = estimations
|
||||
.iter()
|
||||
.find(|estimation| estimation.0 == f32::NAN)
|
||||
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
|
||||
{
|
||||
warn!(
|
||||
"broken estimation for fast field codec {}",
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
use super::multivalued::MultiValuedFastFieldWriter;
|
||||
use super::serializer::FastFieldStats;
|
||||
use super::FastFieldDataAccess;
|
||||
use crate::common;
|
||||
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use common;
|
||||
use fnv::FnvHashMap;
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::schema::Field;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::common::CompositeWrite;
|
||||
use crate::directory::CompositeWrite;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use std::io;
|
||||
|
||||
@@ -1,4 +1,8 @@
use crate::schema;
use crate::Index;
use crate::IndexSettings;
use crate::IndexSortByField;
use crate::Order;
use crate::Searcher;
use crate::{doc, schema::*};
use rand::thread_rng;
@@ -35,7 +39,7 @@ fn test_functional_store() -> crate::Result<()> {
    let mut doc_set: Vec<u64> = Vec::new();

    let mut doc_id = 0u64;
    for iteration in 0..500 {
    for iteration in 0..get_num_iterations() {
        dbg!(iteration);
        let num_docs: usize = rng.gen_range(0..4);
        if !doc_set.is_empty() {
@@ -45,7 +49,7 @@ fn test_functional_store() -> crate::Result<()> {
        }
        for _ in 0..num_docs {
            doc_set.push(doc_id);
            index_writer.add_document(doc!(id_field=>doc_id));
            index_writer.add_document(doc!(id_field=>doc_id))?;
            doc_id += 1;
        }
        index_writer.commit()?;
@@ -56,16 +60,37 @@ fn test_functional_store() -> crate::Result<()> {
    Ok(())
}

fn get_num_iterations() -> usize {
    std::env::var("NUM_FUNCTIONAL_TEST_ITERATIONS")
        .map(|str| str.parse().unwrap())
        .unwrap_or(2000)
}
#[test]
#[ignore]
fn test_functional_indexing() -> crate::Result<()> {
fn test_functional_indexing_sorted() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();

    let id_field = schema_builder.add_u64_field("id", INDEXED);
    let id_field = schema_builder.add_u64_field("id", INDEXED | FAST);
    let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
    let text_field_options = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
        )
        .set_stored();
    let text_field = schema_builder.add_text_field("text_field", text_field_options);
    let schema = schema_builder.build();

    let index = Index::create_from_tempdir(schema)?;
    let mut index_builder = Index::builder().schema(schema);
    index_builder = index_builder.settings(IndexSettings {
        sort_by_field: Some(IndexSortByField {
            field: "id".to_string(),
            order: Order::Desc,
        }),
        ..Default::default()
    });
    let index = index_builder.create_from_tempdir().unwrap();
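The same builder pattern works for an in-RAM index, which keeps experiments cheap. A hedged sketch, assuming `IndexBuilder::create_in_ram` exists alongside the `create_from_tempdir` used above:

// Hedged sketch: the sorted-index setup from the test above, but in RAM.
use tantivy::schema::{Schema, FAST, INDEXED};
use tantivy::{Index, IndexSettings, IndexSortByField, Order};

fn sorted_index_in_ram() -> tantivy::Result<Index> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_u64_field("id", INDEXED | FAST);
    let settings = IndexSettings {
        sort_by_field: Some(IndexSortByField {
            field: "id".to_string(),
            order: Order::Desc, // highest id first, as in the test
        }),
        ..Default::default()
    };
    Index::builder()
        .schema(schema_builder.build())
        .settings(settings)
        .create_in_ram()
}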

    let reader = index.reader()?;

    let mut rng = thread_rng();
@@ -75,7 +100,7 @@ fn test_functional_indexing() -> crate::Result<()> {
    let mut committed_docs: HashSet<u64> = HashSet::new();
    let mut uncommitted_docs: HashSet<u64> = HashSet::new();

    for _ in 0..200 {
    for _ in 0..get_num_iterations() {
        let random_val = rng.gen_range(0..20);
        if random_val == 0 {
            index_writer.commit()?;
@@ -98,7 +123,85 @@ fn test_functional_indexing() -> crate::Result<()> {
            for i in 1u64..10u64 {
                doc.add_u64(multiples_field, random_val * i);
            }
            index_writer.add_document(doc);
            doc.add_text(text_field, get_text());
            index_writer.add_document(doc)?;
        }
    }
    Ok(())
}

const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
                     do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
                     Ut enim ad minim veniam, quis nostrud exercitation ullamco \
                     laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
                     dolor in reprehenderit in voluptate velit esse cillum dolore eu \
                     fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
                     proident, sunt in culpa qui officia deserunt mollit anim id est \
                     laborum.";
fn get_text() -> String {
    use rand::seq::SliceRandom;
    let mut rng = thread_rng();
    let tokens: Vec<_> = LOREM.split(' ').collect();
    let random_val = rng.gen_range(0..20);

    (0..random_val)
        .map(|_| tokens.choose(&mut rng).unwrap())
        .cloned()
        .collect::<Vec<_>>()
        .join(" ")
}

#[test]
#[ignore]
fn test_functional_indexing_unsorted() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();

    let id_field = schema_builder.add_u64_field("id", INDEXED);
    let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
    let text_field_options = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
        )
        .set_stored();
    let text_field = schema_builder.add_text_field("text_field", text_field_options);
    let schema = schema_builder.build();

    let index = Index::create_from_tempdir(schema)?;
    let reader = index.reader()?;

    let mut rng = thread_rng();

    let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;

    let mut committed_docs: HashSet<u64> = HashSet::new();
    let mut uncommitted_docs: HashSet<u64> = HashSet::new();

    for _ in 0..get_num_iterations() {
        let random_val = rng.gen_range(0..20);
        if random_val == 0 {
            index_writer.commit()?;
            committed_docs.extend(&uncommitted_docs);
            uncommitted_docs.clear();
            reader.reload()?;
            let searcher = reader.searcher();
            // check that everything is correct.
            check_index_content(
                &searcher,
                &committed_docs.iter().cloned().collect::<Vec<u64>>(),
            )?;
        } else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
            let doc_id_term = Term::from_field_u64(id_field, random_val);
            index_writer.delete_term(doc_id_term);
        } else {
            uncommitted_docs.insert(random_val);
            let mut doc = Document::new();
            doc.add_u64(id_field, random_val);
            for i in 1u64..10u64 {
                doc.add_u64(multiples_field, random_val * i);
            }
            doc.add_text(text_field, get_text());
            index_writer.add_document(doc)?;
        }
    }
    Ok(())

324
src/indexer/demuxer.rs
Normal file
@@ -0,0 +1,324 @@
use common::BitSet;
use itertools::Itertools;

use crate::fastfield::AliveBitSet;
use crate::{merge_filtered_segments, Directory, Index, IndexSettings, Segment, SegmentOrdinal};
/// DemuxMapping can be used to reorganize data from multiple segments.
///
/// DemuxMapping is useful in multitenant settings, in which each document might actually belong to a different tenant.
/// It allows documents to be reorganized as follows:
///
/// e.g. if you have two tenant ids TENANT_A and TENANT_B and two segments with
/// the documents (simplified)
/// Seg 1 [TENANT_A, TENANT_B]
/// Seg 2 [TENANT_A, TENANT_B]
///
/// You may want to group your documents to
/// Seg 1 [TENANT_A, TENANT_A]
/// Seg 2 [TENANT_B, TENANT_B]
///
/// Demuxing is the tool for that.
/// Semantically you can define a mapping from [old segment ordinal, old doc_id] -> [new segment ordinal].
#[derive(Debug, Default)]
pub struct DemuxMapping {
    /// [index old segment ordinal] -> [index doc_id] = new segment ordinal
    mapping: Vec<DocIdToSegmentOrdinal>,
}

/// DocIdToSegmentOrdinal maps from doc_id within a segment to the new segment ordinal for demuxing.
///
/// For every source segment there is a `DocIdToSegmentOrdinal` to distribute its doc_ids.
#[derive(Debug, Default)]
pub struct DocIdToSegmentOrdinal {
    doc_id_index_to_segment_ord: Vec<SegmentOrdinal>,
}

impl DocIdToSegmentOrdinal {
    /// Creates a new DocIdToSegmentOrdinal with a size of max_doc doc_ids.
    /// Initially all doc_ids point to segment ordinal 0 and need to be set
    /// via the `set` method.
    pub fn with_max_doc(max_doc: usize) -> Self {
        DocIdToSegmentOrdinal {
            doc_id_index_to_segment_ord: vec![0; max_doc],
        }
    }

    /// Returns the number of documents in this mapping.
    /// It should be equal to the `max_doc` of the segment it targets.
    pub fn max_doc(&self) -> u32 {
        self.doc_id_index_to_segment_ord.len() as u32
    }

    /// Associates a doc_id with an output `SegmentOrdinal`.
    pub fn set(&mut self, doc_id: u32, segment_ord: SegmentOrdinal) {
        self.doc_id_index_to_segment_ord[doc_id as usize] = segment_ord;
    }

    /// Iterates over the new SegmentOrdinal in the order of the doc_id.
    pub fn iter(&self) -> impl Iterator<Item = SegmentOrdinal> + '_ {
        self.doc_id_index_to_segment_ord.iter().cloned()
    }
}

impl DemuxMapping {
    /// Adds a DocIdToSegmentOrdinal. The order of the push calls
    /// defines the old segment ordinal. e.g. first push = ordinal 0.
    pub fn add(&mut self, segment_mapping: DocIdToSegmentOrdinal) {
        self.mapping.push(segment_mapping);
    }

    /// Returns the old number of segments.
    pub fn get_old_num_segments(&self) -> usize {
        self.mapping.len()
    }
}

fn docs_for_segment_ord(
    doc_id_to_segment_ord: &DocIdToSegmentOrdinal,
    target_segment_ord: SegmentOrdinal,
) -> AliveBitSet {
    let mut bitset = BitSet::with_max_value(doc_id_to_segment_ord.max_doc());
    for doc_id in doc_id_to_segment_ord
        .iter()
        .enumerate()
        .filter(|(_doc_id, new_segment_ord)| *new_segment_ord == target_segment_ord)
        .map(|(doc_id, _)| doc_id)
    {
        // add document if segment ordinal = target segment ordinal
        bitset.insert(doc_id as u32);
    }
    AliveBitSet::from_bitset(&bitset)
}

fn get_alive_bitsets(
    demux_mapping: &DemuxMapping,
    target_segment_ord: SegmentOrdinal,
) -> Vec<AliveBitSet> {
    demux_mapping
        .mapping
        .iter()
        .map(|doc_id_to_segment_ord| {
            docs_for_segment_ord(doc_id_to_segment_ord, target_segment_ord)
        })
        .collect_vec()
}

/// Demux the segments according to `demux_mapping`. See `DemuxMapping`.
|
||||
/// The number of output_directories need to match max new segment ordinal from `demux_mapping`.
|
||||
///
|
||||
/// The ordinal of `segments` need to match the ordinals provided in `demux_mapping`.
|
||||
pub fn demux(
|
||||
segments: &[Segment],
|
||||
demux_mapping: &DemuxMapping,
|
||||
target_settings: IndexSettings,
|
||||
output_directories: Vec<Box<dyn Directory>>,
|
||||
) -> crate::Result<Vec<Index>> {
|
||||
let mut indices = vec![];
|
||||
for (target_segment_ord, output_directory) in output_directories.into_iter().enumerate() {
|
||||
let delete_bitsets = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
|
||||
.into_iter()
|
||||
.map(Some)
|
||||
.collect_vec();
|
||||
let index = merge_filtered_segments(
|
||||
segments,
|
||||
target_settings.clone(),
|
||||
delete_bitsets,
|
||||
output_directory,
|
||||
)?;
|
||||
indices.push(index);
|
||||
}
|
||||
Ok(indices)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
collector::TopDocs,
|
||||
directory::RamDirectory,
|
||||
query::QueryParser,
|
||||
schema::{Schema, TEXT},
|
||||
DocAddress, Term,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_demux_map_to_deletebitset() {
|
||||
let max_value = 2;
|
||||
let mut demux_mapping = DemuxMapping::default();
|
||||
//segment ordinal 0 mapping
|
||||
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
|
||||
doc_id_to_segment.set(0, 1);
|
||||
doc_id_to_segment.set(1, 0);
|
||||
demux_mapping.add(doc_id_to_segment);
|
||||
|
||||
//segment ordinal 1 mapping
|
||||
let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
|
||||
doc_id_to_segment.set(0, 1);
|
||||
doc_id_to_segment.set(1, 1);
|
||||
demux_mapping.add(doc_id_to_segment);
|
||||
{
|
||||
let bit_sets_for_demuxing_to_segment_ord_0 = get_alive_bitsets(&demux_mapping, 0);
|
||||
|
||||
assert_eq!(
|
||||
bit_sets_for_demuxing_to_segment_ord_0[0].is_deleted(0),
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
bit_sets_for_demuxing_to_segment_ord_0[0].is_deleted(1),
|
||||
false
|
||||
);
|
||||
assert_eq!(
|
||||
bit_sets_for_demuxing_to_segment_ord_0[1].is_deleted(0),
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
bit_sets_for_demuxing_to_segment_ord_0[1].is_deleted(1),
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
{
|
||||
            let bit_sets_for_demuxing_to_segment_ord_1 = get_alive_bitsets(&demux_mapping, 1);

            assert_eq!(
                bit_sets_for_demuxing_to_segment_ord_1[0].is_deleted(0),
                false
            );
            assert_eq!(
                bit_sets_for_demuxing_to_segment_ord_1[0].is_deleted(1),
                true
            );
            assert_eq!(
                bit_sets_for_demuxing_to_segment_ord_1[1].is_deleted(0),
                false
            );
            assert_eq!(
                bit_sets_for_demuxing_to_segment_ord_1[1].is_deleted(1),
                false
            );
        }
    }

    #[test]
    fn test_demux_segments() -> crate::Result<()> {
        let first_index = {
            let mut schema_builder = Schema::builder();
            let text_field = schema_builder.add_text_field("text", TEXT);
            let index = Index::create_in_ram(schema_builder.build());
            let mut index_writer = index.writer_for_tests()?;
            index_writer.add_document(doc!(text_field=>"texto1"))?;
            index_writer.add_document(doc!(text_field=>"texto2"))?;
            index_writer.commit()?;
            index
        };

        let second_index = {
            let mut schema_builder = Schema::builder();
            let text_field = schema_builder.add_text_field("text", TEXT);
            let index = Index::create_in_ram(schema_builder.build());
            let mut index_writer = index.writer_for_tests()?;
            index_writer.add_document(doc!(text_field=>"texto3"))?;
            index_writer.add_document(doc!(text_field=>"texto4"))?;
            index_writer.delete_term(Term::from_field_text(text_field, "4"));

            index_writer.commit()?;
            index
        };

        let mut segments: Vec<Segment> = Vec::new();
        segments.extend(first_index.searchable_segments()?);
        segments.extend(second_index.searchable_segments()?);

        let target_settings = first_index.settings().clone();

        let mut demux_mapping = DemuxMapping::default();
        {
            let max_value = 2;
            // segment ordinal 0 mapping
            let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
            doc_id_to_segment.set(0, 1);
            doc_id_to_segment.set(1, 0);
            demux_mapping.add(doc_id_to_segment);

            // segment ordinal 1 mapping
            let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
            doc_id_to_segment.set(0, 1);
            doc_id_to_segment.set(1, 1);
            demux_mapping.add(doc_id_to_segment);
        }
        assert_eq!(demux_mapping.get_old_num_segments(), 2);

        let demuxed_indices = demux(
            &segments,
            &demux_mapping,
            target_settings,
            vec![
                Box::new(RamDirectory::default()),
                Box::new(RamDirectory::default()),
            ],
        )?;

        {
            let index = &demuxed_indices[0];

            let segments = index.searchable_segments()?;
            assert_eq!(segments.len(), 1);

            let segment_metas = segments[0].meta();
            assert_eq!(segment_metas.num_deleted_docs(), 0);
            assert_eq!(segment_metas.num_docs(), 1);

            let searcher = index.reader().unwrap().searcher();
            {
                let text_field = index.schema().get_field("text").unwrap();

                let do_search = |term: &str| {
                    let query = QueryParser::for_index(&index, vec![text_field])
                        .parse_query(term)
                        .unwrap();
                    let top_docs: Vec<(f32, DocAddress)> =
                        searcher.search(&query, &TopDocs::with_limit(3)).unwrap();

                    top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
                };

                assert_eq!(do_search("texto1"), vec![] as Vec<u32>);
                assert_eq!(do_search("texto2"), vec![0]);
            }
        }

        {
            let index = &demuxed_indices[1];

            let segments = index.searchable_segments()?;
            assert_eq!(segments.len(), 1);

            let segment_metas = segments[0].meta();
            assert_eq!(segment_metas.num_deleted_docs(), 0);
            assert_eq!(segment_metas.num_docs(), 3);

            let searcher = index.reader().unwrap().searcher();
            {
                let text_field = index.schema().get_field("text").unwrap();

                let do_search = |term: &str| {
                    let query = QueryParser::for_index(&index, vec![text_field])
                        .parse_query(term)
                        .unwrap();
                    let top_docs: Vec<(f32, DocAddress)> =
                        searcher.search(&query, &TopDocs::with_limit(3)).unwrap();

                    top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
                };

                assert_eq!(do_search("texto1"), vec![0]);
                assert_eq!(do_search("texto2"), vec![] as Vec<u32>);
                assert_eq!(do_search("texto3"), vec![1]);
                assert_eq!(do_search("texto4"), vec![2]);
            }
        }

        Ok(())
    }
}
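The demux test above drives DocIdToSegmentOrdinal/DemuxMapping end to end. A minimal sketch of the routing such a mapping expresses, using only std types (the names and shapes below are illustrative, not tantivy's):

    // Each old segment carries a table: doc_id -> target segment ordinal.
    // Demuxing groups every (old_segment, doc_id) pair under its target.
    fn route(mapping: &[Vec<u32>]) -> Vec<Vec<(usize, u32)>> {
        let num_targets = mapping
            .iter()
            .flatten()
            .copied()
            .max()
            .map_or(0, |m| m as usize + 1);
        let mut targets = vec![Vec::new(); num_targets];
        for (old_segment, docs) in mapping.iter().enumerate() {
            for (doc_id, &target) in docs.iter().enumerate() {
                targets[target as usize].push((old_segment, doc_id as u32));
            }
        }
        targets
    }

    fn main() {
        // segment 0: doc0 -> split 1, doc1 -> split 0; segment 1: both docs -> split 1
        let mapping = vec![vec![1, 0], vec![1, 1]];
        assert_eq!(route(&mapping)[0], vec![(0usize, 1u32)]);
        assert_eq!(route(&mapping)[1], vec![(0, 0), (1, 0), (1, 1)]);
    }

This mirrors the mapping built in the test: target split 0 receives one document, split 1 receives three.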
@@ -2,23 +2,23 @@
//! to get mappings from old doc_id to new doc_id and vice versa, after sorting
//!

use super::{merger::SegmentReaderWithOrdinal, SegmentWriter};
use super::SegmentWriter;
use crate::{
    schema::{Field, Schema},
    DocId, IndexSortByField, Order, TantivyError,
    DocId, IndexSortByField, Order, SegmentOrdinal, TantivyError,
};
use std::{cmp::Reverse, ops::Index};

/// Struct to provide mapping from new doc_id to old doc_id and segment.
#[derive(Clone)]
pub(crate) struct SegmentDocidMapping<'a> {
    new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
pub(crate) struct SegmentDocIdMapping {
    new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
    is_trivial: bool,
}

impl<'a> SegmentDocidMapping<'a> {
impl SegmentDocIdMapping {
    pub(crate) fn new(
        new_doc_id_to_old_and_segment: Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
        new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
        is_trivial: bool,
    ) -> Self {
        Self {
@@ -26,7 +26,7 @@ impl<'a> SegmentDocidMapping<'a> {
            is_trivial,
        }
    }
    pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentReaderWithOrdinal)> {
    pub(crate) fn iter(&self) -> impl Iterator<Item = &(DocId, SegmentOrdinal)> {
        self.new_doc_id_to_old_and_segment.iter()
    }
    pub(crate) fn len(&self) -> usize {
@@ -40,15 +40,15 @@ impl<'a> SegmentDocidMapping<'a> {
        self.is_trivial
    }
}
impl<'a> Index<usize> for SegmentDocidMapping<'a> {
    type Output = (DocId, SegmentReaderWithOrdinal<'a>);
impl Index<usize> for SegmentDocIdMapping {
    type Output = (DocId, SegmentOrdinal);

    fn index(&self, idx: usize) -> &Self::Output {
        &self.new_doc_id_to_old_and_segment[idx]
    }
}
impl<'a> IntoIterator for SegmentDocidMapping<'a> {
    type Item = (DocId, SegmentReaderWithOrdinal<'a>);
impl IntoIterator for SegmentDocIdMapping {
    type Item = (DocId, SegmentOrdinal);
    type IntoIter = std::vec::IntoIter<Self::Item>;

    fn into_iter(self) -> Self::IntoIter {
@@ -146,7 +146,7 @@ mod tests_indexsorting {
    fn create_test_index(
        index_settings: Option<IndexSettings>,
        text_field_options: TextOptions,
    ) -> Index {
    ) -> crate::Result<Index> {
        let mut schema_builder = Schema::builder();

        let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
@@ -166,19 +166,20 @@ mod tests_indexsorting {
        if let Some(settings) = index_settings {
            index_builder = index_builder.settings(settings);
        }
        let index = index_builder.create_in_ram().unwrap();
        let index = index_builder.create_in_ram()?;

        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(my_number=>40_u64));
        index_writer
            .add_document(doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64));
        index_writer.add_document(doc!(my_number=>100_u64));
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(my_number=>40_u64))?;
        index_writer.add_document(
            doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64),
        )?;
        index_writer.add_document(doc!(my_number=>100_u64))?;
        index_writer.add_document(
            doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"),
        );
        index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ));
        index_writer.commit().unwrap();
        index
        )?;
        index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ))?;
        index_writer.commit()?;
        Ok(index)
    }
    fn get_text_options() -> TextOptions {
        TextOptions::default().set_indexing_options(
@@ -203,7 +204,7 @@ mod tests_indexsorting {
        for option in options {
            //let options = get_text_options();
            // no index_sort
            let index = create_test_index(None, option.clone());
            let index = create_test_index(None, option.clone())?;
            let my_text_field = index.schema().get_field("text_field").unwrap();
            let searcher = index.reader()?.searcher();

@@ -225,7 +226,7 @@ mod tests_indexsorting {
                ..Default::default()
            }),
            option.clone(),
        );
        )?;
        let my_text_field = index.schema().get_field("text_field").unwrap();
        let reader = index.reader()?;
        let searcher = reader.searcher();
@@ -257,7 +258,7 @@ mod tests_indexsorting {
                ..Default::default()
            }),
            option.clone(),
        );
        )?;
        let my_string_field = index.schema().get_field("text_field").unwrap();
        let searcher = index.reader()?.searcher();

@@ -287,7 +288,7 @@ mod tests_indexsorting {
    #[test]
    fn test_sort_index_get_documents() -> crate::Result<()> {
        // default baseline
        let index = create_test_index(None, get_text_options());
        let index = create_test_index(None, get_text_options())?;
        let my_string_field = index.schema().get_field("string_field").unwrap();
        let searcher = index.reader()?.searcher();
        {
@@ -316,7 +317,7 @@ mod tests_indexsorting {
                ..Default::default()
            }),
            get_text_options(),
        );
        )?;
        let my_string_field = index.schema().get_field("string_field").unwrap();
        let searcher = index.reader()?.searcher();
        {
@@ -341,7 +342,7 @@ mod tests_indexsorting {
                ..Default::default()
            }),
            get_text_options(),
        );
        )?;
        let my_string_field = index.schema().get_field("string_field").unwrap();
        let searcher = index.reader()?.searcher();
        {
@@ -356,7 +357,7 @@ mod tests_indexsorting {

    #[test]
    fn test_sort_index_test_string_field() -> crate::Result<()> {
        let index = create_test_index(None, get_text_options());
        let index = create_test_index(None, get_text_options())?;
        let my_string_field = index.schema().get_field("string_field").unwrap();
        let searcher = index.reader()?.searcher();

@@ -376,7 +377,7 @@ mod tests_indexsorting {
                ..Default::default()
            }),
            get_text_options(),
        );
        )?;
        let my_string_field = index.schema().get_field("string_field").unwrap();
        let reader = index.reader()?;
        let searcher = reader.searcher();
@@ -407,7 +408,7 @@ mod tests_indexsorting {
                ..Default::default()
            }),
            get_text_options(),
        );
        )?;
        let my_string_field = index.schema().get_field("string_field").unwrap();
        let searcher = index.reader()?.searcher();

@@ -443,7 +444,7 @@ mod tests_indexsorting {
                ..Default::default()
            }),
            get_text_options(),
        );
        )?;
        assert_eq!(
            index.settings().sort_by_field.as_ref().unwrap().field,
            "my_number".to_string()
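The refactor above swaps the borrowed SegmentReaderWithOrdinal<'a> for a plain SegmentOrdinal index, which turns the mapping into an owned, lifetime-free table. A self-contained sketch of the resulting shape (the type aliases are assumptions for illustration, not taken from the diff):

    type SegmentOrdinal = u32;
    type DocId = u32;

    struct SegmentDocIdMapping {
        // The new doc id is implicit: it is the position in this vector.
        new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
    }

    impl SegmentDocIdMapping {
        fn old_address(&self, new_doc_id: usize) -> (DocId, SegmentOrdinal) {
            self.new_doc_id_to_old_and_segment[new_doc_id]
        }
    }

    fn main() {
        let mapping = SegmentDocIdMapping {
            new_doc_id_to_old_and_segment: vec![(1, 0), (0, 0), (0, 1)],
        };
        // New doc 2 comes from old doc 0 of segment 1.
        assert_eq!(mapping.old_address(2), (0, 1));
    }

Because the table holds a plain u32 ordinal instead of a reference, it can be cloned and moved across the merge pipeline without borrowing the segment readers.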
@@ -1,7 +1,6 @@
use super::operation::{AddOperation, UserOperation};
use super::segment_updater::SegmentUpdater;
use super::PreparedCommit;
use crate::common::BitSet;
use crate::core::Index;
use crate::core::Segment;
use crate::core::SegmentComponent;
@@ -12,9 +11,10 @@ use crate::directory::TerminatingWrite;
use crate::directory::{DirectoryLock, GarbageCollectionResult};
use crate::docset::{DocSet, TERMINATED};
use crate::error::TantivyError;
use crate::fastfield::write_delete_bitset;
use crate::fastfield::write_alive_bitset;
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
use crate::indexer::index_writer_status::IndexWriterStatus;
use crate::indexer::operation::DeleteOperation;
use crate::indexer::stamper::Stamper;
use crate::indexer::MergePolicy;
@@ -24,17 +24,18 @@ use crate::schema::Document;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Opstamp;
use common::BitSet;
use crossbeam::channel;
use futures::executor::block_on;
use futures::future::Future;
use smallvec::smallvec;
use smallvec::SmallVec;
use std::mem;
use std::ops::Range;
use std::sync::Arc;
use std::thread;
use std::thread::JoinHandle;

use super::{AddBatch, AddBatchReceiver, AddBatchSender};

// Size of the margin for the heap. A segment is closed when the remaining memory
// in the heap goes below MARGIN_IN_BYTES.
pub const MARGIN_IN_BYTES: usize = 1_000_000;
@@ -50,15 +51,12 @@ pub const MAX_NUM_THREAD: usize = 8;
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;

// Group of operations.
// Most of the time, users will send operations one by one, but it can be useful to
// send them as a small block to ensure that
// - all docs in the operation will happen on the same segment and continuous doc_ids.
// - all operations in the group are committed at the same time, making the group
// atomic.
type OperationGroup = SmallVec<[AddOperation; 4]>;
type OperationSender = channel::Sender<OperationGroup>;
type OperationReceiver = channel::Receiver<OperationGroup>;
fn error_in_index_worker_thread(context: &str) -> TantivyError {
    TantivyError::ErrorInThread(format!(
        "{}. A worker thread encountered an error (io::Error most likely) or panicked.",
        context
    ))
}

/// `IndexWriter` is the user entry-point to add documents to an index.
///
@@ -77,8 +75,8 @@ pub struct IndexWriter {

    workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,

    operation_receiver: OperationReceiver,
    operation_sender: OperationSender,
    index_writer_status: IndexWriterStatus,
    operation_sender: AddBatchSender,

    segment_updater: SegmentUpdater,

@@ -93,7 +91,7 @@ pub struct IndexWriter {
}

fn compute_deleted_bitset(
    delete_bitset: &mut BitSet,
    alive_bitset: &mut BitSet,
    segment_reader: &SegmentReader,
    delete_cursor: &mut DeleteCursor,
    doc_opstamps: &DocToOpstampMapping,
@@ -114,7 +112,7 @@ fn compute_deleted_bitset(
    let mut doc_matching_deleted_term = docset.doc();
    while doc_matching_deleted_term != TERMINATED {
        if doc_opstamps.is_deleted(doc_matching_deleted_term, delete_op.opstamp) {
            delete_bitset.insert(doc_matching_deleted_term);
            alive_bitset.remove(doc_matching_deleted_term);
            might_have_changed = true;
        }
        doc_matching_deleted_term = docset.advance();
@@ -141,7 +139,7 @@ pub(crate) fn advance_deletes(
        return Ok(());
    }

    if segment_entry.delete_bitset().is_none() && segment_entry.delete_cursor().get().is_none() {
    if segment_entry.alive_bitset().is_none() && segment_entry.delete_cursor().get().is_none() {
        // There has been no `DeleteOperation` between the segment status and `target_opstamp`.
        return Ok(());
    }
@@ -149,38 +147,32 @@ pub(crate) fn advance_deletes(
    let segment_reader = SegmentReader::open(&segment)?;

    let max_doc = segment_reader.max_doc();
    let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
        Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
        None => BitSet::with_max_value(max_doc),
    let mut alive_bitset: BitSet = match segment_entry.alive_bitset() {
        Some(previous_alive_bitset) => (*previous_alive_bitset).clone(),
        None => BitSet::with_max_value_and_full(max_doc),
    };

    let num_deleted_docs_before = segment.meta().num_deleted_docs();

    compute_deleted_bitset(
        &mut delete_bitset,
        &mut alive_bitset,
        &segment_reader,
        segment_entry.delete_cursor(),
        &DocToOpstampMapping::None,
        target_opstamp,
    )?;

    // TODO optimize
    // It should be possible to do something smarter by manipulating bitsets directly
    // to compute this union.
    if let Some(seg_delete_bitset) = segment_reader.delete_bitset() {
        for doc in 0u32..max_doc {
            if seg_delete_bitset.is_deleted(doc) {
                delete_bitset.insert(doc);
            }
        }
    if let Some(seg_alive_bitset) = segment_reader.alive_bitset() {
        alive_bitset.intersect_update(seg_alive_bitset.bitset());
    }

    let num_deleted_docs: u32 = delete_bitset.len() as u32;
    let num_alive_docs: u32 = alive_bitset.len() as u32;
    let num_deleted_docs = max_doc - num_alive_docs;
    if num_deleted_docs > num_deleted_docs_before {
        // There are new deletes. We need to write a new delete file.
        segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
        let mut delete_file = segment.open_write(SegmentComponent::Delete)?;
        write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
        write_alive_bitset(&alive_bitset, &mut delete_file)?;
        delete_file.terminate()?;
    }

@@ -191,10 +183,10 @@ pub(crate) fn advance_deletes(
fn index_documents(
    memory_budget: usize,
    segment: Segment,
    grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
    grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>,
    segment_updater: &mut SegmentUpdater,
    mut delete_cursor: DeleteCursor,
) -> crate::Result<bool> {
) -> crate::Result<()> {
    let schema = segment.schema();

    let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
@@ -213,7 +205,7 @@ fn index_documents(
    }

    if !segment_updater.is_alive() {
        return Ok(false);
        return Ok(());
    }

    let max_doc = segment_writer.max_doc();
@@ -226,21 +218,20 @@

    let segment_with_max_doc = segment.with_max_doc(max_doc);

    let delete_bitset_opt =
        apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;
    let alive_bitset_opt = apply_deletes(&segment_with_max_doc, &mut delete_cursor, &doc_opstamps)?;

    let meta = segment_with_max_doc.meta().clone();
    meta.untrack_temp_docstore();
    // update segment_updater inventory to remove tempstore
    let segment_entry = SegmentEntry::new(meta, delete_cursor, delete_bitset_opt);
    let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
    block_on(segment_updater.schedule_add_segment(segment_entry))?;
    Ok(true)
    Ok(())
}

/// `doc_opstamps` is required to be non-empty.
fn apply_deletes(
    segment: &Segment,
    mut delete_cursor: &mut DeleteCursor,
    delete_cursor: &mut DeleteCursor,
    doc_opstamps: &[Opstamp],
) -> crate::Result<Option<BitSet>> {
    if delete_cursor.get().is_none() {
@@ -259,11 +250,11 @@ fn apply_deletes(
    let doc_to_opstamps = DocToOpstampMapping::WithMap(doc_opstamps);

    let max_doc = segment.meta().max_doc();
    let mut deleted_bitset = BitSet::with_max_value(max_doc);
    let mut deleted_bitset = BitSet::with_max_value_and_full(max_doc);
    let may_have_deletes = compute_deleted_bitset(
        &mut deleted_bitset,
        &segment_reader,
        &mut delete_cursor,
        delete_cursor,
        &doc_to_opstamps,
        max_doc_opstamp,
    )?;
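The hunks above flip the delete-tracking representation: instead of growing a "deleted" bitset, the writer starts from a full "alive" bitset, removes newly deleted docs, intersects with the previously persisted alive set, and derives the delete count. A sketch of that arithmetic with Vec<bool> standing in for tantivy's BitSet:

    fn main() {
        let max_doc = 5usize;
        let mut alive = vec![true; max_doc]; // BitSet::with_max_value_and_full
        for &doc in &[1usize, 3] {
            alive[doc] = false; // alive_bitset.remove(doc)
        }
        // Previously persisted alive set: doc 2 was already deleted.
        let previous_alive = vec![true, true, false, true, true];
        for doc in 0..max_doc {
            alive[doc] &= previous_alive[doc]; // alive_bitset.intersect_update(..)
        }
        let num_alive = alive.iter().filter(|&&a| a).count();
        let num_deleted = max_doc - num_alive;
        assert_eq!(num_deleted, 3); // docs 1, 2 and 3 are gone
    }

The intersection replaces the old per-doc loop over the segment's delete bitset, which is why the TODO about "something smarter" could be dropped.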
@@ -287,8 +278,7 @@ impl IndexWriter {
    /// should work at the same time.
    /// # Errors
    /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
    /// # Panics
    /// If the heap size per thread is too small, panics.
    /// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`
    pub(crate) fn new(
        index: &Index,
        num_threads: usize,
@@ -306,7 +296,7 @@ impl IndexWriter {
            let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
            return Err(TantivyError::InvalidArgument(err_msg));
        }
        let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
        let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
            channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);

        let delete_queue = DeleteQueue::new();
@@ -324,7 +314,7 @@ impl IndexWriter {
            heap_size_in_bytes_per_thread,
            index: index.clone(),

            operation_receiver: document_receiver,
            index_writer_status: IndexWriterStatus::from(document_receiver),
            operation_sender: document_sender,

            segment_updater,
@@ -348,6 +338,11 @@ impl IndexWriter {
        self.operation_sender = sender;
    }

    /// Accessor to the index.
    pub fn index(&self) -> &Index {
        &self.index
    }

    /// If there are some merging threads, blocks until they all finish their work and
    /// then drop the `IndexWriter`.
    pub fn wait_merging_threads(mut self) -> crate::Result<()> {
@@ -359,16 +354,14 @@ impl IndexWriter {
        for join_handle in former_workers_handles {
            join_handle
                .join()
                .expect("Indexing Worker thread panicked")
                .map_err(|_| {
                    TantivyError::ErrorInThread("Error in indexing worker thread.".into())
                })?;
                .map_err(|_| error_in_index_worker_thread("Worker thread panicked."))?
                .map_err(|_| error_in_index_worker_thread("Worker thread failed."))?;
        }

        let result = self
            .segment_updater
            .wait_merging_thread()
            .map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
            .map_err(|_| error_in_index_worker_thread("Failed to join merging thread."));

        if let Err(ref e) = result {
            error!("Some merging thread failed {:?}", e);
@@ -396,10 +389,18 @@ impl IndexWriter {
        self.index.new_segment()
    }

    fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
        self.index_writer_status
            .operation_receiver()
            .ok_or_else(|| crate::TantivyError::ErrorInThread("The index writer was killed. It can happen if an indexing worker encountered an IO error for instance.".to_string()))
    }

    /// Spawns a new worker thread for indexing.
    /// The thread consumes documents from the pipeline.
    fn add_indexing_worker(&mut self) -> crate::Result<()> {
        let document_receiver_clone = self.operation_receiver.clone();
        let document_receiver_clone = self.operation_receiver()?;
        let index_writer_bomb = self.index_writer_status.create_bomb();

        let mut segment_updater = self.segment_updater.clone();

        let mut delete_cursor = self.delete_queue.cursor();
@@ -410,32 +411,31 @@ impl IndexWriter {
            .name(format!("thrd-tantivy-index{}", self.worker_id))
            .spawn(move || {
                loop {
                    let mut document_iterator =
                        document_receiver_clone.clone().into_iter().peekable();
                    let mut document_iterator = document_receiver_clone
                        .clone()
                        .into_iter()
                        .filter(|batch| !batch.is_empty())
                        .peekable();

                    // the peeking here is to avoid
                    // creating a new segment's files
                    // The peeking here is to avoid creating a new segment's files
                    // if no documents are available.
                    //
                    // this is a valid guarantee as the
                    // peeked document now belongs to
                    // This is a valid guarantee as the peeked document now belongs to
                    // our local iterator.
                    if let Some(operations) = document_iterator.peek() {
                        if let Some(first) = operations.first() {
                            delete_cursor.skip_to(first.opstamp);
                        } else {
                            return Ok(());
                        }
                    if let Some(batch) = document_iterator.peek() {
                        assert!(!batch.is_empty());
                        delete_cursor.skip_to(batch[0].opstamp);
                    } else {
                        // No more documents.
                        // Happens when there is a commit, or if the `IndexWriter`
                        // It happens when there is a commit, or if the `IndexWriter`
                        // was dropped.
                        index_writer_bomb.defuse();
                        return Ok(());
                    }
                    let segment = index.new_segment();

                    index_documents(
                        mem_budget,
                        segment,
                        index.new_segment(),
                        &mut document_iterator,
                        &mut segment_updater,
                        delete_cursor.clone(),
@@ -465,10 +465,8 @@ impl IndexWriter {
    }

    /// Detects and removes the files that are not used by the index anymore.
    pub fn garbage_collect_files(
        &self,
    ) -> impl Future<Output = crate::Result<GarbageCollectionResult>> {
        self.segment_updater.schedule_garbage_collect()
    pub async fn garbage_collect_files(&self) -> crate::Result<GarbageCollectionResult> {
        self.segment_updater.schedule_garbage_collect().await
    }

    /// Deletes all documents from the index
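With garbage_collect_files now an async fn, a synchronous caller can drive it with the block_on executor this file already imports. A hedged sketch, assuming garbage_collect_files and GarbageCollectionResult are publicly reachable as the hunk suggests:

    use futures::executor::block_on;
    use tantivy::IndexWriter;

    // Drive the async garbage collection to completion from blocking code.
    fn collect_garbage_blocking(writer: &IndexWriter) -> tantivy::Result<()> {
        let _gc_result = block_on(writer.garbage_collect_files())?;
        Ok(())
    }

This is also why the `futures::future::Future` import disappears above: callers no longer name the returned future type.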
@@ -491,7 +489,7 @@ impl IndexWriter {
    /// let index = Index::create_in_ram(schema.clone());
    ///
    /// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
    /// index_writer.add_document(doc!(title => "The modern Prometheus"));
    /// index_writer.add_document(doc!(title => "The modern Prometheus"))?;
    /// index_writer.commit()?;
    ///
    /// let clear_res = index_writer.delete_all_documents().unwrap();
@@ -535,12 +533,11 @@ impl IndexWriter {
    /// when no documents are remaining.
    ///
    /// Returns the former segment_ready channel.
    #[allow(unused_must_use)]
    fn recreate_document_channel(&mut self) -> OperationReceiver {
        let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
    fn recreate_document_channel(&mut self) {
        let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
            channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
        mem::replace(&mut self.operation_sender, document_sender);
        mem::replace(&mut self.operation_receiver, document_receiver)
        self.operation_sender = document_sender;
        self.index_writer_status = IndexWriterStatus::from(document_receiver);
    }

    /// Rollback to the last commit
@@ -556,7 +553,7 @@ impl IndexWriter {
        // marks the segment updater as killed. From now on, all
        // segment updates will be ignored.
        self.segment_updater.kill();
        let document_receiver = self.operation_receiver.clone();
        let document_receiver = self.operation_receiver();

        // take the directory lock to create a new index_writer.
        let directory_lock = self
@@ -696,14 +693,10 @@ impl IndexWriter {
    /// The opstamp is an increasing `u64` that can
    /// be used by the client to align commits with its own
    /// document queue.
    pub fn add_document(&self, document: Document) -> Opstamp {
    pub fn add_document(&self, document: Document) -> crate::Result<Opstamp> {
        let opstamp = self.stamper.stamp();
        let add_operation = AddOperation { opstamp, document };
        let send_result = self.operation_sender.send(smallvec![add_operation]);
        if let Err(e) = send_result {
            panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
        }
        opstamp
        self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?;
        Ok(opstamp)
    }

    /// Gets a range of stamps from the stamper and "pops" the last stamp
@@ -716,11 +709,7 @@ impl IndexWriter {
    fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
        let Range { start, end } = self.stamper.stamps(count + 1u64);
        let last_opstamp = end - 1;
        let stamps = Range {
            start,
            end: last_opstamp,
        };
        (last_opstamp, stamps)
        (last_opstamp, start..last_opstamp)
    }

    /// Runs a group of document operations ensuring that the operations are
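The rewritten get_batch_opstamps keeps the same arithmetic: request count + 1 stamps, then "pop" the final one as the batch opstamp. A self-contained sketch with a plain counter standing in for the Stamper:

    fn get_batch_opstamps(next_stamp: &mut u64, count: u64) -> (u64, std::ops::Range<u64>) {
        let start = *next_stamp;
        let end = start + count + 1; // stamper.stamps(count + 1)
        *next_stamp = end;
        let last_opstamp = end - 1;
        (last_opstamp, start..last_opstamp) // documents get start..last, the batch gets last
    }

    fn main() {
        let mut stamp = 0u64;
        let (batch_opstamp, doc_stamps) = get_batch_opstamps(&mut stamp, 3);
        assert_eq!(doc_stamps, 0..3); // one stamp per document
        assert_eq!(batch_opstamp, 3); // the popped, final stamp
    }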
@@ -739,16 +728,20 @@ impl IndexWriter {
    /// Like adds and deletes (see `IndexWriter.add_document` and
    /// `IndexWriter.delete_term`), the changes made by calling `run` will be
    /// visible to readers only after calling `commit()`.
    pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
        let count = user_operations.len() as u64;
    pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp>
    where
        I: IntoIterator<Item = UserOperation>,
        I::IntoIter: ExactSizeIterator,
    {
        let user_operations_it = user_operations.into_iter();
        let count = user_operations_it.len() as u64;
        if count == 0 {
            return self.stamper.stamp();
            return Ok(self.stamper.stamp());
        }
        let (batch_opstamp, stamps) = self.get_batch_opstamps(count);

        let mut adds = OperationGroup::default();

        for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
        let mut adds = AddBatch::default();
        for (user_op, opstamp) in user_operations_it.zip(stamps) {
            match user_op {
                UserOperation::Delete(term) => {
                    let delete_operation = DeleteOperation { opstamp, term };
@@ -760,12 +753,16 @@ impl IndexWriter {
                }
            }
        }
        let send_result = self.operation_sender.send(adds);
        if let Err(e) = send_result {
            panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
        };
        self.send_add_documents_batch(adds)?;
        Ok(batch_opstamp)
    }

        batch_opstamp
    fn send_add_documents_batch(&self, add_ops: AddBatch) -> crate::Result<()> {
        if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() {
            Ok(())
        } else {
            Err(error_in_index_worker_thread("An index writer was killed."))
        }
    }
}
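send_add_documents_batch replaces the old panic with a recoverable error: a batch is forwarded only while the writer is alive. A sketch of that gating with std types (mpsc and AtomicBool stand in for crossbeam and IndexWriterStatus):

    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::mpsc;

    // Forward the batch only if the writer is still alive; otherwise
    // surface an error instead of panicking.
    fn send_if_alive(
        alive: &AtomicBool,
        tx: &mpsc::Sender<Vec<u32>>,
        batch: Vec<u32>,
    ) -> Result<(), String> {
        if alive.load(Ordering::Relaxed) && tx.send(batch).is_ok() {
            Ok(())
        } else {
            Err("An index writer was killed.".to_string())
        }
    }

    fn main() {
        let alive = AtomicBool::new(true);
        let (tx, rx) = mpsc::channel();
        assert!(send_if_alive(&alive, &tx, vec![1, 2, 3]).is_ok());
        assert_eq!(rx.recv().unwrap(), vec![1, 2, 3]);
        alive.store(false, Ordering::Relaxed);
        assert!(send_if_alive(&alive, &tx, vec![4]).is_err()); // writer killed
    }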
@@ -831,7 +828,7 @@ mod tests {
            UserOperation::Add(doc!(text_field=>"a")),
            UserOperation::Add(doc!(text_field=>"b")),
        ];
        let batch_opstamp1 = index_writer.run(operations);
        let batch_opstamp1 = index_writer.run(operations).unwrap();
        assert_eq!(batch_opstamp1, 2u64);
    }

@@ -842,14 +839,18 @@ mod tests {
        let index = Index::create_in_ram(schema_builder.build());

        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(text_field => "hello1"));
        index_writer.add_document(doc!(text_field => "hello2"));
        index_writer
            .add_document(doc!(text_field => "hello1"))
            .unwrap();
        index_writer
            .add_document(doc!(text_field => "hello2"))
            .unwrap();
        assert!(index_writer.commit().is_ok());

        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        assert_eq!(searcher.segment_readers().len(), 1);
        assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 0);
        assert_eq!(searcher.segment_reader(0u32).num_docs(), 2);

        index_writer.delete_term(Term::from_field_text(text_field, "hello1"));
        assert!(index_writer.commit().is_ok());
@@ -857,7 +858,7 @@ mod tests {
        assert!(reader.reload().is_ok());
        let searcher = reader.searcher();
        assert_eq!(searcher.segment_readers().len(), 1);
        assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
        assert_eq!(searcher.segment_reader(0u32).num_docs(), 1);

        let previous_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();

@@ -869,7 +870,7 @@ mod tests {
        assert!(reader.reload().is_ok());
        let searcher = reader.searcher();
        assert_eq!(searcher.segment_readers().len(), 1);
        assert_eq!(searcher.segment_reader(0u32).num_deleted_docs(), 1);
        assert_eq!(searcher.segment_reader(0u32).num_docs(), 1);

        let after_delete_opstamp = index.load_metas().unwrap().segments[0].delete_opstamp();
        assert_eq!(after_delete_opstamp, previous_delete_opstamp);
@@ -900,7 +901,7 @@ mod tests {
            UserOperation::Delete(b_term),
        ];

        index_writer.run(operations);
        index_writer.run(operations).unwrap();
        index_writer.commit().expect("failed to commit");
        reader.reload().expect("failed to load searchers");

@@ -930,10 +931,10 @@ mod tests {
        let index = Index::create_in_ram(schema_builder.build());
        let index_writer = index.writer(3_000_000).unwrap();
        let operations1 = vec![];
        let batch_opstamp1 = index_writer.run(operations1);
        let batch_opstamp1 = index_writer.run(operations1).unwrap();
        assert_eq!(batch_opstamp1, 0u64);
        let operations2 = vec![];
        let batch_opstamp2 = index_writer.run(operations2);
        let batch_opstamp2 = index_writer.run(operations2).unwrap();
        assert_eq!(batch_opstamp2, 1u64);
    }

@@ -993,15 +994,14 @@ mod tests {
    }

    #[test]
    fn test_commit_and_rollback() {
    fn test_commit_and_rollback() -> crate::Result<()> {
        let mut schema_builder = schema::Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let reader = index
            .reader_builder()
            .reload_policy(ReloadPolicy::Manual)
            .try_into()
            .unwrap();
            .try_into()?;
        let num_docs_containing = |s: &str| {
            let searcher = reader.searcher();
            let term = Term::from_field_text(text_field, s);
@@ -1010,136 +1010,127 @@ mod tests {

        {
            // writing the segment
            let mut index_writer = index.writer(3_000_000).unwrap();
            index_writer.add_document(doc!(text_field=>"a"));
            index_writer.rollback().unwrap();
            let mut index_writer = index.writer(3_000_000)?;
            index_writer.add_document(doc!(text_field=>"a"))?;
            index_writer.rollback()?;
            assert_eq!(index_writer.commit_opstamp(), 0u64);
            assert_eq!(num_docs_containing("a"), 0);
            {
                index_writer.add_document(doc!(text_field=>"b"));
                index_writer.add_document(doc!(text_field=>"c"));
            }
            assert!(index_writer.commit().is_ok());
            reader.reload().unwrap();
            index_writer.add_document(doc!(text_field=>"b"))?;
            index_writer.add_document(doc!(text_field=>"c"))?;
            index_writer.commit()?;
            reader.reload()?;
            assert_eq!(num_docs_containing("a"), 0);
            assert_eq!(num_docs_containing("b"), 1);
            assert_eq!(num_docs_containing("c"), 1);
        }
        reader.reload().unwrap();
        reader.reload()?;
        reader.searcher();
        Ok(())
    }

    #[test]
    fn test_with_merges() {
    fn test_with_merges() -> crate::Result<()> {
        let mut schema_builder = schema::Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let reader = index
            .reader_builder()
            .reload_policy(ReloadPolicy::Manual)
            .try_into()
            .unwrap();
            .try_into()?;
        let num_docs_containing = |s: &str| {
            let term_a = Term::from_field_text(text_field, s);
            reader.searcher().doc_freq(&term_a).unwrap()
        };
        {
            // writing the segment
            let mut index_writer = index.writer(12_000_000).unwrap();
            // create 8 segments with 100 tiny docs
            for _doc in 0..100 {
                index_writer.add_document(doc!(text_field=>"a"));
            }
            index_writer.commit().expect("commit failed");
            for _doc in 0..100 {
                index_writer.add_document(doc!(text_field=>"a"));
            }
            // this should create 8 segments and trigger a merge.
            index_writer.commit().expect("commit failed");
            index_writer
                .wait_merging_threads()
                .expect("waiting merging thread failed");

            reader.reload().unwrap();

            assert_eq!(num_docs_containing("a"), 200);
            assert!(index.searchable_segments().unwrap().len() < 8);
        // writing the segment
        let mut index_writer = index.writer(12_000_000).unwrap();
        // create 8 segments with 100 tiny docs
        for _doc in 0..100 {
            index_writer.add_document(doc!(text_field=>"a"))?;
        }
        index_writer.commit()?;
        for _doc in 0..100 {
            index_writer.add_document(doc!(text_field=>"a"))?;
        }
        // this should create 8 segments and trigger a merge.
        index_writer.commit()?;
        index_writer.wait_merging_threads()?;
        reader.reload()?;
        assert_eq!(num_docs_containing("a"), 200);
        assert!(index.searchable_segments()?.len() < 8);
        Ok(())
    }

    #[test]
    fn test_prepare_with_commit_message() {
    fn test_prepare_with_commit_message() -> crate::Result<()> {
        let mut schema_builder = schema::Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_in_ram(schema_builder.build());

        // writing the segment
        let mut index_writer = index.writer(12_000_000)?;
        // create 8 segments with 100 tiny docs
        for _doc in 0..100 {
            index_writer.add_document(doc!(text_field => "a"))?;
        }
        {
            let mut prepared_commit = index_writer.prepare_commit()?;
            prepared_commit.set_payload("first commit");
            prepared_commit.commit()?;
        }
        {
            let metas = index.load_metas()?;
            assert_eq!(metas.payload.unwrap(), "first commit");
        }
        for _doc in 0..100 {
            index_writer.add_document(doc!(text_field => "a"))?;
        }
        index_writer.commit()?;
        {
            let metas = index.load_metas()?;
            assert!(metas.payload.is_none());
        }
        Ok(())
    }

    #[test]
    fn test_prepare_but_rollback() -> crate::Result<()> {
        let mut schema_builder = schema::Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_in_ram(schema_builder.build());

        {
            // writing the segment
            let mut index_writer = index.writer(12_000_000).unwrap();
            let mut index_writer = index.writer_with_num_threads(4, 12_000_000)?;
            // create 8 segments with 100 tiny docs
            for _doc in 0..100 {
                index_writer.add_document(doc!(text_field => "a"));
                index_writer.add_document(doc!(text_field => "a"))?;
            }
            {
                let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
                let mut prepared_commit = index_writer.prepare_commit()?;
                prepared_commit.set_payload("first commit");
                prepared_commit.commit().expect("commit failed");
                prepared_commit.abort()?;
            }
            {
                let metas = index.load_metas().unwrap();
                assert_eq!(metas.payload.unwrap(), "first commit");
            }
            for _doc in 0..100 {
                index_writer.add_document(doc!(text_field => "a"));
            }
            index_writer.commit().unwrap();
            {
                let metas = index.load_metas().unwrap();
                assert!(metas.payload.is_none());
            }
        }
    }

    #[test]
    fn test_prepare_but_rollback() {
        let mut schema_builder = schema::Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_in_ram(schema_builder.build());

        {
            // writing the segment
            let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
            // create 8 segments with 100 tiny docs
            for _doc in 0..100 {
                index_writer.add_document(doc!(text_field => "a"));
            }
            {
                let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
                prepared_commit.set_payload("first commit");
                prepared_commit.abort().expect("commit failed");
            }
            {
                let metas = index.load_metas().unwrap();
                let metas = index.load_metas()?;
                assert!(metas.payload.is_none());
            }
            for _doc in 0..100 {
                index_writer.add_document(doc!(text_field => "b"));
                index_writer.add_document(doc!(text_field => "b"))?;
            }
            index_writer.commit().unwrap();
            index_writer.commit()?;
        }
        let num_docs_containing = |s: &str| {
            let term_a = Term::from_field_text(text_field, s);
            index
                .reader_builder()
                .reload_policy(ReloadPolicy::Manual)
                .try_into()
                .unwrap()
                .try_into()?
                .searcher()
                .doc_freq(&term_a)
                .unwrap()
        };
        assert_eq!(num_docs_containing("a"), 0);
        assert_eq!(num_docs_containing("b"), 100);
        assert_eq!(num_docs_containing("a")?, 0);
        assert_eq!(num_docs_containing("b")?, 100);
        Ok(())
    }

    #[test]
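The two tests above exercise both exits of the prepared-commit flow. A hedged usage sketch, assuming the tantivy API exactly as it appears in this diff (prepare_commit, set_payload, then either commit or abort):

    use tantivy::{IndexWriter, Result};

    // Two-phase commit: prepare first, then decide. commit publishes the
    // payload; abort rolls the prepared state back.
    fn two_phase_commit(index_writer: &mut IndexWriter, publish: bool) -> Result<()> {
        let mut prepared = index_writer.prepare_commit()?;
        prepared.set_payload("first commit");
        if publish {
            prepared.commit()?;
        } else {
            prepared.abort()?;
        }
        Ok(())
    }

As the tests check via load_metas(), only the commit path leaves the payload visible in the index metadata.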
@@ -1160,7 +1151,7 @@ mod tests {
        };
        let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();

        let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
        let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap();
        let commit_tstamp = index_writer.commit().unwrap();
        assert!(commit_tstamp > add_tstamp);
        index_writer.delete_all_documents().unwrap();
@@ -1177,7 +1168,7 @@ mod tests {
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();

        let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
        let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap();

        // commit documents - they are now available
        let first_commit = index_writer.commit();
@@ -1196,7 +1187,7 @@ mod tests {

        // add new documents again
        for _ in 0..100 {
            index_writer.add_document(doc!(text_field => "b"));
            index_writer.add_document(doc!(text_field => "b")).unwrap();
        }

        // rollback to last commit, when index was empty
@@ -1230,7 +1221,7 @@ mod tests {

        assert!(index_writer.commit().is_ok());
        // add one simple doc
        index_writer.add_document(doc!(text_field => "a"));
        index_writer.add_document(doc!(text_field => "a")).unwrap();
        assert!(index_writer.commit().is_ok());

        let term_a = Term::from_field_text(text_field, "a");
@@ -1254,7 +1245,7 @@ mod tests {
        let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();

        // add one simple doc
        index_writer.add_document(doc!(text_field => "a"));
        assert!(index_writer.add_document(doc!(text_field => "a")).is_ok());
        let comm = index_writer.commit();
        assert!(comm.is_ok());
        let commit_tstamp = comm.unwrap();
@@ -1330,13 +1321,13 @@ mod tests {

        // create and delete docs in same commit
        for id in 0u64..5u64 {
            index_writer.add_document(doc!(id_field => id));
            index_writer.add_document(doc!(id_field => id))?;
        }
        for id in 2u64..4u64 {
            index_writer.delete_term(Term::from_field_u64(id_field, id));
        }
        for id in 5u64..10u64 {
            index_writer.add_document(doc!(id_field => id));
            index_writer.add_document(doc!(id_field => id))?;
        }
        index_writer.commit()?;
        index_reader.reload()?;
@@ -1361,13 +1352,24 @@ mod tests {
        AddDoc { id: u64 },
        DeleteDoc { id: u64 },
        Commit,
        Merge,
    }

    fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
    fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
        prop_oneof![
            (0u64..10u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
            (0u64..10u64).prop_map(|id| IndexingOp::AddDoc { id }),
            (0u64..2u64).prop_map(|_| IndexingOp::Commit),
            (0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
            (0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }),
            (0u64..1u64).prop_map(|_| IndexingOp::Commit),
            (0u64..1u64).prop_map(|_| IndexingOp::Merge),
        ]
    }

    fn adding_operation_strategy() -> impl Strategy<Value = IndexingOp> {
        prop_oneof![
            10 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
            50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
            2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
            1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
        ]
    }

@@ -1393,7 +1395,7 @@ mod tests {
    fn test_operation_strategy(
        ops: &[IndexingOp],
        sort_index: bool,
        force_merge: bool,
        force_end_merge: bool,
    ) -> crate::Result<()> {
        let mut schema_builder = schema::Schema::builder();
        let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED);
@@ -1435,12 +1437,16 @@ mod tests {
            .settings(settings)
            .create_in_ram()?;
        let mut index_writer = index.writer_for_tests()?;
        index_writer.set_merge_policy(Box::new(NoMergePolicy));

        let old_reader = index.reader()?;

        for &op in ops {
            match op {
                IndexingOp::AddDoc { id } => {
                    let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
                    index_writer
                        .add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM));
                        .add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM))?;
                }
                IndexingOp::DeleteDoc { id } => {
                    index_writer.delete_term(Term::from_field_u64(id_field, id));
@@ -1448,12 +1454,21 @@ mod tests {
                IndexingOp::Commit => {
                    index_writer.commit()?;
                }
                IndexingOp::Merge => {
                    let segment_ids = index
                        .searchable_segment_ids()
                        .expect("Searchable segments failed.");
                    if segment_ids.len() >= 2 {
                        block_on(index_writer.merge(&segment_ids)).unwrap();
                        assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
                    }
                }
            }
        }
        index_writer.commit()?;

        let searcher = index.reader()?.searcher();
        if force_merge {
        if force_end_merge {
            index_writer.wait_merging_threads()?;
            let mut index_writer = index.writer_for_tests()?;
            let segment_ids = index
@@ -1464,6 +1479,21 @@ mod tests {
                assert!(index_writer.wait_merging_threads().is_ok());
            }
        }

        old_reader.reload()?;
        let old_searcher = old_reader.searcher();

        let ids_old_searcher: HashSet<u64> = old_searcher
            .segment_readers()
            .iter()
            .flat_map(|segment_reader| {
                let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
                segment_reader
                    .doc_ids_alive()
                    .map(move |doc| ff_reader.get(doc))
            })
            .collect();

        let ids: HashSet<u64> = searcher
            .segment_readers()
            .iter()
@@ -1476,6 +1506,19 @@ mod tests {
            .collect();

        let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops);
        let num_docs_expected = expected_ids_and_num_occurences
            .iter()
            .map(|(_, id_occurences)| *id_occurences as usize)
            .sum::<usize>();
        assert_eq!(searcher.num_docs() as usize, num_docs_expected);
        assert_eq!(old_searcher.num_docs() as usize, num_docs_expected);
        assert_eq!(
            ids_old_searcher,
            expected_ids_and_num_occurences
                .keys()
                .cloned()
                .collect::<HashSet<_>>()
        );
        assert_eq!(
            ids,
            expected_ids_and_num_occurences
@@ -1500,7 +1543,7 @@ mod tests {
        for segment_reader in searcher.segment_readers().iter() {
            let store_reader = segment_reader.get_store_reader().unwrap();
            // test store iterator
            for doc in store_reader.iter(segment_reader.delete_bitset()) {
            for doc in store_reader.iter(segment_reader.alive_bitset()) {
                let id = doc
                    .unwrap()
                    .get_first(id_field)
@@ -1570,22 +1613,42 @@ mod tests {
    }

    proptest! {
        #![proptest_config(ProptestConfig::with_cases(20))]
        #[test]
        fn test_delete_with_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
        fn test_delete_with_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
            assert!(test_operation_strategy(&ops[..], true, false).is_ok());
        }
        #[test]
        fn test_delete_without_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
        fn test_delete_without_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
            assert!(test_operation_strategy(&ops[..], false, false).is_ok());
        }
        #[test]
        fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
        fn test_delete_with_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
            assert!(test_operation_strategy(&ops[..], true, true).is_ok());
        }
        #[test]
        fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
        fn test_delete_without_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
            assert!(test_operation_strategy(&ops[..], false, true).is_ok());
        }

        #[test]
        fn test_delete_with_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
            assert!(test_operation_strategy(&ops[..], true, false).is_ok());
        }
        #[test]
        fn test_delete_without_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
            assert!(test_operation_strategy(&ops[..], false, false).is_ok());
        }
        #[test]
        fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
            assert!(test_operation_strategy(&ops[..], true, true).is_ok());
        }
        #[test]
        fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..100)) {
            assert!(test_operation_strategy(&ops[..], false, true).is_ok());
        }
    }

    #[test]
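The new adding_operation_strategy above uses prop_oneof!'s weighted arms (`weight => strategy`) so adds dominate deletes, commits and merges, and the generated runs build larger segments. A minimal sketch of the same device, assuming the proptest crate as used in these tests:

    use proptest::prelude::*;

    #[derive(Clone, Copy, Debug)]
    enum Op {
        Add(u64),
        Delete(u64),
        Commit,
    }

    // Roughly 50:10:2 odds in favor of adds, mirroring the weights above.
    fn adding_heavy() -> impl Strategy<Value = Op> {
        prop_oneof![
            10 => (0u64..100).prop_map(Op::Delete),
            50 => (0u64..100).prop_map(Op::Add),
            2 => Just(Op::Commit),
        ]
    }

The unweighted balanced_operation_strategy keeps equal odds per arm, which is why both strategies are kept and exercised by separate proptest blocks.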
@@ -1610,11 +1673,11 @@ mod tests {
        let mut index_writer = index.writer_for_tests()?;

        // We add a doc...
        index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64));
        index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64))?;
        // And remove it.
        index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
        // We add another doc.
        index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64));
        index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64))?;

        // The expected result is a segment with
        // maxdoc = 2
@@ -1626,19 +1689,19 @@ mod tests {

        let segment_reader = searcher.segment_reader(0);
        assert_eq!(segment_reader.max_doc(), 2);
        assert_eq!(segment_reader.num_deleted_docs(), 1);
        assert_eq!(segment_reader.num_docs(), 1);
        Ok(())
    }

    #[test]
    fn test_index_doc_missing_field() {
    fn test_index_doc_missing_field() -> crate::Result<()> {
        let mut schema_builder = schema::Schema::builder();
        let idfield = schema_builder.add_text_field("id", STRING);
        schema_builder.add_text_field("optfield", STRING);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests().unwrap();
        index_writer.add_document(doc!(idfield=>"myid"));
        let commit = index_writer.commit();
        assert!(commit.is_ok());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(idfield=>"myid"))?;
        index_writer.commit()?;
        Ok(())
    }
}
118
src/indexer/index_writer_status.rs
Normal file
@@ -0,0 +1,118 @@
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, RwLock};

use super::AddBatchReceiver;

#[derive(Clone)]
pub(crate) struct IndexWriterStatus {
    inner: Arc<Inner>,
}

impl IndexWriterStatus {
    /// Returns true iff the index writer is alive.
    pub fn is_alive(&self) -> bool {
        self.inner.as_ref().is_alive()
    }

    /// Returns a copy of the operation receiver.
    /// If the index writer was killed, returns None.
    pub fn operation_receiver(&self) -> Option<AddBatchReceiver> {
        let rlock = self
            .inner
            .receive_channel
            .read()
            .expect("This lock should never be poisoned");
        rlock.as_ref().map(|receiver| receiver.clone())
    }

    /// Create an index writer bomb.
    /// If dropped, the index writer status will be killed.
    pub(crate) fn create_bomb(&self) -> IndexWriterBomb {
        IndexWriterBomb {
            inner: Some(self.inner.clone()),
        }
    }
}

struct Inner {
    is_alive: AtomicBool,
    receive_channel: RwLock<Option<AddBatchReceiver>>,
}

impl Inner {
    fn is_alive(&self) -> bool {
        self.is_alive.load(Ordering::Relaxed)
    }

    fn kill(&self) {
        self.is_alive.store(false, Ordering::Relaxed);
        self.receive_channel
            .write()
            .expect("This lock should never be poisoned")
            .take();
    }
}

impl From<AddBatchReceiver> for IndexWriterStatus {
    fn from(receiver: AddBatchReceiver) -> Self {
        IndexWriterStatus {
            inner: Arc::new(Inner {
                is_alive: AtomicBool::new(true),
                receive_channel: RwLock::new(Some(receiver)),
            }),
        }
    }
}

/// If dropped, the index writer will be killed.
/// To prevent this, clients can call `.defuse()`.
pub(crate) struct IndexWriterBomb {
    inner: Option<Arc<Inner>>,
}

impl IndexWriterBomb {
    /// Defuses the bomb.
    ///
    /// This is the only way to drop the bomb without killing
    /// the index writer.
    pub fn defuse(mut self) {
        self.inner = None;
    }
}

impl Drop for IndexWriterBomb {
    fn drop(&mut self) {
        if let Some(inner) = self.inner.take() {
            inner.kill();
        }
    }
}

#[cfg(test)]
mod tests {
    use super::IndexWriterStatus;
    use crossbeam::channel;
    use std::mem;

    #[test]
    fn test_bomb_goes_boom() {
        let (_tx, rx) = channel::bounded(10);
        let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx);
        assert!(index_writer_status.operation_receiver().is_some());
        let bomb = index_writer_status.create_bomb();
        assert!(index_writer_status.operation_receiver().is_some());
        mem::drop(bomb);
        // boom!
        assert!(index_writer_status.operation_receiver().is_none());
    }

    #[test]
    fn test_bomb_defused() {
        let (_tx, rx) = channel::bounded(10);
        let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx);
        assert!(index_writer_status.operation_receiver().is_some());
        let bomb = index_writer_status.create_bomb();
        bomb.defuse();
        assert!(index_writer_status.operation_receiver().is_some());
    }
}
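The bomb tests above pin down the drop semantics. A self-contained sketch of the same guard pattern with std types only (no tantivy involved):

    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::Arc;

    // A guard that kills a shared flag on drop unless explicitly defused,
    // like IndexWriterBomb above.
    struct Bomb {
        inner: Option<Arc<AtomicBool>>,
    }

    impl Bomb {
        fn defuse(mut self) {
            self.inner = None; // Drop will find nothing to kill.
        }
    }

    impl Drop for Bomb {
        fn drop(&mut self) {
            if let Some(alive) = self.inner.take() {
                alive.store(false, Ordering::Relaxed); // boom: mark the writer dead
            }
        }
    }

    fn main() {
        let alive = Arc::new(AtomicBool::new(true));
        let bomb = Bomb { inner: Some(alive.clone()) };
        drop(bomb); // a panic or early return in a worker has the same effect
        assert!(!alive.load(Ordering::Relaxed));
    }

The point of the pattern is that any exit path from the worker thread, including a panic unwinding past the bomb, flips the status to dead, so senders stop queuing batches.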
@@ -114,7 +114,7 @@ mod tests {
    use crate::Index;

    #[test]
    fn create_index_test_max_merge_issue_1035() {
    fn create_index_test_max_merge_issue_1035() -> crate::Result<()> {
        let mut schema_builder = schema::Schema::builder();
        let int_field = schema_builder.add_u64_field("intval", INDEXED);
        let schema = schema_builder.build();
@@ -127,34 +127,34 @@ mod tests {
            log_merge_policy.set_max_docs_before_merge(1);
            log_merge_policy.set_min_layer_size(0);

            let mut index_writer = index.writer_for_tests().unwrap();
            let mut index_writer = index.writer_for_tests()?;
            index_writer.set_merge_policy(Box::new(log_merge_policy));

            // after every commit the merge checker is started, it will merge only segments with 1
            // element in it because of the max_merge_size.
            index_writer.add_document(doc!(int_field=>1_u64));
            assert!(index_writer.commit().is_ok());
            index_writer.add_document(doc!(int_field=>1_u64))?;
            index_writer.commit()?;

            index_writer.add_document(doc!(int_field=>2_u64));
            assert!(index_writer.commit().is_ok());
            index_writer.add_document(doc!(int_field=>2_u64))?;
            index_writer.commit()?;

            index_writer.add_document(doc!(int_field=>3_u64));
            assert!(index_writer.commit().is_ok());
            index_writer.add_document(doc!(int_field=>3_u64))?;
            index_writer.commit()?;

            index_writer.add_document(doc!(int_field=>4_u64));
            assert!(index_writer.commit().is_ok());
            index_writer.add_document(doc!(int_field=>4_u64))?;
            index_writer.commit()?;

            index_writer.add_document(doc!(int_field=>5_u64));
            assert!(index_writer.commit().is_ok());
            index_writer.add_document(doc!(int_field=>5_u64))?;
            index_writer.commit()?;

            index_writer.add_document(doc!(int_field=>6_u64));
            assert!(index_writer.commit().is_ok());
            index_writer.add_document(doc!(int_field=>6_u64))?;
            index_writer.commit()?;

            index_writer.add_document(doc!(int_field=>7_u64));
            assert!(index_writer.commit().is_ok());
            index_writer.add_document(doc!(int_field=>7_u64))?;
            index_writer.commit()?;

            index_writer.add_document(doc!(int_field=>8_u64));
            assert!(index_writer.commit().is_ok());
            index_writer.add_document(doc!(int_field=>8_u64))?;
            index_writer.commit()?;
        }

        let _segment_ids = index
@@ -169,6 +169,7 @@ mod tests {
            panic!("segment can't have more than two segments");
        } // don't know how to wait for the merge, then it could be a simple eq
    }
        Ok(())
    }

    fn test_merge_policy() -> LogMergePolicy {
File diff suppressed because it is too large
@@ -1,6 +1,7 @@
#[cfg(test)]
mod tests {
    use crate::fastfield::FastFieldReader;
    use crate::fastfield::{AliveBitSet, FastFieldReader};
    use crate::schema::IndexRecordOption;
    use crate::{
        collector::TopDocs,
        schema::{Cardinality, TextFieldIndexing},
@@ -16,7 +17,7 @@ mod tests {
        schema::{self, BytesOptions},
        DocAddress,
    };
    use crate::{IndexSettings, Term};
    use crate::{DocSet, IndexSettings, Postings, Term};
    use futures::executor::block_on;

    fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
@@ -38,14 +39,17 @@ mod tests {

        {
            let mut index_writer = index.writer_for_tests().unwrap();

            index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")));
            index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")));

            assert!(index_writer.commit().is_ok());
            index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")));

            assert!(index_writer.commit().is_ok());
            index_writer
                .add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
                .unwrap();
            index_writer
                .add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")))
                .unwrap();
            index_writer.commit().unwrap();
            index_writer
                .add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")))
                .unwrap();
            index_writer.commit().unwrap();
        }

        // Merging the segments
@@ -65,7 +69,7 @@ mod tests {
    fn create_test_index(
        index_settings: Option<IndexSettings>,
        force_disjunct_segment_sort_values: bool,
    ) -> Index {
    ) -> crate::Result<Index> {
        let mut schema_builder = schema::Schema::builder();
        let int_options = IntOptions::default()
            .set_fast(Cardinality::SingleValue)
@@ -94,32 +98,34 @@ mod tests {
        if let Some(settings) = index_settings {
            index_builder = index_builder.settings(settings);
        }
        let index = index_builder.create_in_ram().unwrap();
        let index = index_builder.create_in_ram()?;

        {
            let mut index_writer = index.writer_for_tests().unwrap();
            let mut index_writer = index.writer_for_tests()?;

            // segment 1 - range 1-3
            index_writer.add_document(doc!(int_field=>1_u64));
            index_writer.add_document(doc!(int_field=>1_u64))?;
            index_writer.add_document(
                doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
            );
            index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme"));
            )?;
            index_writer.add_document(
                doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64),
            );
                doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
            )?;
            index_writer.add_document(
                doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
            )?;

            assert!(index_writer.commit().is_ok());
            index_writer.commit()?;
            // segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
            index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64));
            index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;

            let in_val = if force_disjunct_segment_sort_values {
                10_u64
            } else {
                1
            };
            index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
            assert!(index_writer.commit().is_ok());
            index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
            index_writer.commit()?;
            // segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
            let int_vals = if force_disjunct_segment_sort_values {
                [100_u64, 50]
@@ -128,26 +134,24 @@ mod tests {
            };
            index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
                doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
            );
            index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"));
            )?;
            index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
            index_writer.add_document(
                doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
            );
            )?;

            index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
            assert!(index_writer.commit().is_ok());
            index_writer.commit()?;
        }

        // Merging the segments
        {
            let segment_ids = index
                .searchable_segment_ids()
                .expect("Searchable segments failed.");
            let mut index_writer = index.writer_for_tests().unwrap();
            assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
            assert!(index_writer.wait_merging_threads().is_ok());
            let segment_ids = index.searchable_segment_ids()?;
            let mut index_writer = index.writer_for_tests()?;
            block_on(index_writer.merge(&segment_ids))?;
            index_writer.wait_merging_threads()?;
        }
        index
        Ok(index)
    }

    #[test]
@@ -180,7 +184,8 @@ mod tests {
                ..Default::default()
            }),
            force_disjunct_segment_sort_values,
        );
        )
        .unwrap();

        let int_field = index.schema().get_field("intval").unwrap();
        let reader = index.reader().unwrap();
@@ -243,6 +248,36 @@ mod tests {
            assert_eq!(do_search("biggest"), vec![0]);
        }

        // postings file
        {
            let my_text_field = index.schema().get_field("text_field").unwrap();
            let term_a = Term::from_field_text(my_text_field, "text");
            let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
            let mut postings = inverted_index
                .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
                .unwrap()
                .unwrap();

            assert_eq!(postings.doc_freq(), 2);
            let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
            assert_eq!(
                postings.doc_freq_given_deletes(
                    segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
                ),
                2
            );

            assert_eq!(postings.term_freq(), 1);
            let mut output = vec![];
            postings.positions(&mut output);
            assert_eq!(output, vec![1]);
            postings.advance();

            assert_eq!(postings.term_freq(), 2);
            postings.positions(&mut output);
            assert_eq!(output, vec![1, 3]);
        }

        // access doc store
        {
            let blubber_pos = if force_disjunct_segment_sort_values {
@@ -260,6 +295,70 @@ mod tests {
        }
    }

    #[test]
    fn test_merge_unsorted_index() {
        let index = create_test_index(
            Some(IndexSettings {
                ..Default::default()
            }),
            false,
        )
        .unwrap();

        let reader = index.reader().unwrap();
        let searcher = reader.searcher();
        assert_eq!(searcher.segment_readers().len(), 1);
        let segment_reader = searcher.segment_readers().last().unwrap();

        let searcher = index.reader().unwrap().searcher();
        {
            let my_text_field = index.schema().get_field("text_field").unwrap();

            let do_search = |term: &str| {
                let query = QueryParser::for_index(&index, vec![my_text_field])
                    .parse_query(term)
                    .unwrap();
                let top_docs: Vec<(f32, DocAddress)> =
                    searcher.search(&query, &TopDocs::with_limit(3)).unwrap();

                top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
            };

            assert_eq!(do_search("some"), vec![1]);
            assert_eq!(do_search("blubber"), vec![3]);
            assert_eq!(do_search("biggest"), vec![4]);
        }

        // postings file
        {
            let my_text_field = index.schema().get_field("text_field").unwrap();
            let term_a = Term::from_field_text(my_text_field, "text");
            let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
            let mut postings = inverted_index
                .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
                .unwrap()
                .unwrap();
            assert_eq!(postings.doc_freq(), 2);
            let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
            assert_eq!(
                postings.doc_freq_given_deletes(
                    segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
                ),
                2
            );

            assert_eq!(postings.term_freq(), 1);
            let mut output = vec![];
            postings.positions(&mut output);
            assert_eq!(output, vec![1]);
            postings.advance();

            assert_eq!(postings.term_freq(), 2);
            postings.positions(&mut output);
            assert_eq!(output, vec![1, 3]);
        }
    }

    #[test]
    fn test_merge_sorted_index_asc() {
        let index = create_test_index(
@@ -271,7 +370,8 @@ mod tests {
                ..Default::default()
            }),
            false,
        );
        )
        .unwrap();

        let int_field = index.schema().get_field("intval").unwrap();
        let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
@@ -314,7 +414,7 @@ mod tests {
        let my_text_field = index.schema().get_field("text_field").unwrap();
        let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
        assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
        assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
        assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
        assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
        assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
        assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
@@ -339,6 +439,34 @@ mod tests {
            assert_eq!(do_search("biggest"), vec![5]);
        }

        // postings file
        {
            let my_text_field = index.schema().get_field("text_field").unwrap();
            let term_a = Term::from_field_text(my_text_field, "text");
            let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
            let mut postings = inverted_index
                .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
                .unwrap()
                .unwrap();

            assert_eq!(postings.doc_freq(), 2);
            let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
            assert_eq!(
                postings.doc_freq_given_deletes(
                    segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
                ),
                2
            );

            let mut output = vec![];
            postings.positions(&mut output);
            assert_eq!(output, vec![1, 3]);
            postings.advance();

            postings.positions(&mut output);
            assert_eq!(output, vec![1]);
        }

        // access doc store
        {
            let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
@@ -422,14 +550,15 @@ mod bench_sorted_index_merge {
        let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap();
        b.iter(|| {

            let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, reader)|{
                let u64_reader: DynamicFastFieldReader<u64> = reader.reader
            let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, ordinal)|{
                let reader = &merger.readers[*ordinal as usize];
                let u64_reader: DynamicFastFieldReader<u64> = reader
                    .fast_fields()
                    .typed_fast_field_reader(field)
                    .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
                (doc_id, reader, u64_reader)
            });
            // add values in order of the new docids
            // add values in order of the new doc_ids
            let mut val = 0;
            for (doc_id, _reader, field_reader) in sorted_doc_ids {
                val = field_reader.get(*doc_id);
@@ -442,7 +571,7 @@ mod bench_sorted_index_merge {
        Ok(())
    }
    #[bench]
    fn create_sorted_index_create_docid_mapping(b: &mut Bencher) -> crate::Result<()> {
    fn create_sorted_index_create_doc_id_mapping(b: &mut Bencher) -> crate::Result<()> {
        let sort_by_field = IndexSortByField {
            field: "intval".to_string(),
            order: Order::Desc,
@@ -1,15 +1,17 @@
pub mod delete_queue;

pub mod demuxer;
pub mod doc_id_mapping;
mod doc_opstamp_mapping;
pub mod index_writer;
mod index_writer_status;
mod log_merge_policy;
mod merge_operation;
pub mod merge_policy;
pub mod merger;
mod merger_sorted_index_test;
pub mod operation;
mod prepared_commit;
pub mod prepared_commit;
mod segment_entry;
mod segment_manager;
mod segment_register;
@@ -18,6 +20,11 @@ pub mod segment_updater;
mod segment_writer;
mod stamper;

use crossbeam::channel;
use smallvec::SmallVec;

use crate::indexer::operation::AddOperation;

pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;
@@ -26,12 +33,23 @@ pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry;
pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_updater::merge_segments;
pub use self::segment_updater::merge_filtered_segments;
pub use self::segment_updater::merge_indices;
pub use self::segment_writer::SegmentWriter;

/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;

// Batch of documents.
// Most of the time, users will send operations one by one, but it can be useful to
// send them as a small batch to ensure that
// - all docs in the batch land in the same segment, with contiguous doc_ids.
// - all operations in the group are committed at the same time, making the group
//   atomic.
type AddBatch = SmallVec<[AddOperation; 4]>;
type AddBatchSender = channel::Sender<AddBatch>;
type AddBatchReceiver = channel::Receiver<AddBatch>;
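The `SmallVec<[AddOperation; 4]>` alias keeps typical small batches allocation-free. A minimal sketch of what that buys; the `AddOperation` below is a simplified stand-in, not tantivy's full type, which also carries the document:

use smallvec::SmallVec;

// Illustrative stand-in for tantivy's AddOperation.
struct AddOperation {
    opstamp: u64,
}

type AddBatch = SmallVec<[AddOperation; 4]>;

fn main() {
    // A typical one-document "batch" fits in the inline capacity of 4,
    // so no heap allocation happens on the common path.
    let batch: AddBatch = (0..1).map(|opstamp| AddOperation { opstamp }).collect();
    assert!(!batch.spilled());

    // A larger group spills to the heap, but still moves through the channel
    // as one unit, so its documents share a segment and commit together.
    let big: AddBatch = (0..10).map(|opstamp| AddOperation { opstamp }).collect();
    assert!(big.spilled());
}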
#[cfg(feature = "mmap")]
#[cfg(test)]
mod tests_mmap {
@@ -39,19 +57,20 @@ mod tests_mmap {
    use crate::{Index, Term};

    #[test]
    fn test_advance_delete_bug() {
    fn test_advance_delete_bug() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
        let mut index_writer = index.writer_for_tests().unwrap();
        let index = Index::create_from_tempdir(schema_builder.build())?;
        let mut index_writer = index.writer_for_tests()?;
        // there must be one deleted document in the segment
        index_writer.add_document(doc!(text_field=>"b"));
        index_writer.add_document(doc!(text_field=>"b"))?;
        index_writer.delete_term(Term::from_field_text(text_field, "b"));
        // we need enough data to trigger the bug (at least 32 documents)
        for _ in 0..32 {
            index_writer.add_document(doc!(text_field=>"c"));
            index_writer.add_document(doc!(text_field=>"c"))?;
        }
        index_writer.commit().unwrap();
        index_writer.commit().unwrap();
        index_writer.commit()?;
        index_writer.commit()?;
        Ok(())
    }
}
@@ -18,25 +18,38 @@ impl<'a> PreparedCommit<'a> {
        }
    }

    /// Returns the opstamp associated to the prepared commit.
    pub fn opstamp(&self) -> Opstamp {
        self.opstamp
    }

    /// Adds an arbitrary payload to the commit.
    pub fn set_payload(&mut self, payload: &str) {
        self.payload = Some(payload.to_string())
    }

    /// Rolls back any change.
    pub fn abort(self) -> crate::Result<Opstamp> {
        self.index_writer.rollback()
    }

    /// Proceeds to commit.
    /// See `.commit_async()`.
    pub fn commit(self) -> crate::Result<Opstamp> {
        block_on(self.commit_async())
    }

    /// Proceeds to commit.
    ///
    /// Unfortunately, contrary to what `PreparedCommit` may suggest,
    /// this operation is not actually lightweight:
    /// at this point, deletes have not been flushed yet.
    pub async fn commit_async(self) -> crate::Result<Opstamp> {
        info!("committing {}", self.opstamp);
        let _ = block_on(
            self.index_writer
                .segment_updater()
                .schedule_commit(self.opstamp, self.payload),
        );
        self.index_writer
            .segment_updater()
            .schedule_commit(self.opstamp, self.payload)
            .await?;
        Ok(self.opstamp)
    }
}
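For context, the two-phase flow this type supports looks roughly like this from the caller's side. A sketch against the post-change, Result-returning API; `prepare_commit` is `IndexWriter`'s entry point into this type:

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(text => "hello"))?;

    // Stage the commit, attach a payload, then either commit or abort.
    let mut prepared = writer.prepare_commit()?;
    prepared.set_payload("checkpoint-1");
    let opstamp = prepared.commit()?;
    println!("committed at opstamp {}", opstamp);
    Ok(())
}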
@@ -1,7 +1,7 @@
use crate::common::BitSet;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::indexer::delete_queue::DeleteCursor;
use common::BitSet;
use std::fmt;

/// A segment entry describes the state of
@@ -9,18 +9,16 @@ use std::fmt;
///
/// In addition to segment `meta`,
/// it contains a few transient states
/// - `state` expresses whether the segment is already in the
///   middle of a merge
/// - `delete_bitset` is a bitset describing
///   documents that were deleted during the commit
/// - `alive_bitset` is a bitset describing
///   documents that were alive during the commit
///   itself.
/// - `delete_cursor` is the position in the delete queue.
///   Deletes happening before the cursor are reflected either
///   in the .del file or in the `delete_bitset`.
///   in the .del file or in the `alive_bitset`.
#[derive(Clone)]
pub struct SegmentEntry {
    meta: SegmentMeta,
    delete_bitset: Option<BitSet>,
    alive_bitset: Option<BitSet>,
    delete_cursor: DeleteCursor,
}

@@ -29,11 +27,11 @@ impl SegmentEntry {
    pub fn new(
        segment_meta: SegmentMeta,
        delete_cursor: DeleteCursor,
        delete_bitset: Option<BitSet>,
        alive_bitset: Option<BitSet>,
    ) -> SegmentEntry {
        SegmentEntry {
            meta: segment_meta,
            delete_bitset,
            alive_bitset,
            delete_cursor,
        }
    }
@@ -41,8 +39,8 @@ impl SegmentEntry {
    /// Return a reference to the segment entry deleted bitset.
    ///
    /// `DocId` in this bitset are flagged as deleted.
    pub fn delete_bitset(&self) -> Option<&BitSet> {
        self.delete_bitset.as_ref()
    pub fn alive_bitset(&self) -> Option<&BitSet> {
        self.alive_bitset.as_ref()
    }

    /// Set the `SegmentMeta` for this segment.
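The rename from `delete_bitset` to `alive_bitset` flips the polarity: a set bit now means "document survives". The test helper used throughout this changeset makes the duality concrete. A sketch, with `is_alive` as the accessor name as read from this tantivy version:

use tantivy::fastfield::AliveBitSet;

fn main() {
    // 4 documents total, with doc 1 deleted: docs 0, 2 and 3 stay alive.
    let alive = AliveBitSet::for_test_from_deleted_docs(&[1], 4);
    assert!(alive.is_alive(0));
    assert!(!alive.is_alive(1));
    assert!(alive.is_alive(2));
    assert!(alive.is_alive(3));
}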
@@ -66,13 +66,10 @@ impl SegmentRegister {
    }

    pub fn segment_metas(&self) -> Vec<SegmentMeta> {
        let mut segment_ids: Vec<SegmentMeta> = self
            .segment_states
        self.segment_states
            .values()
            .map(|segment_entry| segment_entry.meta().clone())
            .collect();
        segment_ids.sort_by_key(SegmentMeta::id);
        segment_ids
            .collect()
    }

    pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {
@@ -7,6 +7,7 @@ use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::core::META_FILEPATH;
use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
use crate::fastfield::AliveBitSet;
use crate::indexer::delete_queue::DeleteCursor;
use crate::indexer::index_writer::advance_deletes;
use crate::indexer::merge_operation::MergeOperationInventory;
@@ -19,12 +20,15 @@ use crate::indexer::{DefaultMergePolicy, MergePolicy};
use crate::indexer::{MergeCandidate, MergeOperation};
use crate::schema::Schema;
use crate::Opstamp;
use crate::TantivyError;
use fail::fail_point;
use futures::channel::oneshot;
use futures::executor::{ThreadPool, ThreadPoolBuilder};
use futures::future::Future;
use futures::future::TryFutureExt;
use std::borrow::BorrowMut;
use std::collections::HashSet;
use std::io;
use std::io::Write;
use std::ops::Deref;
use std::path::PathBuf;
@@ -74,6 +78,10 @@ fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()>
    let mut buffer = serde_json::to_vec_pretty(metas)?;
    // Just adding a new line at the end of the buffer.
    writeln!(&mut buffer)?;
    fail_point!("save_metas", |msg| Err(TantivyError::from(io::Error::new(
        io::ErrorKind::Other,
        msg.unwrap_or_else(|| "Undefined".to_string())
    ))));
    directory.atomic_write(&META_FILEPATH, &buffer[..])?;
    debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
    Ok(())
@@ -159,9 +167,9 @@ fn merge(
/// meant to work if you have an IndexWriter running for the origin indices, or
/// the destination Index.
#[doc(hidden)]
pub fn merge_segments<Dir: Directory>(
pub fn merge_indices<T: Into<Box<dyn Directory>>>(
    indices: &[Index],
    output_directory: Dir,
    output_directory: T,
) -> crate::Result<Index> {
    if indices.is_empty() {
        // If there are no indices to merge, there is no need to do anything.
@@ -170,19 +178,8 @@ pub fn merge_segments<Dir: Directory>(
        ));
    }

    let target_schema = indices[0].schema();
    let target_settings = indices[0].settings().clone();

    // let's check that all of the indices have the same schema
    if indices
        .iter()
        .skip(1)
        .any(|index| index.schema() != target_schema)
    {
        return Err(crate::TantivyError::InvalidArgument(
            "Attempt to merge different schema indices".to_string(),
        ));
    }
    // let's check that all of the indices have the same index settings
    if indices
        .iter()
@@ -199,13 +196,61 @@ pub fn merge_segments<Dir: Directory>(
        segments.extend(index.searchable_segments()?);
    }

    let mut merged_index = Index::create(output_directory, target_schema.clone(), target_settings)?;
    let non_filter = segments.iter().map(|_| None).collect::<Vec<_>>();
    merge_filtered_segments(&segments, target_settings, non_filter, output_directory)
}
/// Advanced: Merges a list of segments from different indices in a new index.
/// Additionally, you can provide a delete bitset for each segment to ignore doc_ids.
///
/// Returns `TantivyError` if the segments list is empty or their
/// schemas don't match.
///
/// `output_directory`: is assumed to be empty.
///
/// # Warning
/// This function does NOT check whether an `IndexWriter` is running. It is not
/// meant to work if you have an IndexWriter running for the origin indices, or
/// the destination Index.
#[doc(hidden)]
pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
    segments: &[Segment],
    target_settings: IndexSettings,
    filter_doc_ids: Vec<Option<AliveBitSet>>,
    output_directory: T,
) -> crate::Result<Index> {
    if segments.is_empty() {
        // If there are no segments to merge, there is no need to do anything.
        return Err(crate::TantivyError::InvalidArgument(
            "No segments given to merge".to_string(),
        ));
    }

    let target_schema = segments[0].schema();

    // let's check that all of the segments have the same schema
    if segments
        .iter()
        .skip(1)
        .any(|index| index.schema() != target_schema)
    {
        return Err(crate::TantivyError::InvalidArgument(
            "Attempt to merge different schema indices".to_string(),
        ));
    }

    let mut merged_index = Index::create(
        output_directory,
        target_schema.clone(),
        target_settings.clone(),
    )?;
    let merged_segment = merged_index.new_segment();
    let merged_segment_id = merged_segment.id();
    let merger: IndexMerger = IndexMerger::open(
    let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
        merged_index.schema(),
        merged_index.settings().clone(),
        &segments[..],
        segments,
        filter_doc_ids,
    )?;
    let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
    let num_docs = merger.write(segment_serializer)?;
@@ -225,7 +270,7 @@ pub fn merge_segments<Dir: Directory>(
    );

    let index_meta = IndexMeta {
        index_settings: indices[0].load_metas()?.index_settings, // index_settings of all segments should be the same
        index_settings: target_settings, // index_settings of all segments should be the same
        segments: vec![segment_meta],
        schema: target_schema,
        opstamp: 0u64,
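With this change, `merge_indices` becomes a thin wrapper that passes `None` filters to `merge_filtered_segments`. A sketch of how a caller might use the filtered variant, modeled on the tests further down; the API is `#[doc(hidden)]` and the re-export paths are assumptions:

use tantivy::directory::RamDirectory;
use tantivy::fastfield::AliveBitSet;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, merge_filtered_segments, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(text => "keep me"))?;
    writer.add_document(doc!(text => "drop me"))?;
    writer.commit()?;
    // Per the warning above, no IndexWriter may be running during the merge.
    drop(writer);

    // One entry per input segment: mark doc 1 of 2 as deleted.
    let filters = vec![Some(AliveBitSet::for_test_from_deleted_docs(&[1], 2))];
    let segments = index.searchable_segments()?;
    let merged = merge_filtered_segments(
        &segments,
        index.settings().clone(),
        filters,
        RamDirectory::default(),
    )?;
    assert_eq!(merged.searchable_segments()?.len(), 1);
    Ok(())
}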
@@ -306,37 +351,39 @@ impl SegmentUpdater {
        *self.merge_policy.write().unwrap() = arc_merge_policy;
    }

    fn schedule_future<T: 'static + Send, F: Future<Output = crate::Result<T>> + 'static + Send>(
    async fn schedule_task<
        T: 'static + Send,
        F: Future<Output = crate::Result<T>> + 'static + Send,
    >(
        &self,
        f: F,
    ) -> impl Future<Output = crate::Result<T>> {
        let (sender, receiver) = oneshot::channel();
        if self.is_alive() {
            self.pool.spawn_ok(async move {
                let _ = sender.send(f.await);
            });
        } else {
            let _ = sender.send(Err(crate::TantivyError::SystemError(
        task: F,
    ) -> crate::Result<T> {
        if !self.is_alive() {
            return Err(crate::TantivyError::SystemError(
                "Segment updater killed".to_string(),
            )));
            ));
        }
        receiver.unwrap_or_else(|_| {
        let (sender, receiver) = oneshot::channel();
        self.pool.spawn_ok(async move {
            let task_result = task.await;
            let _ = sender.send(task_result);
        });
        let task_result = receiver.await;
        task_result.unwrap_or_else(|_| {
            let err_msg =
                "A segment_updater future did not succeed. This should never happen.".to_string();
            Err(crate::TantivyError::SystemError(err_msg))
        })
    }

    pub fn schedule_add_segment(
        &self,
        segment_entry: SegmentEntry,
    ) -> impl Future<Output = crate::Result<()>> {
    pub async fn schedule_add_segment(&self, segment_entry: SegmentEntry) -> crate::Result<()> {
        let segment_updater = self.clone();
        self.schedule_future(async move {
        self.schedule_task(async move {
            segment_updater.segment_manager.add_segment(segment_entry);
            segment_updater.consider_merge_options().await;
            Ok(())
        })
        .await
    }

    /// Orders `SegmentManager` to remove all segments
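The refactor above replaces "return a future" with "async fn that spawns on the pool and awaits a oneshot receiver in place". A self-contained sketch of that scheduling pattern, using the futures crate with its thread-pool feature; all names here are illustrative:

use futures::channel::oneshot;
use futures::executor::{block_on, ThreadPool};
use std::future::Future;

// Run a future on a pool, hand the result back through a oneshot channel,
// and await it where the call was made.
async fn schedule_task<T, F>(pool: &ThreadPool, task: F) -> Result<T, String>
where
    T: Send + 'static,
    F: Future<Output = T> + Send + 'static,
{
    let (sender, receiver) = oneshot::channel();
    pool.spawn_ok(async move {
        let _ = sender.send(task.await);
    });
    receiver
        .await
        .map_err(|_| "task dropped before completion".to_string())
}

fn main() {
    let pool = ThreadPool::new().expect("failed to build pool");
    let answer = block_on(schedule_task(&pool, async { 21 * 2 }));
    assert_eq!(answer, Ok(42));
}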
@@ -403,11 +450,9 @@ impl SegmentUpdater {
        Ok(())
    }

    pub fn schedule_garbage_collect(
        &self,
    ) -> impl Future<Output = crate::Result<GarbageCollectionResult>> {
    pub async fn schedule_garbage_collect(&self) -> crate::Result<GarbageCollectionResult> {
        let garbage_collect_future = garbage_collect_files(self.clone());
        self.schedule_future(garbage_collect_future)
        self.schedule_task(garbage_collect_future).await
    }

    /// List the files that are useful to the index.
@@ -425,13 +470,13 @@ impl SegmentUpdater {
        files
    }

    pub fn schedule_commit(
    pub(crate) async fn schedule_commit(
        &self,
        opstamp: Opstamp,
        payload: Option<String>,
    ) -> impl Future<Output = crate::Result<()>> {
    ) -> crate::Result<()> {
        let segment_updater: SegmentUpdater = self.clone();
        self.schedule_future(async move {
        self.schedule_task(async move {
            let segment_entries = segment_updater.purge_deletes(opstamp)?;
            segment_updater.segment_manager.commit(segment_entries);
            segment_updater.save_metas(opstamp, payload)?;
@@ -439,6 +484,7 @@ impl SegmentUpdater {
            segment_updater.consider_merge_options().await;
            Ok(())
        })
        .await
    }

    fn store_meta(&self, index_meta: &IndexMeta) {
@@ -513,9 +559,7 @@ impl SegmentUpdater {
                        e
                    );
                    // ... cancel merge
                    if cfg!(test) {
                        panic!("Merge failed.");
                    }
                    assert!(!cfg!(test), "Merge failed.");
                }
            }
        });
@@ -568,14 +612,14 @@ impl SegmentUpdater {
        }
    }

    fn end_merge(
    async fn end_merge(
        &self,
        merge_operation: MergeOperation,
        mut after_merge_segment_entry: SegmentEntry,
    ) -> impl Future<Output = crate::Result<SegmentMeta>> {
    ) -> crate::Result<SegmentMeta> {
        let segment_updater = self.clone();
        let after_merge_segment_meta = after_merge_segment_entry.meta().clone();
        let end_merge_future = self.schedule_future(async move {
        self.schedule_task(async move {
            info!("End merge {:?}", after_merge_segment_entry.meta());
            {
                let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
@@ -594,9 +638,8 @@ impl SegmentUpdater {
                        merge_operation.segment_ids(),
                        advance_deletes_err
                    );
                    if cfg!(test) {
                        panic!("Merge failed.");
                    }
                    assert!(!cfg!(test), "Merge failed.");

                    // ... cancel merge
                    // `merge_operations` are tracked. As it is dropped, the
                    // segment_ids will be available again for merge.
@@ -619,8 +662,9 @@ impl SegmentUpdater {

            let _ = garbage_collect_files(segment_updater).await;
            Ok(())
        });
        end_merge_future.map_ok(|_| after_merge_segment_meta)
        })
        .await?;
        Ok(after_merge_segment_meta)
    }

    /// Wait for current merging threads.
@@ -646,11 +690,19 @@ impl SegmentUpdater {

#[cfg(test)]
mod tests {
    use super::merge_segments;
    use super::merge_indices;
    use crate::collector::TopDocs;
    use crate::directory::RamDirectory;
    use crate::fastfield::AliveBitSet;
    use crate::indexer::merge_policy::tests::MergeWheneverPossible;
    use crate::indexer::merger::IndexMerger;
    use crate::indexer::segment_updater::merge_filtered_segments;
    use crate::query::QueryParser;
    use crate::schema::*;
    use crate::Directory;
    use crate::DocAddress;
    use crate::Index;
    use crate::Segment;

    #[test]
    fn test_delete_during_merge() -> crate::Result<()> {
@@ -663,19 +715,19 @@ mod tests {
        index_writer.set_merge_policy(Box::new(MergeWheneverPossible));

        for _ in 0..100 {
            index_writer.add_document(doc!(text_field=>"a"));
            index_writer.add_document(doc!(text_field=>"b"));
            index_writer.add_document(doc!(text_field=>"a"))?;
            index_writer.add_document(doc!(text_field=>"b"))?;
        }
        index_writer.commit()?;

        for _ in 0..100 {
            index_writer.add_document(doc!(text_field=>"c"));
            index_writer.add_document(doc!(text_field=>"d"));
            index_writer.add_document(doc!(text_field=>"c"))?;
            index_writer.add_document(doc!(text_field=>"d"))?;
        }
        index_writer.commit()?;

        index_writer.add_document(doc!(text_field=>"e"));
        index_writer.add_document(doc!(text_field=>"f"));
        index_writer.add_document(doc!(text_field=>"e"))?;
        index_writer.add_document(doc!(text_field=>"f"))?;
        index_writer.commit()?;

        let term = Term::from_field_text(text_field, "a");
@@ -693,6 +745,50 @@ mod tests {
        Ok(())
    }

    #[test]
    fn delete_all_docs_min() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());

        // writing the segment
        let mut index_writer = index.writer_for_tests()?;

        for _ in 0..10 {
            index_writer.add_document(doc!(text_field=>"a"))?;
            index_writer.add_document(doc!(text_field=>"b"))?;
        }
        index_writer.commit()?;

        let seg_ids = index.searchable_segment_ids()?;
        // docs exist, should have at least 1 segment
        assert!(!seg_ids.is_empty());

        let term = Term::from_field_text(text_field, "a");
        index_writer.delete_term(term);
        index_writer.commit()?;

        let term = Term::from_field_text(text_field, "b");
        index_writer.delete_term(term);
        index_writer.commit()?;

        index_writer.wait_merging_threads()?;

        let reader = index.reader()?;
        assert_eq!(reader.searcher().num_docs(), 0);

        let seg_ids = index.searchable_segment_ids()?;
        assert!(seg_ids.is_empty());

        reader.reload()?;
        assert_eq!(reader.searcher().num_docs(), 0);
        // empty segments should be erased
        assert!(index.searchable_segment_metas()?.is_empty());
        assert!(reader.searcher().segment_readers().is_empty());

        Ok(())
    }

    #[test]
    fn delete_all_docs() -> crate::Result<()> {
        let mut schema_builder = Schema::builder();
@@ -703,19 +799,19 @@ mod tests {
        let mut index_writer = index.writer_for_tests()?;

        for _ in 0..100 {
            index_writer.add_document(doc!(text_field=>"a"));
            index_writer.add_document(doc!(text_field=>"b"));
            index_writer.add_document(doc!(text_field=>"a"))?;
            index_writer.add_document(doc!(text_field=>"b"))?;
        }
        index_writer.commit()?;

        for _ in 0..100 {
            index_writer.add_document(doc!(text_field=>"c"));
            index_writer.add_document(doc!(text_field=>"d"));
            index_writer.add_document(doc!(text_field=>"c"))?;
            index_writer.add_document(doc!(text_field=>"d"))?;
        }
        index_writer.commit()?;

        index_writer.add_document(doc!(text_field=>"e"));
        index_writer.add_document(doc!(text_field=>"f"));
        index_writer.add_document(doc!(text_field=>"e"))?;
        index_writer.add_document(doc!(text_field=>"f"))?;
        index_writer.commit()?;

        let seg_ids = index.searchable_segment_ids()?;
@@ -755,8 +851,8 @@ mod tests {
        // writing the segment
        let mut index_writer = index.writer_for_tests()?;
        for _ in 0..100 {
            index_writer.add_document(doc!(text_field=>"a"));
            index_writer.add_document(doc!(text_field=>"b"));
            index_writer.add_document(doc!(text_field=>"a"))?;
            index_writer.add_document(doc!(text_field=>"b"))?;
        }
        index_writer.commit()?;

@@ -782,22 +878,22 @@ mod tests {
        // writing two segments
        let mut index_writer = index.writer_for_tests()?;
        for _ in 0..100 {
            index_writer.add_document(doc!(text_field=>"fizz"));
            index_writer.add_document(doc!(text_field=>"buzz"));
            index_writer.add_document(doc!(text_field=>"fizz"))?;
            index_writer.add_document(doc!(text_field=>"buzz"))?;
        }
        index_writer.commit()?;

        for _ in 0..1000 {
            index_writer.add_document(doc!(text_field=>"foo"));
            index_writer.add_document(doc!(text_field=>"bar"));
            index_writer.add_document(doc!(text_field=>"foo"))?;
            index_writer.add_document(doc!(text_field=>"bar"))?;
        }
        index_writer.commit()?;
        indices.push(index);
    }

    assert_eq!(indices.len(), 3);
    let output_directory = RamDirectory::default();
    let index = merge_segments(&indices, output_directory)?;
    let output_directory: Box<dyn Directory> = Box::new(RamDirectory::default());
    let index = merge_indices(&indices, output_directory)?;
    assert_eq!(index.schema(), schema);

    let segments = index.searchable_segments()?;
@@ -811,7 +907,7 @@ mod tests {

    #[test]
    fn test_merge_empty_indices_array() {
        let merge_result = merge_segments(&[], RamDirectory::default());
        let merge_result = merge_indices(&[], RamDirectory::default());
        assert!(merge_result.is_err());
    }

@@ -822,7 +918,7 @@ mod tests {
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(text_field=>"some text"));
        index_writer.add_document(doc!(text_field=>"some text"))?;
        index_writer.commit()?;
        index
    };
@@ -832,15 +928,197 @@ mod tests {
        let body_field = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(body_field=>"some body"));
        index_writer.add_document(doc!(body_field=>"some body"))?;
        index_writer.commit()?;
        index
    };

    // mismatched schema index list
    let result = merge_segments(&[first_index, second_index], RamDirectory::default());
    let result = merge_indices(&[first_index, second_index], RamDirectory::default());
    assert!(result.is_err());

    Ok(())
}

#[test]
fn test_merge_filtered_segments() -> crate::Result<()> {
    let first_index = {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(text_field=>"some text 1"))?;
        index_writer.add_document(doc!(text_field=>"some text 2"))?;
        index_writer.commit()?;
        index
    };

    let second_index = {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(text_field=>"some text 3"))?;
        index_writer.add_document(doc!(text_field=>"some text 4"))?;
        index_writer.delete_term(Term::from_field_text(text_field, "4"));

        index_writer.commit()?;
        index
    };

    let mut segments: Vec<Segment> = Vec::new();
    segments.extend(first_index.searchable_segments()?);
    segments.extend(second_index.searchable_segments()?);

    let target_settings = first_index.settings().clone();

    let filter_segment_1 = AliveBitSet::for_test_from_deleted_docs(&[1], 2);
    let filter_segment_2 = AliveBitSet::for_test_from_deleted_docs(&[0], 2);

    let filter_segments = vec![Some(filter_segment_1), Some(filter_segment_2)];

    let merged_index = merge_filtered_segments(
        &segments,
        target_settings,
        filter_segments,
        RamDirectory::default(),
    )?;

    let segments = merged_index.searchable_segments()?;
    assert_eq!(segments.len(), 1);

    let segment_metas = segments[0].meta();
    assert_eq!(segment_metas.num_deleted_docs(), 0);
    assert_eq!(segment_metas.num_docs(), 1);

    Ok(())
}

#[test]
fn test_merge_single_filtered_segments() -> crate::Result<()> {
    let first_index = {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(text_field=>"test text"))?;
        index_writer.add_document(doc!(text_field=>"some text 2"))?;

        index_writer.add_document(doc!(text_field=>"some text 3"))?;
        index_writer.add_document(doc!(text_field=>"some text 4"))?;

        index_writer.delete_term(Term::from_field_text(text_field, "4"));

        index_writer.commit()?;
        index
    };

    let mut segments: Vec<Segment> = Vec::new();
    segments.extend(first_index.searchable_segments()?);

    let target_settings = first_index.settings().clone();

    let filter_segment = AliveBitSet::for_test_from_deleted_docs(&[0], 4);

    let filter_segments = vec![Some(filter_segment)];

    let index = merge_filtered_segments(
        &segments,
        target_settings,
        filter_segments,
        RamDirectory::default(),
    )?;

    let segments = index.searchable_segments()?;
    assert_eq!(segments.len(), 1);

    let segment_metas = segments[0].meta();
    assert_eq!(segment_metas.num_deleted_docs(), 0);
    assert_eq!(segment_metas.num_docs(), 2);

    let searcher = index.reader()?.searcher();
    {
        let text_field = index.schema().get_field("text").unwrap();

        let do_search = |term: &str| {
            let query = QueryParser::for_index(&index, vec![text_field])
                .parse_query(term)
                .unwrap();
            let top_docs: Vec<(f32, DocAddress)> =
                searcher.search(&query, &TopDocs::with_limit(3)).unwrap();

            top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
        };

        assert_eq!(do_search("test"), vec![] as Vec<u32>);
        assert_eq!(do_search("text"), vec![0, 1]);
    }

    Ok(())
}

#[test]
fn test_apply_doc_id_filter_in_merger() -> crate::Result<()> {
    let first_index = {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(text_field=>"some text 1"))?;
        index_writer.add_document(doc!(text_field=>"some text 2"))?;

        index_writer.add_document(doc!(text_field=>"some text 3"))?;
        index_writer.add_document(doc!(text_field=>"some text 4"))?;

        index_writer.delete_term(Term::from_field_text(text_field, "4"));

        index_writer.commit()?;
        index
    };

    let mut segments: Vec<Segment> = Vec::new();
    segments.extend(first_index.searchable_segments()?);

    let target_settings = first_index.settings().clone();
    {
        let filter_segment = AliveBitSet::for_test_from_deleted_docs(&[1], 4);
        let filter_segments = vec![Some(filter_segment)];
        let target_schema = segments[0].schema();
        let merged_index = Index::create(
            RamDirectory::default(),
            target_schema.clone(),
            target_settings.clone(),
        )?;
        let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
            merged_index.schema(),
            merged_index.settings().clone(),
            &segments[..],
            filter_segments,
        )?;

        let doc_ids_alive: Vec<_> = merger.readers[0].doc_ids_alive().collect();
        assert_eq!(doc_ids_alive, vec![0, 2]);
    }

    {
        let filter_segments = vec![None];
        let target_schema = segments[0].schema();
        let merged_index = Index::create(
            RamDirectory::default(),
            target_schema.clone(),
            target_settings.clone(),
        )?;
        let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
            merged_index.schema(),
            merged_index.settings().clone(),
            &segments[..],
            filter_segments,
        )?;

        let doc_ids_alive: Vec<_> = merger.readers[0].doc_ids_alive().collect();
        assert_eq!(doc_ids_alive, vec![0, 1, 2]);
    }

    Ok(())
}
}
149 src/lib.rs
@@ -10,6 +10,7 @@
)]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
#![allow(clippy::len_without_is_empty)]

//! # `tantivy`
//!
@@ -62,7 +63,7 @@
//!     body => "He was an old man who fished alone in a skiff in \
//!             the Gulf Stream and he had gone eighty-four days \
//!             now without taking a fish."
//! ));
//! ))?;
//!
//! // We need to call .commit() explicitly to force the
//! // index_writer to finish processing the documents in the queue,
@@ -103,7 +104,7 @@
//! A good place for you to get started is to check out
//! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/tantivy-search/tantivy/blob/main/examples/basic_search.rs))
//! [source code](https://github.com/quickwit-inc/tantivy/blob/main/examples/basic_search.rs))

#[cfg_attr(test, macro_use)]
extern crate serde_json;
@@ -135,7 +136,6 @@ pub type Result<T> = std::result::Result<T, TantivyError>;
/// Tantivy DateTime
pub type DateTime = chrono::DateTime<chrono::Utc>;

mod common;
mod core;
mod indexer;

@@ -163,8 +163,6 @@ pub use self::snippet::{Snippet, SnippetGenerator};

mod docset;
pub use self::docset::{DocSet, TERMINATED};
pub use crate::common::HasLen;
pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{
    Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Segment,
@@ -172,12 +170,16 @@ pub use crate::core::{
};
pub use crate::core::{InvertedIndexReader, SegmentReader};
pub use crate::directory::Directory;
pub use crate::indexer::merge_segments;
pub use crate::indexer::demuxer::*;
pub use crate::indexer::merge_filtered_segments;
pub use crate::indexer::merge_indices;
pub use crate::indexer::operation::UserOperation;
pub use crate::indexer::IndexWriter;
pub use crate::indexer::{IndexWriter, PreparedCommit};
pub use crate::postings::Postings;
pub use crate::reader::LeasedItem;
pub use crate::schema::{Document, Term};
pub use common::HasLen;
pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use std::fmt;

use once_cell::sync::Lazy;
@@ -293,7 +295,7 @@ pub struct DocAddress {
}

#[cfg(test)]
mod tests {
pub mod tests {
    use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
    use crate::core::SegmentReader;
    use crate::docset::{DocSet, TERMINATED};
@@ -304,11 +306,18 @@ mod tests {
    use crate::Index;
    use crate::Postings;
    use crate::ReloadPolicy;
    use common::{BinarySerializable, FixedSize};
    use rand::distributions::Bernoulli;
    use rand::distributions::Uniform;
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};

    pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
        let mut buffer = Vec::new();
        O::default().serialize(&mut buffer).unwrap();
        assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
    }

    /// Checks if left and right are close to each other.
    /// Panics if the two values are more than 0.5% apart.
    #[macro_export]
@@ -370,24 +379,22 @@ mod tests {
        let mut schema_builder = Schema::builder();
        let text_field = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();
        let index = Index::create_from_tempdir(schema).unwrap();
        let index = Index::create_from_tempdir(schema)?;
        // writing the segment
        let mut index_writer = index.writer_for_tests()?;
        {
            // writing the segment
            let mut index_writer = index.writer_for_tests()?;
            {
                let doc = doc!(text_field=>"af b");
                index_writer.add_document(doc);
            }
            {
                let doc = doc!(text_field=>"a b c");
                index_writer.add_document(doc);
            }
            {
                let doc = doc!(text_field=>"a b c d");
                index_writer.add_document(doc);
            }
            assert!(index_writer.commit().is_ok());
            let doc = doc!(text_field=>"af b");
            index_writer.add_document(doc)?;
        }
        {
            let doc = doc!(text_field=>"a b c");
            index_writer.add_document(doc)?;
        }
        {
            let doc = doc!(text_field=>"a b c d");
            index_writer.add_document(doc)?;
        }
        index_writer.commit()?;
        Ok(())
    }

@@ -397,12 +404,12 @@ mod tests {
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(text_field=>"a b c"));
        index_writer.add_document(doc!(text_field=>"a b c"))?;
        index_writer.commit()?;
        index_writer.add_document(doc!(text_field=>"a"));
        index_writer.add_document(doc!(text_field=>"a a"));
        index_writer.add_document(doc!(text_field=>"a"))?;
        index_writer.add_document(doc!(text_field=>"a a"))?;
        index_writer.commit()?;
        index_writer.add_document(doc!(text_field=>"c"));
        index_writer.add_document(doc!(text_field=>"c"))?;
        index_writer.commit()?;
        let reader = index.reader()?;
        let searcher = reader.searcher();
@@ -424,7 +431,7 @@ mod tests {
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(text_field=>"a b c"));
        index_writer.add_document(doc!(text_field=>"a b c"))?;
        index_writer.commit()?;
        let index_reader = index.reader()?;
        let searcher = index_reader.searcher();
@@ -446,9 +453,9 @@ mod tests {
        let text_field = schema_builder.add_text_field("text", TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(text_field=>"a b c"));
        index_writer.add_document(doc!());
        index_writer.add_document(doc!(text_field=>"a b"));
        index_writer.add_document(doc!(text_field=>"a b c"))?;
        index_writer.add_document(doc!())?;
        index_writer.add_document(doc!(text_field=>"a b"))?;
        index_writer.commit()?;
        let reader = index.reader()?;
        let searcher = reader.searcher();
@@ -490,20 +497,20 @@ mod tests {
        // writing the segment
        let mut index_writer = index.writer_for_tests()?;
        // 0
        index_writer.add_document(doc!(text_field=>"a b"));
        index_writer.add_document(doc!(text_field=>"a b"))?;
        // 1
        index_writer.add_document(doc!(text_field=>" a c"));
        index_writer.add_document(doc!(text_field=>" a c"))?;
        // 2
        index_writer.add_document(doc!(text_field=>" b c"));
        index_writer.add_document(doc!(text_field=>" b c"))?;
        // 3
        index_writer.add_document(doc!(text_field=>" b d"));
        index_writer.add_document(doc!(text_field=>" b d"))?;

        index_writer.delete_term(Term::from_field_text(text_field, "c"));
        index_writer.delete_term(Term::from_field_text(text_field, "a"));
        // 4
        index_writer.add_document(doc!(text_field=>" b c"));
        index_writer.add_document(doc!(text_field=>" b c"))?;
        // 5
        index_writer.add_document(doc!(text_field=>" a"));
        index_writer.add_document(doc!(text_field=>" a"))?;
        index_writer.commit()?;
    }
    {
@@ -537,7 +544,7 @@ mod tests {
        // writing the segment
        let mut index_writer = index.writer_for_tests()?;
        // 0
        index_writer.add_document(doc!(text_field=>"a b"));
        index_writer.add_document(doc!(text_field=>"a b"))?;
        // 1
        index_writer.delete_term(Term::from_field_text(text_field, "c"));
        index_writer.rollback()?;
@@ -573,7 +580,7 @@ mod tests {
    {
        // writing the segment
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(text_field=>"a b"));
        index_writer.add_document(doc!(text_field=>"a b"))?;
        index_writer.delete_term(Term::from_field_text(text_field, "c"));
        index_writer.rollback()?;
        index_writer.delete_term(Term::from_field_text(text_field, "a"));
@@ -623,7 +630,7 @@ mod tests {

    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_for_tests()?;
    index_writer.add_document(doc!(field=>1u64));
    index_writer.add_document(doc!(field=>1u64))?;
    index_writer.commit()?;
    let reader = index.reader()?;
    let searcher = reader.searcher();
@@ -647,7 +654,7 @@ mod tests {
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_for_tests()?;
    let negative_val = -1i64;
    index_writer.add_document(doc!(value_field => negative_val));
    index_writer.add_document(doc!(value_field => negative_val))?;
    index_writer.commit()?;
    let reader = index.reader()?;
    let searcher = reader.searcher();
@@ -671,7 +678,7 @@ mod tests {
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_for_tests()?;
    let val = std::f64::consts::PI;
    index_writer.add_document(doc!(value_field => val));
    index_writer.add_document(doc!(value_field => val))?;
    index_writer.commit()?;
    let reader = index.reader()?;
    let searcher = reader.searcher();
@@ -694,7 +701,7 @@ mod tests {
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_for_tests()?;
    index_writer.add_document(doc!(text_field=>"a"));
    index_writer.add_document(doc!(text_field=>"a"))?;
    assert!(index_writer.commit().is_ok());
    let reader = index.reader()?;
    let searcher = reader.searcher();
@@ -717,14 +724,14 @@ mod tests {

    // writing the segment
    let mut index_writer = index.writer_for_tests()?;
    index_writer.add_document(doc!(text_field=>"63"));
    index_writer.add_document(doc!(text_field=>"70"));
    index_writer.add_document(doc!(text_field=>"34"));
    index_writer.add_document(doc!(text_field=>"1"));
    index_writer.add_document(doc!(text_field=>"38"));
    index_writer.add_document(doc!(text_field=>"33"));
    index_writer.add_document(doc!(text_field=>"40"));
    index_writer.add_document(doc!(text_field=>"17"));
    index_writer.add_document(doc!(text_field=>"63"))?;
    index_writer.add_document(doc!(text_field=>"70"))?;
    index_writer.add_document(doc!(text_field=>"34"))?;
    index_writer.add_document(doc!(text_field=>"1"))?;
    index_writer.add_document(doc!(text_field=>"38"))?;
    index_writer.add_document(doc!(text_field=>"33"))?;
    index_writer.add_document(doc!(text_field=>"40"))?;
    index_writer.add_document(doc!(text_field=>"17"))?;
    index_writer.delete_term(Term::from_field_text(text_field, "38"));
    index_writer.delete_term(Term::from_field_text(text_field, "34"));
    index_writer.commit()?;
@@ -742,7 +749,7 @@ mod tests {
    {
        // writing the segment
        let mut index_writer = index.writer_for_tests()?;
        index_writer.add_document(doc!(text_field=>"af af af bc bc"));
        index_writer.add_document(doc!(text_field=>"af af af bc bc"))?;
        index_writer.commit()?;
    }
    {
@@ -774,9 +781,9 @@ mod tests {
    let reader = index.reader()?;
    // writing the segment
    let mut index_writer = index.writer_for_tests()?;
    index_writer.add_document(doc!(text_field=>"af af af b"));
    index_writer.add_document(doc!(text_field=>"a b c"));
    index_writer.add_document(doc!(text_field=>"a b c d"));
    index_writer.add_document(doc!(text_field=>"af af af b"))?;
    index_writer.add_document(doc!(text_field=>"a b c"))?;
    index_writer.add_document(doc!(text_field=>"a b c d"))?;
    index_writer.commit()?;

    reader.reload()?;
@@ -838,9 +845,9 @@ mod tests {
    assert_eq!(reader.searcher().num_docs(), 0u64);
    // writing the segment
    let mut index_writer = index.writer_for_tests()?;
    index_writer.add_document(doc!(text_field=>"af b"));
    index_writer.add_document(doc!(text_field=>"a b c"));
    index_writer.add_document(doc!(text_field=>"a b c d"));
    index_writer.add_document(doc!(text_field=>"af b"))?;
    index_writer.add_document(doc!(text_field=>"a b c"))?;
    index_writer.add_document(doc!(text_field=>"a b c d"))?;
    index_writer.commit()?;
    reader.reload()?;
    assert_eq!(reader.searcher().num_docs(), 3u64);
@@ -880,7 +887,7 @@ mod tests {
    {
        let document =
            doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
        index_writer.add_document(document);
        index_writer.add_document(document)?;
        index_writer.commit()?;
    }
    let reader = index.reader()?;
@@ -947,7 +954,7 @@ mod tests {
    index_writer.set_merge_policy(Box::new(NoMergePolicy));

    for doc_id in 0u64..DOC_COUNT {
        index_writer.add_document(doc!(id => doc_id));
        index_writer.add_document(doc!(id => doc_id))?;
    }
    index_writer.commit()?;

@@ -964,7 +971,7 @@ mod tests {
    index_writer.delete_term(Term::from_field_u64(id, doc_id));
    index_writer.commit()?;
    index_reader.reload()?;
    index_writer.add_document(doc!(id => doc_id));
    index_writer.add_document(doc!(id => doc_id))?;
    index_writer.commit()?;
    index_reader.reload()?;
    let searcher = index_reader.searcher();
@@ -993,8 +1000,24 @@ mod tests {
    #[test]
    fn test_validate_checksum() -> crate::Result<()> {
        let index_path = tempfile::tempdir().expect("dir");
        let schema = Schema::builder().build();
        let mut builder = Schema::builder();
        let body = builder.add_text_field("body", TEXT | STORED);
        let schema = builder.build();
        let index = Index::create_in_dir(&index_path, schema)?;
        let mut writer = index.writer(50_000_000)?;
        for _ in 0..5000 {
            writer.add_document(doc!(body => "foo"))?;
            writer.add_document(doc!(body => "boo"))?;
        }
        writer.commit()?;
        assert!(index.validate_checksum()?.is_empty());

        // delete a few docs
        writer.delete_term(Term::from_field_text(body, "foo"));
        writer.commit()?;
        let segment_ids = index.searchable_segment_ids()?;
        let _ = futures::executor::block_on(writer.merge(&segment_ids));
|
||||
|
||||
assert!(index.validate_checksum()?.is_empty());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
use std::io;

use crate::common::{BinarySerializable, VInt};
use crate::directory::OwnedBytes;
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::{BlockDecoder, VIntDecoder};
use common::{BinarySerializable, VInt};

/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
/// This means we need to skip to the `nth` position efficiently.

@@ -1,7 +1,7 @@
use crate::common::{BinarySerializable, CountingWriter, VInt};
use crate::positions::COMPRESSION_BLOCK_SIZE;
use crate::postings::compression::BlockEncoder;
use crate::postings::compression::VIntEncoder;
use common::{BinarySerializable, CountingWriter, VInt};
use std::io::{self, Write};

/// The PositionSerializer is in charge of serializing all of the positions

@@ -1,241 +1,98 @@
use std::ops::Range;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;

use crate::postings::compression::AlignedBuffer;

/// This module defines the logic used to search for a doc in a given
/// block. (at most 128 docs)
/// Search the first index containing an element greater or equal to
/// the target.
///
/// Searching within a block is a hotspot when running intersections,
/// so it was worth defining it in its own module.

#[cfg(target_arch = "x86_64")]
mod sse2 {
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
use std::arch::x86_64::__m128i as DataType;
use std::arch::x86_64::_mm_add_epi32 as op_add;
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bit alignment
use std::arch::x86_64::_mm_set1_epi32 as set1;
use std::arch::x86_64::_mm_setzero_si128 as set0;
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};

const MASK1: i32 = 78;
const MASK2: i32 = 177;

/// Performs an exhaustive linear search over the block.
///
/// There is no early exit here. We simply count the
/// number of elements that are `< target`.
pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
unsafe {
let ptr = arr as *const AlignedBuffer as *const DataType;
let vkey = set1(target as i32);
let mut cnt = set0();
// We work over 4 `__m128i` at a time.
// A single `__m128i` actually contains 4 `u32`.
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
cnt = op_sub(cnt, sum);
}
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
_mm_cvtsi128_si32(cnt) as usize
}
}

#[cfg(test)]
mod test {
use super::linear_search_sse2_128;
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};

#[test]
fn test_linear_search_sse2_128_u32() {
let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
for el in 0u32..128u32 {
block[el as usize] = (el * 2 + 1) << 18;
}
let target = block[64] + 1;
assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
}
}
}

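For readers skimming the SIMD above: `_mm_cmplt_epi32` sets a lane to -1 (all bits) when its element is `< target`, so subtracting the comparison masks accumulates per-lane counts, and the two `_mm_shuffle_epi32` calls (masks 78 = 0b01001110 and 177 = 0b10110001) perform a horizontal sum of the four lanes. A minimal scalar sketch of the same computation — the helper name is illustrative, not part of this diff:

/// Scalar model of `linear_search_sse2_128`: count how many of the 128
/// block values are strictly smaller than `target`. Because the block is
/// sorted, this count is exactly the index of the first element >= target.
fn linear_search_scalar_128(block: &[u32; 128], target: u32) -> usize {
    block.iter().filter(|&&el| el < target).count()
}

fn main() {
    let mut block = [0u32; 128];
    for i in 0..128 {
        block[i] = i as u32 * 2 + 1; // sorted odd numbers 1, 3, 5, ...
    }
    // The first element >= 10 is 11, sitting at index 5.
    assert_eq!(linear_search_scalar_128(&block, 10), 5);
}
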
/// This `linear search` browses exhaustively through the array: an early
/// exit would be possible, but the branch is very difficult to predict.
/// The results should be equivalent to
/// ```compile_fail
/// block[..]
//  .iter()
//  .take_while(|&&val| val < target)
//  .count()
/// ```
///
/// Coupled with `exponential search` this function is likely
/// to be called with the same `len`.
fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}

fn exponential_search(arr: &[u32], target: u32) -> Range<usize> {
let end = arr.len();
let mut begin = 0;
for &pivot in &[1, 3, 7, 15, 31, 63] {
if pivot >= end {
break;
/// The `start` argument is just used to hint that the response is
/// at or beyond `start`. The implementation may or may not use
/// it for optimization.
///
/// # Assumption
///
/// - The block is sorted. Some elements may appear several times. This is the case at the
///   end of the last block for instance.
/// - The target is assumed smaller or equal to the last element of the block.
pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
let mut start = 0;
let mut len = arr.len();
for _ in 0..7 {
len /= 2;
let pivot = unsafe { *arr.get_unchecked(start + len - 1) };
if pivot < target {
start += len;
}
if arr[pivot] > target {
return begin..pivot;
}
begin = pivot;
}
begin..end
}

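Since the diff interleaves the removed `exponential_search` with the new `branchless_binary_search`, here is the new routine in isolation, with the block constants inlined so the sketch compiles on its own (128 and `u32::MAX` are stand-ins for `COMPRESSION_BLOCK_SIZE` and `TERMINATED`):

const BLOCK_SIZE: usize = 128; // stand-in for COMPRESSION_BLOCK_SIZE
const TERMINATED_PAD: u32 = u32::MAX; // stand-in for TERMINATED padding

/// Index of the first element >= target, assuming `arr` is sorted and
/// `target` is <= the last element. Seven halvings cover 2^7 = 128 slots,
/// and the `if` typically compiles to a conditional move, not a branch.
fn branchless_binary_search(arr: &[u32; BLOCK_SIZE], target: u32) -> usize {
    let mut start = 0;
    let mut len = BLOCK_SIZE;
    for _ in 0..7 {
        len /= 2;
        let pivot = arr[start + len - 1];
        if pivot < target {
            start += len;
        }
    }
    start
}

fn main() {
    // A partial block of five docs, padded like tantivy's last block.
    let mut block = [TERMINATED_PAD; BLOCK_SIZE];
    block[..5].copy_from_slice(&[3, 7, 7, 20, 41]);
    assert_eq!(branchless_binary_search(&block, 7), 1); // first of the two 7s
    assert_eq!(branchless_binary_search(&block, 21), 4); // 41
    assert_eq!(branchless_binary_search(&block, 42), 5); // lands on the padding
}
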
#[inline(never)]
fn galloping(block_docs: &[u32], target: u32) -> usize {
let range = exponential_search(block_docs, target);
range.start + linear_search(&block_docs[range], target)
}

/// Tantivy may rely on SIMD instructions to search for a specific document within
/// a given block.
#[derive(Clone, Copy, PartialEq)]
pub enum BlockSearcher {
#[cfg(target_arch = "x86_64")]
Sse2,
Scalar,
}

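As a concrete walk-through of the scalar path: with `block_docs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]` and `target = 7`, `exponential_search` probes indices 1, 3, 7 (values 2, 4, 8), stops at the first value `> 7`, and returns the range `3..7`; `linear_search` then counts the three elements smaller than 7 in `[4, 5, 6, 7]`, so `galloping` yields `3 + 3 = 6`, the index of the 7 — matching the `3..7` assertion in `test_exponentiel_search` below.
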
impl BlockSearcher {
/// Search the first index containing an element greater or equal to
/// the target.
///
/// The results should be equivalent to
/// ```compile_fail
/// block[..]
//  .iter()
//  .take_while(|&&val| val < target)
//  .count()
/// ```
///
/// The `start` argument is just used to hint that the response is
/// at or beyond `start`. The implementation may or may not use
/// it for optimization.
///
/// # Assumption
///
/// The array len is > start.
/// The block is sorted.
/// The target is assumed greater or equal to the `arr[start]`.
/// The target is assumed smaller or equal to the last element of the block.
///
/// Currently the scalar implementation starts with an exponential search, and
/// then runs a linear search in the resulting subarray.
///
/// If SSE2 instructions are available for the `(platform, running CPU)`,
/// then we use a different implementation that does an exhaustive linear search over
/// the block regardless of whether the block is full or not.
///
/// Indeed, if the block is not full, the remaining items are TERMINATED.
/// It is surprisingly faster, most likely because of the lack of branch misprediction.
pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize {
#[cfg(target_arch = "x86_64")]
{
if self == BlockSearcher::Sse2 {
return sse2::linear_search_sse2_128(block_docs, target);
}
}
galloping(&block_docs.0[..], target)
}
}

impl Default for BlockSearcher {
fn default() -> BlockSearcher {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("sse2") {
return BlockSearcher::Sse2;
}
}
BlockSearcher::Scalar
}
start
}

#[cfg(test)]
mod tests {
use super::exponential_search;
use super::linear_search;
use super::BlockSearcher;
use super::branchless_binary_search;
use crate::docset::TERMINATED;
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};

#[test]
fn test_linear_search() {
let len: usize = 50;
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
for target in 1..*arr.last().unwrap() {
let res = linear_search(&arr[..], target);
if res > 0 {
assert!(arr[res - 1] < target);
}
if res < len {
assert!(arr[res] >= target);
}
}
}

#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(&[1, 2], 0), 0..1);
assert_eq!(exponential_search(&[1, 2], 1), 0..1);
assert_eq!(
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
3..7
);
}

fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
assert_eq!(
block_searcher.search_in_block(&AlignedBuffer(output_buffer), target),
cursor
);
}

fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
use std::collections::HashSet;
let mut targets = HashSet::new();
for (i, val) in block.iter().cloned().enumerate() {
if i > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_in_block(block_searcher, block, target);
}
}
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use proptest::prelude::*;
use std::collections::HashSet;

fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
block.iter().take_while(|&&val| val < target).count()
}

fn test_search_in_block_util(block_searcher: BlockSearcher) {
for len in 1u32..128u32 {
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
util_test_search_in_block_all(block_searcher, &v[..]);
fn util_test_search_in_block(block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
assert!(cursor < COMPRESSION_BLOCK_SIZE);
assert!(block[cursor] >= target);
if cursor > 0 {
assert!(block[cursor - 1] < target);
}
assert_eq!(block.len(), COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
assert_eq!(branchless_binary_search(&output_buffer, target), cursor);
}

fn util_test_search_in_block_all(block: &[u32]) {
let mut targets = HashSet::new();
targets.insert(0);
for &val in block {
if val > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_in_block(block, target);
}
}

|
||||
fn test_search_in_block_scalar() {
|
||||
test_search_in_block_util(BlockSearcher::Scalar);
|
||||
fn test_search_in_branchless_binary_search() {
|
||||
let v: Vec<u32> = (0..COMPRESSION_BLOCK_SIZE).map(|i| i as u32 * 2).collect();
|
||||
util_test_search_in_block_all(&v[..]);
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[test]
|
||||
fn test_search_in_block_sse2() {
|
||||
test_search_in_block_util(BlockSearcher::Sse2);
|
||||
fn monotonous_block() -> impl Strategy<Value = Vec<u32>> {
|
||||
prop::collection::vec(0u32..5u32, COMPRESSION_BLOCK_SIZE).prop_map(|mut deltas| {
|
||||
let mut el = 0;
|
||||
for i in 0..COMPRESSION_BLOCK_SIZE {
|
||||
el += deltas[i];
|
||||
deltas[i] = el;
|
||||
}
|
||||
deltas
|
||||
})
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_proptest_branchless_binary_search(block in monotonous_block()) {
|
||||
util_test_search_in_block_all(&block[..]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,14 @@
|
||||
use std::io;
|
||||
|
||||
use crate::common::{BinarySerializable, VInt};
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::compression::{
|
||||
AlignedBuffer, BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE,
|
||||
};
|
||||
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||
use crate::postings::{BlockInfo, FreqReadingOption, SkipReader};
|
||||
use crate::query::Bm25Weight;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{DocId, Score, TERMINATED};
|
||||
use common::{BinarySerializable, VInt};
|
||||
|
||||
fn max_score<I: Iterator<Item = Score>>(mut it: I) -> Option<Score> {
|
||||
it.next().map(|first| it.fold(first, Score::max))
|
||||
@@ -209,9 +207,9 @@ impl BlockSegmentPostings {
|
||||
///
|
||||
/// This method is useful to run SSE2 linear search.
|
||||
#[inline]
|
||||
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
|
||||
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
|
||||
debug_assert!(self.block_is_loaded());
|
||||
self.doc_decoder.output_aligned()
|
||||
self.doc_decoder.full_output()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
@@ -349,7 +347,6 @@ impl BlockSegmentPostings {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::BlockSegmentPostings;
|
||||
use crate::common::HasLen;
|
||||
use crate::core::Index;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
@@ -360,6 +357,7 @@ mod tests {
|
||||
use crate::schema::Term;
|
||||
use crate::schema::INDEXED;
|
||||
use crate::DocId;
|
||||
use common::HasLen;
|
||||
|
||||
#[test]
|
||||
fn test_empty_segment_postings() {
|
||||
@@ -395,8 +393,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings() {
|
||||
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
|
||||
fn test_block_segment_postings() -> crate::Result<()> {
|
||||
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>())?;
|
||||
let mut offset: u32 = 0u32;
|
||||
// checking that the `doc_freq` is correct
|
||||
assert_eq!(block_segments.doc_freq(), 100_000);
|
||||
@@ -411,16 +409,17 @@ mod tests {
|
||||
offset += block.len() as u32;
|
||||
block_segments.advance();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_skip_right_at_new_block() {
|
||||
fn test_skip_right_at_new_block() -> crate::Result<()> {
|
||||
let mut doc_ids = (0..128).collect::<Vec<u32>>();
|
||||
// 128 is missing
|
||||
doc_ids.push(129);
|
||||
doc_ids.push(130);
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let block_segments = build_block_postings(&doc_ids)?;
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.seek(128), 129);
|
||||
assert_eq!(docset.doc(), 129);
|
||||
@@ -429,7 +428,7 @@ mod tests {
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let block_segments = build_block_postings(&doc_ids).unwrap();
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.seek(129), 129);
|
||||
assert_eq!(docset.doc(), 129);
|
||||
@@ -438,46 +437,47 @@ mod tests {
|
||||
assert_eq!(docset.advance(), TERMINATED);
|
||||
}
|
||||
{
|
||||
let block_segments = build_block_postings(&doc_ids);
|
||||
let block_segments = build_block_postings(&doc_ids)?;
|
||||
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
|
||||
assert_eq!(docset.doc(), 0);
|
||||
assert_eq!(docset.seek(131), TERMINATED);
|
||||
assert_eq!(docset.doc(), TERMINATED);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
|
||||
fn build_block_postings(docs: &[DocId]) -> crate::Result<BlockSegmentPostings> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let mut last_doc = 0u32;
|
||||
for &doc in docs {
|
||||
for _ in last_doc..doc {
|
||||
index_writer.add_document(doc!(int_field=>1u64));
|
||||
index_writer.add_document(doc!(int_field=>1u64))?;
|
||||
}
|
||||
index_writer.add_document(doc!(int_field=>0u64));
|
||||
index_writer.add_document(doc!(int_field=>0u64))?;
|
||||
last_doc = doc + 1;
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
let term_info = inverted_index.get_term_info(&term).unwrap().unwrap();
|
||||
inverted_index
|
||||
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
|
||||
.unwrap()
|
||||
let term_info = inverted_index.get_term_info(&term)?.unwrap();
|
||||
let block_postings = inverted_index
|
||||
.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?;
|
||||
Ok(block_postings)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings_seek() {
|
||||
fn test_block_segment_postings_seek() -> crate::Result<()> {
|
||||
let mut docs = vec![0];
|
||||
for i in 0..1300 {
|
||||
docs.push((i * i / 100) + i);
|
||||
}
|
||||
let mut block_postings = build_block_postings(&docs[..]);
|
||||
let mut block_postings = build_block_postings(&docs[..])?;
|
||||
for i in &[0, 424, 10000] {
|
||||
block_postings.seek(*i);
|
||||
let docs = block_postings.docs();
|
||||
@@ -486,6 +486,7 @@ mod tests {
|
||||
}
|
||||
block_postings.seek(100_000);
|
||||
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -499,7 +500,7 @@ mod tests {
|
||||
// the other containing odd numbers.
|
||||
for i in 0..6 {
|
||||
let doc = doc!(int_field=> (i % 2) as u64);
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc)?;
|
||||
}
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::common::FixedSize;
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
use common::FixedSize;
|
||||
|
||||
pub const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * u32::SIZE_IN_BYTES;
|
||||
@@ -49,16 +49,10 @@ impl BlockEncoder {
|
||||
}
|
||||
}
|
||||
|
||||
/// We ensure that the OutputBuffer is align on 128 bits
|
||||
/// in order to run SSE2 linear search on it.
|
||||
#[repr(align(128))]
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BlockDecoder {
|
||||
bitpacker: BitPacker4x,
|
||||
output: AlignedBuffer,
|
||||
output: [u32; COMPRESSION_BLOCK_SIZE],
|
||||
pub output_len: usize,
|
||||
}
|
||||
|
||||
@@ -72,7 +66,7 @@ impl BlockDecoder {
|
||||
pub fn with_val(val: u32) -> BlockDecoder {
|
||||
BlockDecoder {
|
||||
bitpacker: BitPacker4x::new(),
|
||||
output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
|
||||
output: [val; COMPRESSION_BLOCK_SIZE],
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
@@ -85,28 +79,28 @@ impl BlockDecoder {
|
||||
) -> usize {
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
self.bitpacker
|
||||
.decompress_sorted(offset, compressed_data, &mut self.output.0, num_bits)
|
||||
.decompress_sorted(offset, compressed_data, &mut self.output, num_bits)
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
self.bitpacker
|
||||
.decompress(compressed_data, &mut self.output.0, num_bits)
|
||||
.decompress(compressed_data, &mut self.output, num_bits)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output_array(&self) -> &[u32] {
|
||||
&self.output.0[..self.output_len]
|
||||
&self.output[..self.output_len]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn output_aligned(&self) -> &AlignedBuffer {
|
||||
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
|
||||
&self.output
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output(&self, idx: usize) -> u32 {
|
||||
self.output.0[idx]
|
||||
self.output[idx]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -190,8 +184,8 @@ impl VIntDecoder for BlockDecoder {
|
||||
padding: u32,
|
||||
) -> usize {
|
||||
self.output_len = num_els;
|
||||
self.output.0.iter_mut().for_each(|el| *el = padding);
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
|
||||
self.output.iter_mut().for_each(|el| *el = padding);
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted(
|
||||
@@ -201,12 +195,12 @@ impl VIntDecoder for BlockDecoder {
|
||||
padding: u32,
|
||||
) -> usize {
|
||||
self.output_len = num_els;
|
||||
self.output.0.iter_mut().for_each(|el| *el = padding);
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
|
||||
self.output.iter_mut().for_each(|el| *el = padding);
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted_until_end(&mut self, compressed_data: &[u8]) {
|
||||
let num_els = vint::uncompress_unsorted_until_end(compressed_data, &mut self.output.0);
|
||||
let num_els = vint::uncompress_unsorted_until_end(compressed_data, &mut self.output);
|
||||
self.output_len = num_els;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,9 @@ Postings module (also called inverted index)
|
||||
*/
|
||||
|
||||
mod block_search;
|
||||
|
||||
pub(crate) use self::block_search::branchless_binary_search;
|
||||
|
||||
mod block_segment_postings;
|
||||
pub(crate) mod compression;
|
||||
mod postings;
|
||||
@@ -14,7 +17,6 @@ mod skip;
|
||||
mod stacker;
|
||||
mod term_info;
|
||||
|
||||
pub(crate) use self::block_search::BlockSearcher;
|
||||
pub use self::block_segment_postings::BlockSegmentPostings;
|
||||
pub use self::postings::Postings;
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
@@ -85,12 +87,12 @@ pub mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(title => r#"abc abc abc"#));
|
||||
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
|
||||
index_writer.add_document(doc!(title => r#"abc abc abc"#))?;
|
||||
index_writer.add_document(doc!(title => r#"abc be be be be abc"#))?;
|
||||
for _ in 0..1_000 {
|
||||
index_writer.add_document(doc!(title => r#"abc abc abc"#));
|
||||
index_writer.add_document(doc!(title => r#"abc abc abc"#))?;
|
||||
}
|
||||
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
|
||||
index_writer.add_document(doc!(title => r#"abc be be be be abc"#))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
@@ -152,10 +154,7 @@ pub mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
|
||||
let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
|
||||
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
|
||||
exceeding_token_text.push_str(" hello");
|
||||
pub fn test_index_max_length_token() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_options = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
@@ -168,33 +167,54 @@ pub mod tests {
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
let reader = index.reader().unwrap();
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
{
|
||||
index_writer.add_document(doc!(text_field=>exceeding_token_text));
|
||||
index_writer.commit().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(&bytes, b"hello");
|
||||
}
|
||||
{
|
||||
index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
|
||||
index_writer.commit().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(1u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(&bytes[..], ok_token_text.as_bytes());
|
||||
}
|
||||
let reader = index.reader()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
|
||||
index_writer.add_document(doc!(text_field=>ok_token_text.clone()))?;
|
||||
index_writer.commit()?;
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(&bytes[..], ok_token_text.as_bytes());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_options = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
|
||||
.set_tokenizer("simple_no_truncation"),
|
||||
);
|
||||
let text_field = schema_builder.add_text_field("text", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
let reader = index.reader()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
|
||||
exceeding_token_text.push_str(" hello");
|
||||
index_writer.add_document(doc!(text_field=>exceeding_token_text))?;
|
||||
index_writer.commit()?;
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(&bytes, b"hello");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -313,13 +333,13 @@ pub mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.add_document(doc!(text_field => "g b b d c g c"));
|
||||
index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(text_field => "g b b d c g c"))?;
|
||||
index_writer.add_document(doc!(text_field => "g a b b a d c g c"))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut postings = segment_reader
|
||||
.inverted_index(text_field)?
|
||||
@@ -348,7 +368,7 @@ pub mod tests {
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
for i in 0u64..num_docs as u64 {
|
||||
let doc = doc!(value_field => 2u64, value_field => i % 2u64);
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc)?;
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ use crate::docset::DocSet;
|
||||
/// but other implementations mocking `SegmentPostings` exist,
|
||||
/// for merging segments or for testing.
|
||||
pub trait Postings: DocSet + 'static {
|
||||
/// Returns the term frequency
|
||||
/// The number of times the term appears in the document.
|
||||
fn term_freq(&self) -> u32;
|
||||
|
||||
/// Returns the positions offseted with a given value.
|
||||
|
||||
@@ -33,7 +33,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
|
||||
SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed()
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
.unwrap_or_else(SpecializedPostingsWriter::<NothingRecorder>::new_boxed),
|
||||
FieldType::U64(_)
|
||||
| FieldType::I64(_)
|
||||
| FieldType::F64(_)
|
||||
@@ -133,7 +133,8 @@ impl MultiFieldPostingsWriter {
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
|
||||
self.term_index.iter().collect();
|
||||
Vec::with_capacity(self.term_index.len());
|
||||
term_offsets.extend(self.term_index.iter());
|
||||
term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
|
||||
|
||||
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::FieldSerializer;
|
||||
use crate::DocId;
|
||||
use crate::{
|
||||
common::{read_u32_vint, write_u32_vint},
|
||||
indexer::doc_id_mapping::DocIdMapping,
|
||||
};
|
||||
use common::{read_u32_vint, write_u32_vint};
|
||||
|
||||
const POSITION_END: u32 = 0;
|
||||
|
||||
|
||||
@@ -1,12 +1,12 @@
use crate::common::HasLen;
use crate::docset::DocSet;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::positions::PositionReader;
use crate::postings::branchless_binary_search;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::BlockSearcher;
use crate::postings::BlockSegmentPostings;
use crate::postings::Postings;
use crate::{DocId, TERMINATED};
use common::HasLen;

/// `SegmentPostings` represents the inverted list or postings associated to
/// a term in a `Segment`.
@@ -18,7 +18,6 @@ pub struct SegmentPostings {
pub(crate) block_cursor: BlockSegmentPostings,
cur: usize,
position_reader: Option<PositionReader>,
block_searcher: BlockSearcher,
}

impl SegmentPostings {
@@ -28,7 +27,6 @@ impl SegmentPostings {
block_cursor: BlockSegmentPostings::empty(),
cur: 0,
position_reader: None,
block_searcher: BlockSearcher::default(),
}
}

@@ -36,7 +34,7 @@ impl SegmentPostings {
///
/// This method will clone and scan through the posting lists.
/// (this is a rather expensive operation).
pub fn doc_freq_given_deletes(&self, delete_bitset: &DeleteBitSet) -> u32 {
pub fn doc_freq_given_deletes(&self, alive_bitset: &AliveBitSet) -> u32 {
let mut docset = self.clone();
let mut doc_freq = 0;
loop {
@@ -44,7 +42,7 @@ impl SegmentPostings {
if doc == TERMINATED {
return doc_freq;
}
if delete_bitset.is_alive(doc) {
if alive_bitset.is_alive(doc) {
doc_freq += 1u32;
}
docset.advance();
@@ -154,7 +152,6 @@ impl SegmentPostings {
block_cursor: segment_block_postings,
cur: 0, // cursor within the block
position_reader,
block_searcher: BlockSearcher::default(),
}
}
}
@@ -183,8 +180,8 @@ impl DocSet for SegmentPostings {
self.block_cursor.seek(target);

// At this point we are on the block that might contain our document.
let output = self.block_cursor.docs_aligned();
self.cur = self.block_searcher.search_in_block(output, target);
let output = self.block_cursor.full_block();
self.cur = branchless_binary_search(output, target);

// The last block is not full and is padded with the value TERMINATED,
// so that we are guaranteed to have at least one doc in the block (a real one or the padding)
@@ -197,7 +194,7 @@ impl DocSet for SegmentPostings {
// with the value `TERMINATED`.
//
// After the search, the cursor should point to the first value of TERMINATED.
let doc = output.0[self.cur];
let doc = output[self.cur];
debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc
@@ -268,10 +265,10 @@ impl Postings for SegmentPostings {
mod tests {

use super::SegmentPostings;
use crate::common::HasLen;
use common::HasLen;

use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::DeleteBitSet;
use crate::fastfield::AliveBitSet;
use crate::postings::postings::Postings;

#[test]
@@ -299,9 +296,10 @@ mod tests {
fn test_doc_freq() {
let docs = SegmentPostings::create_from_docs(&[0, 2, 10]);
assert_eq!(docs.doc_freq(), 3);
let delete_bitset = DeleteBitSet::for_test(&[2], 12);
assert_eq!(docs.doc_freq_given_deletes(&delete_bitset), 2);
let all_deleted = DeleteBitSet::for_test(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[2], 12);
assert_eq!(docs.doc_freq_given_deletes(&alive_bitset), 2);
let all_deleted =
AliveBitSet::for_test_from_deleted_docs(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 12);
assert_eq!(docs.doc_freq_given_deletes(&all_deleted), 0);
}
}

@@ -1,7 +1,6 @@
use super::TermInfo;
use crate::common::{BinarySerializable, VInt};
use crate::common::{CompositeWrite, CountingWriter};
use crate::core::Segment;
use crate::directory::CompositeWrite;
use crate::directory::WritePtr;
use crate::fieldnorm::FieldNormReader;
use crate::positions::PositionSerializer;
@@ -12,6 +11,9 @@ use crate::schema::{Field, FieldEntry, FieldType};
use crate::schema::{IndexRecordOption, Schema};
use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::{DocId, Score};
use common::CountingWriter;
use common::{BinarySerializable, VInt};
use fail::fail_point;
use std::cmp::Ordering;
use std::io::{self, Write};

@@ -211,6 +213,9 @@ impl<'a> FieldSerializer<'a> {
/// If the current block is incomplete, it needs to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
Err(io::Error::new(io::ErrorKind::Other, format!("{:?}", msg)))
});
if self.term_open {
self.postings_serializer
.close_term(self.current_term_info.doc_freq)?;
@@ -442,10 +447,8 @@ impl<W: Write> PostingsSerializer<W> {
let skip_data = self.skip_write.data();
VInt(skip_data.len() as u64).serialize(&mut self.output_write)?;
self.output_write.write_all(skip_data)?;
self.output_write.write_all(&self.postings_write[..])?;
} else {
self.output_write.write_all(&self.postings_write[..])?;
}
self.output_write.write_all(&self.postings_write[..])?;
self.skip_write.clear();
self.postings_write.clear();
self.bm25_weight = None;

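On the `VInt` fallback mentioned in `close_term`'s doc comment: a block shorter than 128 docs is not bit-packed; its deltas are written as variable-length integers instead. A minimal sketch of the usual LEB128-style scheme — assuming tantivy's `VInt` follows the common 7-bits-per-byte layout; the function name is illustrative, not the crate's API:

/// Encode `value` as a little-endian base-128 varint:
/// 7 payload bits per byte, high bit set on every byte but the last.
fn write_vint(mut value: u64, out: &mut Vec<u8>) {
    loop {
        let byte = (value & 0x7F) as u8;
        value >>= 7;
        if value == 0 {
            out.push(byte);
            return;
        }
        out.push(byte | 0x80);
    }
}

fn main() {
    let mut buf = Vec::new();
    write_vint(5, &mut buf); // one byte for small deltas
    write_vint(300, &mut buf); // 300 = 0b10_0101100 -> two bytes
    assert_eq!(buf, vec![0x05, 0xAC, 0x02]);
}
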
@@ -148,6 +148,10 @@ impl TermHashMap {
unordered_term_id
}

pub fn len(&self) -> usize {
self.len
}

pub fn iter(&self) -> Iter<'_> {
Iter {
inner: self.occupied.iter(),

Some files were not shown because too many files have changed in this diff.