mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 01:32:53 +00:00

Compare commits: warming...wasm-frien (2 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 89f91b1b58 |  |
|  | 19965c46bc |  |
.github/dependabot.yml (vendored): 7 changed lines

@@ -6,10 +6,3 @@ updates:
       interval: daily
       time: "20:00"
     open-pull-requests-limit: 10
-
-  - package-ecosystem: "github-actions"
-    directory: "/"
-    schedule:
-      interval: daily
-      time: "20:00"
-    open-pull-requests-limit: 10
.github/workflows/coverage.yml (vendored): 2 changed lines

@@ -18,7 +18,7 @@ jobs:
       - name: Generate code coverage
         run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v2
+        uses: codecov/codecov-action@v1
         with:
           token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
           files: lcov.info
.github/workflows/test.yml (vendored): 4 changed lines

@@ -21,10 +21,10 @@ jobs:
       - name: Install latest nightly to test also against unstable feature flag
         uses: actions-rs/toolchain@v1
         with:
-          toolchain: stable
+          toolchain: nightly
           override: true
           components: rustfmt
       - name: Run tests
-        run: cargo test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace
+        run: cargo test --all-features --verbose --workspace
       - name: Check Formatting
         run: cargo fmt --all -- --check
.gitignore (vendored): 1 changed line

@@ -1,5 +1,4 @@
 tantivy.iml
-.cargo
 proptest-regressions
 *.swp
 target
.travis.yml (new file): 92 lines

@@ -0,0 +1,92 @@
+# Based on the "trust" template v0.1.2
+# https://github.com/japaric/trust/tree/v0.1.2
+
+dist: trusty
+language: rust
+services: docker
+sudo: required
+
+env:
+  global:
+    - CRATE_NAME=tantivy
+    - TRAVIS_CARGO_NIGHTLY_FEATURE=""
+    # - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
+
+addons:
+  apt:
+    sources:
+      - ubuntu-toolchain-r-test
+      - kalakris-cmake
+    packages:
+      - gcc-4.8
+      - g++-4.8
+      - libcurl4-openssl-dev
+      - libelf-dev
+      - libdw-dev
+      - binutils-dev
+      - cmake
+
+matrix:
+  include:
+    # Android
+    - env: TARGET=aarch64-linux-android DISABLE_TESTS=1
+    #- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
+    #- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
+    #- env: TARGET=i686-linux-android DISABLE_TESTS=1
+    #- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
+
+    # Linux
+    #- env: TARGET=aarch64-unknown-linux-gnu
+    #- env: TARGET=i686-unknown-linux-gnu
+    - env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 #UPLOAD_DOCS=1
+    # - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
+    # OSX
+    #- env: TARGET=x86_64-apple-darwin
+    #  os: osx
+
+before_install:
+  - set -e
+  - rustup self update
+  - rustup component add rustfmt
+
+install:
+  - sh ci/install.sh
+  - source ~/.cargo/env || true
+  - env | grep "TRAVIS"
+
+before_script:
+  - export PATH=$HOME/.cargo/bin:$PATH
+  - cargo install cargo-update || echo "cargo-update already installed"
+  - cargo install cargo-travis || echo "cargo-travis already installed"
+
+script:
+  - bash ci/script.sh
+  - cargo fmt --all -- --check
+
+before_deploy:
+  - sh ci/before_deploy.sh
+
+after_success:
+  # Needs GH_TOKEN env var to be set in travis settings
+  - if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
+  - if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
+
+#cache: cargo
+#before_cache:
+#  # Travis can't cache files that are not readable by "others"
+#  - chmod -R a+r $HOME/.cargo
+#  - find ./target/debug -type f -maxdepth 1 -delete
+#  - rm -f ./target/.rustc_info.json
+#  - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
+#  - rm -r target/debug/examples/
+#  - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
+
+#branches:
+#  only:
+#    # release tags
+#    - /^v\d+\.\d+\.\d+.*$/
+#    - master
+
+notifications:
+  email:
+    on_success: never
CHANGELOG.md: 17 changed lines

@@ -1,21 +1,6 @@
-Tantivy 0.17
-================================
-- LogMergePolicy now triggers merges if the ratio of deleted documents reaches a threshold (@shikhar) [#115](https://github.com/quickwit-inc/tantivy/issues/115)
-- Adds a searcher Warmer API (@shikhar)
-- Change to non-strict schema. Ignore fields in data which are not defined in the schema. Previously this returned an error. #1211
-- Facets are necessarily indexed. Existing index with indexed facets should work out of the box. Index without facets that are marked with index: false should be broken (but they were already broken in a sense). (@fulmicoton) #1195
-- Bugfix that could in theory impact durability on some filesystems [#1224](https://github.com/quickwit-inc/tantivy/issues/1224)
-- Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-inc/tantivy/issues/922)
-- Reduce the number of fsync calls [#1225](https://github.com/quickwit-inc/tantivy/issues/1225)
-
-Tantivy 0.16.2
-================================
-- Bugfix in FuzzyTermQuery (transposition_cost_one was not doing anything).
-
 Tantivy 0.16.1
 ========================
 - Major Bugfix on multivalued fastfield. #1151
-- Demux operation (@PSeitz)
-
 Tantivy 0.16.0
 =========================

@@ -128,7 +113,7 @@ Tantivy 0.12.0
 ## How to update?
 
 Crates relying on a custom tokenizer, or registering a tokenizer in the manager, will require some
-minor changes. Check https://github.com/quickwit-inc/tantivy/blob/main/examples/custom_tokenizer.rs
+minor changes. Check https://github.com/tantivy-search/tantivy/blob/main/examples/custom_tokenizer.rs
 for a code sample.
 
 Tantivy 0.11.3
Cargo.toml: 23 changed lines

@@ -1,13 +1,13 @@
 [package]
 name = "tantivy"
-version = "0.17.0-dev"
+version = "0.16.1"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
 description = """Search engine library"""
 documentation = "https://docs.rs/tantivy/"
-homepage = "https://github.com/quickwit-inc/tantivy"
-repository = "https://github.com/quickwit-inc/tantivy"
+homepage = "https://github.com/tantivy-search/tantivy"
+repository = "https://github.com/tantivy-search/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2018"

@@ -20,12 +20,13 @@ once_cell = "1.7.2"
 regex ={ version = "1.5.4", default-features = false, features = ["std"] }
 tantivy-fst = "0.3"
 memmap2 = {version = "0.5", optional=true}
-lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true }
+lz4_flex = { version = "0.9.0", default-features = false, features = ["checked-decode"], optional = true }
 brotli = { version = "3.3", optional = true }
 snap = { version = "1.0.5", optional = true }
 tempfile = { version = "3.2", optional = true }
 log = "0.4.14"
 serde = { version = "1.0.126", features = ["derive"] }
+serde_closure = "0.3"
 serde_json = "1.0.64"
 num_cpus = "1.13"
 fs2={ version = "0.4.3", optional = true }

@@ -37,7 +38,7 @@ tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
 tantivy-bitpacker = { version="0.1", path="./bitpacker" }
 common = { version = "0.1", path = "./common/", package = "tantivy-common" }
 fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
-ownedbytes = { version="0.2", path="./ownedbytes" }
+ownedbytes = { version="0.1", path="./ownedbytes" }
 stable_deref_trait = "1.2"
 rust-stemmers = "1.2"
 downcast-rs = "1.2"

@@ -46,15 +47,16 @@ census = "0.4"
 fnv = "1.0.7"
 thiserror = "1.0.24"
 htmlescape = "0.3.1"
-fail = "0.5"
+fail = "0.4"
 murmurhash32 = "0.2"
 chrono = "0.4.19"
 smallvec = "1.6.1"
-rayon = "1.5"
 lru = "0.7.0"
 fastdivide = "0.3"
 itertools = "0.10.0"
-measure_time = "0.8.0"
+measure_time = "0.7.0"
+wasm-mt = "0.1"
+wasm-mt-pool = "0.1"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"

@@ -65,11 +67,11 @@ maplit = "1.0.2"
 matches = "0.1.8"
 proptest = "1.0"
 criterion = "0.3.5"
-test-log = "0.2.8"
+test-env-log = "0.2.7"
 env_logger = "0.9.0"
 
 [dev-dependencies.fail]
-version = "0.5"
+version = "0.4"
 features = ["failpoints"]
 
 [profile.release]

@@ -91,6 +93,7 @@ snappy-compression = ["snap"]
 
 failpoints = ["fail/failpoints"]
 unstable = [] # useful for benches.
+wasm-bindgen = ["uuid/wasm-bindgen"]
 
 [workspace]
 members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"]
README.md: 16 changed lines

@@ -1,8 +1,8 @@
 
 [](https://docs.rs/crate/tantivy/)
-[](https://github.com/quickwit-inc/tantivy/actions/workflows/test.yml)
-[](https://codecov.io/gh/quickwit-inc/tantivy)
-[](https://discord.gg/MT27AG5EVE)
+[](https://github.com/tantivy-search/tantivy/actions/workflows/test.yml)
+[](https://codecov.io/gh/tantivy-search/tantivy)
+[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 [](https://opensource.org/licenses/MIT)
 [](https://crates.io/crates/tantivy)

@@ -17,6 +17,9 @@
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
 
+[](https://www.patreon.com/fulmicoton)
+
+
 **Tantivy** is a **full text search engine library** written in Rust.
 
 It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not

@@ -75,12 +78,13 @@ It walks you through getting a wikipedia search engine up and running in a few m
 
 There are many ways to support this project.
 
-- Use Tantivy and tell us about your experience on [Discord](https://discord.gg/MT27AG5EVE) or by email (paul.masurel@gmail.com)
+- Use Tantivy and tell us about your experience on [Gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
 - Report bugs
 - Write a blog post
 - Help with documentation by asking questions or submitting PRs
-- Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE))
+- Contribute code (you can join [our Gitter](https://gitter.im/tantivy-search/tantivy))
 - Talk about Tantivy around you
+- [](https://www.patreon.com/fulmicoton)
 
 # Contributing code
 

@@ -92,7 +96,7 @@ Tantivy compiles on stable Rust but requires `Rust >= 1.27`.
 To check out and run tests, you can simply run:
 
 ```bash
-    git clone https://github.com/quickwit-inc/tantivy.git
+    git clone https://github.com/tantivy-search/tantivy.git
     cd tantivy
     cargo build
 ```
@@ -6,7 +6,7 @@ authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = []
 description = """Tantivy-sub crate: bitpacking"""
-repository = "https://github.com/quickwit-inc/tantivy"
+repository = "https://github.com/tantivy-search/tantivy"
 keywords = []
 
 
@@ -10,7 +10,7 @@ description = "common traits and utility functions used by multiple tantivy subc
 
 [dependencies]
 byteorder = "1.4.3"
-ownedbytes = { version="0.2", path="../ownedbytes" }
+ownedbytes = { version="0.1", path="../ownedbytes" }
 
 [dev-dependencies]
 proptest = "1.0.0"
@@ -36,14 +36,10 @@ impl TinySet {
         writer.write_all(self.0.to_le_bytes().as_ref())
     }
 
-    pub fn into_bytes(self) -> [u8; 8] {
-        self.0.to_le_bytes()
-    }
-
     #[inline]
-    pub fn deserialize(data: [u8; 8]) -> Self {
+    pub fn deserialize(data: [u8; 8]) -> io::Result<Self> {
         let val: u64 = u64::from_le_bytes(data);
-        TinySet(val)
+        Ok(TinySet(val))
     }
 
     /// Returns an empty `TinySet`.
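
For reference, a self-contained sketch (standard library only) of the round trip that the two `TinySet` shapes in this hunk imply: the byte-array pair (`into_bytes` plus an infallible `deserialize`) on one side, and the writer-based `serialize` with an `io::Result`-returning `deserialize` on the other. The `TinySet` below is a local stand-in so the snippet runs on its own; only the 8-byte little-endian layout mirrors the diff.

```rust
use std::convert::TryInto;
use std::io::{self, Write};

// Stand-in for the TinySet in the diff: a 64-bit set over [0, 64).
#[derive(Clone, Copy, PartialEq, Debug)]
struct TinySet(u64);

impl TinySet {
    fn empty() -> Self { TinySet(0) }
    fn insert(self, el: u32) -> Self { TinySet(self.0 | (1u64 << el)) }
    // Shape on one side of the diff: plain byte-array round trip.
    fn into_bytes(self) -> [u8; 8] { self.0.to_le_bytes() }
    fn from_bytes(data: [u8; 8]) -> Self { TinySet(u64::from_le_bytes(data)) }
    // Shape on the other side: write the 8 bytes into any `io::Write`.
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_all(&self.0.to_le_bytes())
    }
}

fn main() -> io::Result<()> {
    let original = TinySet::empty().insert(5).insert(63);

    // Byte-array round trip.
    assert_eq!(original, TinySet::from_bytes(original.into_bytes()));

    // Writer-based round trip.
    let mut buf = Vec::new();
    original.serialize(&mut buf)?;
    let restored = TinySet::from_bytes(buf[..8].try_into().unwrap());
    assert_eq!(original, restored);
    Ok(())
}
```
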
@@ -62,30 +58,29 @@ impl TinySet {
         self.0 = 0u64;
     }
 
+    #[inline]
     /// Returns the complement of the set in `[0, 64[`.
     ///
     /// Careful on making this function public, as it will break the padding handling in the last
     /// bucket.
-    #[inline]
     fn complement(self) -> TinySet {
         TinySet(!self.0)
     }
 
-    /// Returns true iff the `TinySet` contains the element `el`.
     #[inline]
+    /// Returns true iff the `TinySet` contains the element `el`.
     pub fn contains(self, el: u32) -> bool {
         !self.intersect(TinySet::singleton(el)).is_empty()
     }
 
-    /// Returns the number of elements in the TinySet.
     #[inline]
+    /// Returns the number of elements in the TinySet.
     pub fn len(self) -> u32 {
         self.0.count_ones()
     }
 
-    /// Returns the intersection of `self` and `other`
     #[inline]
-    #[must_use]
+    /// Returns the intersection of `self` and `other`
     pub fn intersect(self, other: TinySet) -> TinySet {
         TinySet(self.0 & other.0)
     }
@@ -99,14 +94,12 @@ impl TinySet {
 
     /// Insert a new element within [0..64)
     #[inline]
-    #[must_use]
     pub fn insert(self, el: u32) -> TinySet {
         self.union(TinySet::singleton(el))
     }
 
     /// Removes an element within [0..64)
     #[inline]
-    #[must_use]
    pub fn remove(self, el: u32) -> TinySet {
         self.intersect(TinySet::singleton(el).complement())
     }
@@ -133,7 +126,6 @@ impl TinySet {
 
     /// Returns the union of two tinysets
     #[inline]
-    #[must_use]
     pub fn union(self, other: TinySet) -> TinySet {
         TinySet(self.0 | other.0)
     }
@@ -190,20 +182,42 @@ impl BitSet {
     ///
     pub fn serialize<T: Write>(&self, writer: &mut T) -> io::Result<()> {
         writer.write_all(self.max_value.to_le_bytes().as_ref())?;
-        for tinyset in self.tinysets.iter().cloned() {
-            writer.write_all(&tinyset.into_bytes())?;
+        for tinyset in self.tinysets.iter() {
+            tinyset.serialize(writer)?;
         }
         writer.flush()?;
         Ok(())
     }
 
+    /// Deserialize a `BitSet`.
+    ///
+    #[cfg(test)]
+    pub fn deserialize(mut data: &[u8]) -> io::Result<Self> {
+        let max_value: u32 = u32::from_le_bytes(data[..4].try_into().unwrap());
+        data = &data[4..];
+
+        let mut len: u64 = 0;
+        let mut tinysets = vec![];
+        for chunk in data.chunks_exact(8) {
+            let tinyset = TinySet::deserialize(chunk.try_into().unwrap())?;
+            len += tinyset.len() as u64;
+            tinysets.push(tinyset);
+        }
+        Ok(BitSet {
+            tinysets: tinysets.into_boxed_slice(),
+            len,
+            max_value,
+        })
+    }
+
     /// Create a new `BitSet` that may contain elements
     /// within `[0, max_val)`.
     pub fn with_max_value(max_value: u32) -> BitSet {
         let num_buckets = num_buckets(max_value);
-        let tinybitsets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
+        let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
         BitSet {
-            tinysets: tinybitsets,
+            tinysets: tinybisets,
             len: 0,
             max_value,
         }
@@ -213,15 +227,14 @@ impl BitSet {
     /// within `[0, max_val)`.
     pub fn with_max_value_and_full(max_value: u32) -> BitSet {
         let num_buckets = num_buckets(max_value);
-        let mut tinybitsets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
+        let mut tinybisets = vec![TinySet::full(); num_buckets as usize].into_boxed_slice();
 
         // Fix padding
         let lower = max_value % 64u32;
-        if lower != 0 {
-            tinybitsets[tinybitsets.len() - 1] = TinySet::range_lower(lower);
-        }
+        tinybisets[tinybisets.len() - 1] = TinySet::range_lower(lower);
         BitSet {
-            tinysets: tinybitsets,
+            tinysets: tinybisets,
             len: max_value as u64,
             max_value,
         }
@@ -234,22 +247,7 @@ impl BitSet {
         }
     }
 
-    /// Intersect with serialized bitset
-    pub fn intersect_update(&mut self, other: &ReadOnlyBitSet) {
-        self.intersect_update_with_iter(other.iter_tinysets());
-    }
-
-    /// Intersect with tinysets
-    fn intersect_update_with_iter(&mut self, other: impl Iterator<Item = TinySet>) {
-        self.len = 0;
-        for (left, right) in self.tinysets.iter_mut().zip(other) {
-            *left = left.intersect(right);
-            self.len += left.len() as u64;
-        }
-    }
-
     /// Returns the number of elements in the `BitSet`.
-    #[inline]
     pub fn len(&self) -> usize {
         self.len as usize
     }
@@ -299,7 +297,6 @@ impl BitSet {
             .map(|delta_bucket| bucket + delta_bucket as u32)
     }
 
-    #[inline]
     pub fn max_value(&self) -> u32 {
         self.max_value
     }
@@ -314,34 +311,16 @@ impl BitSet {
 
 /// Serialized BitSet.
 #[derive(Clone)]
-pub struct ReadOnlyBitSet {
+pub struct ReadSerializedBitSet {
     data: OwnedBytes,
     max_value: u32,
 }
 
-pub fn intersect_bitsets(left: &ReadOnlyBitSet, other: &ReadOnlyBitSet) -> ReadOnlyBitSet {
-    assert_eq!(left.max_value(), other.max_value());
-    assert_eq!(left.data.len(), other.data.len());
-    let union_tinyset_it = left
-        .iter_tinysets()
-        .zip(other.iter_tinysets())
-        .map(|(left_tinyset, right_tinyset)| left_tinyset.intersect(right_tinyset));
-    let mut output_dataset: Vec<u8> = Vec::with_capacity(left.data.len());
-    for tinyset in union_tinyset_it {
-        output_dataset.extend_from_slice(&tinyset.into_bytes());
-    }
-    ReadOnlyBitSet {
-        data: OwnedBytes::new(output_dataset),
-        max_value: left.max_value(),
-    }
-}
-
-impl ReadOnlyBitSet {
+impl ReadSerializedBitSet {
     pub fn open(data: OwnedBytes) -> Self {
         let (max_value_data, data) = data.split(4);
-        assert_eq!(data.len() % 8, 0);
         let max_value: u32 = u32::from_le_bytes(max_value_data.as_ref().try_into().unwrap());
-        ReadOnlyBitSet { data, max_value }
+        ReadSerializedBitSet { data, max_value }
     }
 
     /// Number of elements in the bitset.
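
For reference, a standard-library-only sketch of the serialized layout that `serialize` and `open` above agree on: a little-endian `u32` `max_value` header followed by one 8-byte little-endian word per 64-element bucket. The helper names here are made up for illustration; only the byte layout is taken from the hunks.

```rust
use std::convert::TryInto;

// Write the header (max_value as 4 LE bytes) followed by one u64 per bucket.
fn serialize_bitset(max_value: u32, buckets: &[u64]) -> Vec<u8> {
    let mut out = Vec::with_capacity(4 + buckets.len() * 8);
    out.extend_from_slice(&max_value.to_le_bytes());
    for bucket in buckets {
        out.extend_from_slice(&bucket.to_le_bytes());
    }
    out
}

// Mirror of `open` + `iter_tinysets`: split off 4 header bytes, then read 8-byte chunks.
fn open_bitset(data: &[u8]) -> (u32, Vec<u64>) {
    let max_value = u32::from_le_bytes(data[..4].try_into().unwrap());
    let rest = &data[4..];
    assert_eq!(rest.len() % 8, 0);
    let buckets = rest
        .chunks_exact(8)
        .map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap()))
        .collect();
    (max_value, buckets)
}

fn main() {
    // A bitset over [0, 70) needs two 64-bit buckets; elements 3 and 65 are set.
    let serialized = serialize_bitset(70, &[1u64 << 3, 1u64 << 1]);
    let (max_value, buckets) = open_bitset(&serialized);
    assert_eq!(max_value, 70);
    assert_eq!(buckets.len(), 2);
    assert_eq!(buckets[0].count_ones() + buckets[1].count_ones(), 2);
    println!("max_value={}, {} buckets", max_value, buckets.len());
}
```
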
@@ -355,9 +334,10 @@ impl ReadOnlyBitSet {
     /// Iterate the tinyset on the fly from serialized data.
     ///
     #[inline]
-    fn iter_tinysets(&self) -> impl Iterator<Item = TinySet> + '_ {
+    fn iter_tinysets<'a>(&'a self) -> impl Iterator<Item = TinySet> + 'a {
+        assert!((self.data.len()) % 8 == 0);
         self.data.chunks_exact(8).map(move |chunk| {
-            let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap());
+            let tinyset: TinySet = TinySet::deserialize(chunk.try_into().unwrap()).unwrap();
             tinyset
         })
     }
@@ -365,7 +345,7 @@ impl ReadOnlyBitSet {
     /// Iterate over the positions of the elements.
     ///
     #[inline]
-    pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
+    pub fn iter<'a>(&'a self) -> impl Iterator<Item = u32> + 'a {
         self.iter_tinysets()
             .enumerate()
             .flat_map(move |(chunk_num, tinyset)| {
@@ -395,56 +375,20 @@ impl ReadOnlyBitSet {
     pub fn max_value(&self) -> u32 {
         self.max_value
     }
 
-    /// Number of bytes used in the bitset representation.
-    pub fn num_bytes(&self) -> usize {
-        self.data.len()
-    }
-}
-
-impl<'a> From<&'a BitSet> for ReadOnlyBitSet {
-    fn from(bitset: &'a BitSet) -> ReadOnlyBitSet {
-        let mut buffer = Vec::with_capacity(bitset.tinysets.len() * 8 + 4);
-        bitset
-            .serialize(&mut buffer)
-            .expect("serializing into a buffer should never fail");
-        ReadOnlyBitSet::open(OwnedBytes::new(buffer))
-    }
 }
 
 #[cfg(test)]
 mod tests {
 
     use super::BitSet;
-    use super::ReadOnlyBitSet;
+    use super::ReadSerializedBitSet;
     use super::TinySet;
     use ownedbytes::OwnedBytes;
     use rand::distributions::Bernoulli;
     use rand::rngs::StdRng;
     use rand::{Rng, SeedableRng};
     use std::collections::HashSet;
+    use std::convert::TryInto;
-    #[test]
-    fn test_read_serialized_bitset_full_multi() {
-        for i in 0..1000 {
-            let bitset = BitSet::with_max_value_and_full(i);
-            let mut out = vec![];
-            bitset.serialize(&mut out).unwrap();
-
-            let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
-            assert_eq!(bitset.len() as usize, i as usize);
-        }
-    }
-
-    #[test]
-    fn test_read_serialized_bitset_full_block() {
-        let bitset = BitSet::with_max_value_and_full(64);
-        let mut out = vec![];
-        bitset.serialize(&mut out).unwrap();
-
-        let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
-        assert_eq!(bitset.len() as usize, 64 as usize);
-    }
-
     #[test]
     fn test_read_serialized_bitset_full() {
@@ -453,50 +397,10 @@ mod tests {
         let mut out = vec![];
         bitset.serialize(&mut out).unwrap();
 
-        let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
+        let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
         assert_eq!(bitset.len(), 4);
     }
 
-    #[test]
-    fn test_bitset_intersect() {
-        let bitset_serialized = {
-            let mut bitset = BitSet::with_max_value_and_full(5);
-            bitset.remove(1);
-            bitset.remove(3);
-            let mut out = vec![];
-            bitset.serialize(&mut out).unwrap();
-
-            ReadOnlyBitSet::open(OwnedBytes::new(out))
-        };
-
-        let mut bitset = BitSet::with_max_value_and_full(5);
-        bitset.remove(1);
-        bitset.intersect_update(&bitset_serialized);
-
-        assert!(bitset.contains(0));
-        assert!(!bitset.contains(1));
-        assert!(bitset.contains(2));
-        assert!(!bitset.contains(3));
-        assert!(bitset.contains(4));
-
-        bitset.intersect_update_with_iter(vec![TinySet::singleton(0)].into_iter());
-
-        assert!(bitset.contains(0));
-        assert!(!bitset.contains(1));
-        assert!(!bitset.contains(2));
-        assert!(!bitset.contains(3));
-        assert!(!bitset.contains(4));
-        assert_eq!(bitset.len(), 1);
-
-        bitset.intersect_update_with_iter(vec![TinySet::singleton(1)].into_iter());
-        assert!(!bitset.contains(0));
-        assert!(!bitset.contains(1));
-        assert!(!bitset.contains(2));
-        assert!(!bitset.contains(3));
-        assert!(!bitset.contains(4));
-        assert_eq!(bitset.len(), 0);
-    }
-
     #[test]
     fn test_read_serialized_bitset_empty() {
         let mut bitset = BitSet::with_max_value(5);
@@ -504,14 +408,14 @@ mod tests {
         let mut out = vec![];
         bitset.serialize(&mut out).unwrap();
 
-        let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
+        let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
         assert_eq!(bitset.len(), 1);
 
         {
             let bitset = BitSet::with_max_value(5);
             let mut out = vec![];
             bitset.serialize(&mut out).unwrap();
-            let bitset = ReadOnlyBitSet::open(OwnedBytes::new(out));
+            let bitset = ReadSerializedBitSet::open(OwnedBytes::new(out));
             assert_eq!(bitset.len(), 0);
         }
     }
@@ -575,9 +479,13 @@ mod tests {
             assert!(u.pop_lowest().is_none());
         }
         {
-            let original = TinySet::empty().insert(63u32).insert(5);
-            let after_serialize_deserialize = TinySet::deserialize(original.into_bytes());
-            assert_eq!(original, after_serialize_deserialize);
+            let u = TinySet::empty().insert(63u32).insert(5);
+            let mut data = vec![];
+            u.serialize(&mut data).unwrap();
+            let mut u = TinySet::deserialize(data[..8].try_into().unwrap()).unwrap();
+            assert_eq!(u.pop_lowest(), Some(5u32));
+            assert_eq!(u.pop_lowest(), Some(63u32));
+            assert!(u.pop_lowest().is_none());
         }
     }
 
@@ -599,12 +507,12 @@ mod tests {
             // test deser
             let mut data = vec![];
             bitset.serialize(&mut data).unwrap();
-            let ro_bitset = ReadOnlyBitSet::open(OwnedBytes::new(data));
+            let bitset = BitSet::deserialize(&data).unwrap();
             for el in 0..max_value {
-                assert_eq!(hashset.contains(&el), ro_bitset.contains(el));
+                assert_eq!(hashset.contains(&el), bitset.contains(el));
             }
-            assert_eq!(ro_bitset.max_value(), max_value);
-            assert_eq!(ro_bitset.len(), els.len());
+            assert_eq!(bitset.max_value(), max_value);
+            assert_eq!(bitset.len(), els.len());
         };
 
         test_against_hashset(&[], 0);
@@ -1,5 +1,3 @@
-#![allow(clippy::len_without_is_empty)]
-
 use std::ops::Deref;
 
 pub use byteorder::LittleEndian as Endianness;
@@ -54,7 +54,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
     }
 }
 
-/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly
+/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
 ///
 /// The point is that while the type is public, it cannot be built by anyone
 /// outside of this module.
@@ -38,7 +38,7 @@ Note: Tantivy 0.16 does not do this optimization yet.
 In principle there are many algorithms possible that exploit the monotonically increasing nature. (aggregations maybe?)
 
 ## Usage
-The index sorting can be configured by setting [`sort_by_field`](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to an `IndexBuilder`. As of tantivy 0.16 only fast fields are allowed to be used.
+The index sorting can be configured by setting [`sort_by_field`](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to an `IndexBuilder`. As of tantivy 0.16 only fast fields are allowed to be used.
 
 ```
 let settings = IndexSettings {

@@ -55,7 +55,7 @@ let index = index_builder.create_in_ram().unwrap();
 
 ## Implementation details
 
-Sorting an index is applied in the serialization step. In general there are two serialization steps: [finishing a single segment](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/quickwit-inc/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).
+Sorting an index is applied in the serialization step. In general there are two serialization steps: [finishing a single segment](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/tantivy-search/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073).
 
 In both cases we generate a docid mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fastfields, posting list, normfield, facets).
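
For context, a sketch of the `sort_by_field` configuration described in the doc above, written end to end. The field name is made up, and the exact shapes of `IndexSettings`, `IndexSortByField` and `Order` are assumed from the 0.16-era API this document references; check them against the tantivy version in use.

```rust
use tantivy::schema::{Schema, FAST, INDEXED};
use tantivy::{Index, IndexSettings, IndexSortByField, Order};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // Only fast fields can be used as the sort field.
    schema_builder.add_u64_field("timestamp", FAST | INDEXED);
    let schema = schema_builder.build();

    // Ask for segments to be serialized in descending `timestamp` order.
    let settings = IndexSettings {
        sort_by_field: Some(IndexSortByField {
            field: "timestamp".to_string(),
            order: Order::Desc,
        }),
        ..Default::default()
    };

    let _index = Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()?;

    println!("index created with a sorted-by-timestamp setting");
    Ok(())
}
```
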
@@ -96,7 +96,7 @@ fn main() -> tantivy::Result<()> {
     );
 
     // ... and add it to the `IndexWriter`.
-    index_writer.add_document(old_man_doc)?;
+    index_writer.add_document(old_man_doc);
 
     // For convenience, tantivy also comes with a macro to
     // reduce the boilerplate above.

@@ -110,7 +110,7 @@ fn main() -> tantivy::Result<()> {
         fresh and green with every spring, carrying in their lower leaf junctures the \
         debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
         limbs and branches that arch over the pool"
-    ))?;
+    ));
 
     // Multivalued field just need to be repeated.
     index_writer.add_document(doc!(

@@ -120,7 +120,7 @@ fn main() -> tantivy::Result<()> {
         enterprise which you have regarded with such evil forebodings. I arrived here \
         yesterday, and my first task is to assure my dear sister of my welfare and \
         increasing confidence in the success of my undertaking."
-    ))?;
+    ));
 
     // This is an example, so we will only index 3 documents
     // here. You can check out tantivy's tutorial to index
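
For context, a minimal runnable sketch of the calling convention on the left-hand side of these hunks, where `add_document` returns a `Result` wrapping the opstamp; on the right-hand (0.16-era) side it returns the opstamp directly and the `?` must be dropped. The schema here is made up.

```rust
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer(50_000_000)?;

    // The opstamp can be inspected or ignored, but the error must be handled.
    let opstamp = index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
    println!("added document with opstamp {}", opstamp);

    index_writer.commit()?;
    Ok(())
}
```
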
@@ -145,23 +145,23 @@
         product_description => "While it is ok for short distance travel, this broom \
         was designed quiditch. It will up your game.",
         price => 30_200u64
-    ))?;
+    ));
     index_writer.add_document(doc!(
         product_name => "Turbulobroom",
         product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
         You'll enjoy its sharp turns, and rapid acceleration",
         price => 29_240u64
-    ))?;
+    ));
     index_writer.add_document(doc!(
         product_name => "Broomio",
         product_description => "Great value for the price. This broom is a market favorite",
         price => 21_240u64
-    ))?;
+    ));
     index_writer.add_document(doc!(
         product_name => "Whack a Mole",
         product_description => "Prime quality bat.",
         price => 5_200u64
-    ))?;
+    ));
     index_writer.commit()?;
 
     let reader = index.reader()?;
@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
         title => "The Old Man and the Sea",
         body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
         he had gone eighty-four days now without taking a fish."
-    ))?;
+    ));
     index_writer.add_document(doc!(
         title => "Of Mice and Men",
         body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside

@@ -79,14 +79,14 @@ fn main() -> tantivy::Result<()> {
         fresh and green with every spring, carrying in their lower leaf junctures the
         debris of the winter’s flooding; and sycamores with mottled, white, recumbent
         limbs and branches that arch over the pool"#
-    ))?;
+    ));
     index_writer.add_document(doc!(
         title => "Frankenstein",
         body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
         enterprise which you have regarded with such evil forebodings. I arrived here
         yesterday, and my first task is to assure my dear sister of my welfare and
         increasing confidence in the success of my undertaking."#
-    ))?;
+    ));
     index_writer.commit()?;
 
     let reader = index.reader()?;
@@ -76,15 +76,15 @@ fn main() -> tantivy::Result<()> {
     index_writer.add_document(doc!(
         isbn => "978-0099908401",
         title => "The old Man and the see"
-    ))?;
+    ));
     index_writer.add_document(doc!(
         isbn => "978-0140177398",
         title => "Of Mice and Men",
-    ))?;
+    ));
     index_writer.add_document(doc!(
         title => "Frankentein", //< Oops there is a typo here.
         isbn => "978-9176370711",
-    ))?;
+    ));
     index_writer.commit()?;
     let reader = index.reader()?;
 

@@ -122,7 +122,7 @@ fn main() -> tantivy::Result<()> {
     index_writer.add_document(doc!(
         title => "Frankenstein",
         isbn => "978-9176370711",
-    ))?;
+    ));
 
     // You are guaranteed that your clients will only observe your index in
     // the state it was in after a commit.
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
 
     let name = schema_builder.add_text_field("felin_name", TEXT | STORED);
     // this is our faceted field: its scientific classification
-    let classification = schema_builder.add_facet_field("classification", FacetOptions::default());
+    let classification = schema_builder.add_facet_field("classification", INDEXED);
 
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);

@@ -35,35 +35,35 @@ fn main() -> tantivy::Result<()> {
     index_writer.add_document(doc!(
         name => "Cat",
         classification => Facet::from("/Felidae/Felinae/Felis")
-    ))?;
+    ));
     index_writer.add_document(doc!(
         name => "Canada lynx",
         classification => Facet::from("/Felidae/Felinae/Lynx")
-    ))?;
+    ));
     index_writer.add_document(doc!(
         name => "Cheetah",
         classification => Facet::from("/Felidae/Felinae/Acinonyx")
-    ))?;
+    ));
     index_writer.add_document(doc!(
         name => "Tiger",
         classification => Facet::from("/Felidae/Pantherinae/Panthera")
-    ))?;
+    ));
     index_writer.add_document(doc!(
         name => "Lion",
         classification => Facet::from("/Felidae/Pantherinae/Panthera")
-    ))?;
+    ));
     index_writer.add_document(doc!(
         name => "Jaguar",
         classification => Facet::from("/Felidae/Pantherinae/Panthera")
-    ))?;
+    ));
     index_writer.add_document(doc!(
         name => "Sunda clouded leopard",
         classification => Facet::from("/Felidae/Pantherinae/Neofelis")
-    ))?;
+    ));
     index_writer.add_document(doc!(
         name => "Fossa",
         classification => Facet::from("/Eupleridae/Cryptoprocta")
-    ))?;
+    ));
     index_writer.commit()?;
 
     let reader = index.reader()?;
@@ -9,7 +9,7 @@ fn main() -> tantivy::Result<()> {
     let mut schema_builder = Schema::builder();
 
     let title = schema_builder.add_text_field("title", STORED);
-    let ingredient = schema_builder.add_facet_field("ingredient", FacetOptions::default());
+    let ingredient = schema_builder.add_facet_field("ingredient", INDEXED);
 
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);

@@ -20,14 +20,14 @@ fn main() -> tantivy::Result<()> {
         title => "Fried egg",
         ingredient => Facet::from("/ingredient/egg"),
         ingredient => Facet::from("/ingredient/oil"),
-    ))?;
+    ));
     index_writer.add_document(doc!(
         title => "Scrambled egg",
         ingredient => Facet::from("/ingredient/egg"),
         ingredient => Facet::from("/ingredient/butter"),
         ingredient => Facet::from("/ingredient/milk"),
         ingredient => Facet::from("/ingredient/salt"),
-    ))?;
+    ));
     index_writer.add_document(doc!(
         title => "Egg rolls",
         ingredient => Facet::from("/ingredient/egg"),

@@ -36,7 +36,7 @@ fn main() -> tantivy::Result<()> {
         ingredient => Facet::from("/ingredient/oil"),
         ingredient => Facet::from("/ingredient/tortilla-wrap"),
         ingredient => Facet::from("/ingredient/mushroom"),
-    ))?;
+    ));
     index_writer.commit()?;
 
     let reader = index.reader()?;
@@ -7,7 +7,7 @@ use tantivy::query::RangeQuery;
 use tantivy::schema::{Schema, INDEXED};
 use tantivy::{doc, Index, Result};
 
-fn main() -> Result<()> {
+fn run() -> Result<()> {
     // For the sake of simplicity, this schema will only have 1 field
     let mut schema_builder = Schema::builder();
 

@@ -19,7 +19,7 @@ fn main() -> Result<()> {
     {
         let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
         for year in 1950u64..2019u64 {
-            index_writer.add_document(doc!(year_field => year))?;
+            index_writer.add_document(doc!(year_field => year));
         }
         index_writer.commit()?;
         // The index will be a range of years

@@ -33,3 +33,7 @@ fn main() -> Result<()> {
     assert_eq!(num_60s_books, 10);
     Ok(())
 }
+
+fn main() {
+    run().unwrap()
+}
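
For reference, a compact runnable variant of the range-search example these hunks patch, written against the right-hand (0.16-era) calls shown in the diff. `RangeQuery::new_u64(field, range)` is assumed to be the constructor the original file uses; verify it against the tantivy version in use.

```rust
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let year_field = schema_builder.add_u64_field("year", INDEXED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);
    {
        let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
        for year in 1950u64..2019u64 {
            index_writer.add_document(doc!(year_field => year));
        }
        index_writer.commit()?;
    }

    let reader = index.reader()?;
    let searcher = reader.searcher();

    // Count every document whose `year` falls in the sixties.
    let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
    let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
    assert_eq!(num_60s_books, 10);
    println!("{} books from the sixties", num_60s_books);
    Ok(())
}
```
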
@@ -25,9 +25,9 @@ fn main() -> tantivy::Result<()> {
     let index = Index::create_in_ram(schema);
 
     let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
-    index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
-    index_writer.add_document(doc!(title => "Of Mice and Men"))?;
-    index_writer.add_document(doc!(title => "The modern Promotheus"))?;
+    index_writer.add_document(doc!(title => "The Old Man and the Sea"));
+    index_writer.add_document(doc!(title => "Of Mice and Men"));
+    index_writer.add_document(doc!(title => "The modern Promotheus"));
     index_writer.commit()?;
 
     let reader = index.reader()?;
@@ -29,7 +29,7 @@ use std::sync::{Arc, RwLock};
|
|||||||
use std::thread;
|
use std::thread;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tantivy::schema::{Schema, STORED, TEXT};
|
use tantivy::schema::{Schema, STORED, TEXT};
|
||||||
use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError};
|
use tantivy::{doc, Index, IndexWriter, Opstamp};
|
||||||
|
|
||||||
fn main() -> tantivy::Result<()> {
|
fn main() -> tantivy::Result<()> {
|
||||||
// # Defining the schema
|
// # Defining the schema
|
||||||
@@ -59,11 +59,10 @@ fn main() -> tantivy::Result<()> {
|
|||||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||||
limbs and branches that arch over the pool"
|
limbs and branches that arch over the pool"
|
||||||
))?;
|
));
|
||||||
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
|
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
|
||||||
thread::sleep(Duration::from_millis(20));
|
thread::sleep(Duration::from_millis(20));
|
||||||
}
|
}
|
||||||
Result::<(), TantivyError>::Ok(())
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// # Second indexing thread.
|
// # Second indexing thread.
|
||||||
@@ -79,12 +78,11 @@ fn main() -> tantivy::Result<()> {
|
|||||||
index_writer_rlock.add_document(doc!(
|
index_writer_rlock.add_document(doc!(
|
||||||
title => "Manufacturing consent",
|
title => "Manufacturing consent",
|
||||||
body => "Some great book description..."
|
body => "Some great book description..."
|
||||||
))?
|
))
|
||||||
};
|
};
|
||||||
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
|
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
|
||||||
thread::sleep(Duration::from_millis(10));
|
thread::sleep(Duration::from_millis(10));
|
||||||
}
|
}
|
||||||
Result::<(), TantivyError>::Ok(())
|
|
||||||
});
|
});
|
||||||
|
|
||||||
// # In the main thread, we commit 10 times, once every 500ms.
|
// # In the main thread, we commit 10 times, once every 500ms.
|
||||||
@@ -92,7 +90,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
let opstamp: Opstamp = {
|
let opstamp: Opstamp = {
|
||||||
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
|
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
|
||||||
let mut index_writer_wlock = index_writer.write().unwrap();
|
let mut index_writer_wlock = index_writer.write().unwrap();
|
||||||
index_writer_wlock.commit()?
|
index_writer_wlock.commit().unwrap()
|
||||||
};
|
};
|
||||||
println!("committed with opstamp {}", opstamp);
|
println!("committed with opstamp {}", opstamp);
|
||||||
thread::sleep(Duration::from_millis(500));
|
thread::sleep(Duration::from_millis(500));
|
||||||
|
|||||||
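The hunks above come from the shared-writer example: several threads add documents while holding only a read lock on an `Arc<RwLock<IndexWriter>>`, and the main thread takes the write lock just long enough to commit. A condensed sketch of that locking pattern, assuming the newer fallible `add_document` (thread count, field name, and buffer size are illustrative):

```rust
use std::sync::{Arc, RwLock};
use std::thread;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let index_writer: Arc<RwLock<IndexWriter>> =
        Arc::new(RwLock::new(index.writer(50_000_000)?));

    let writer_clone = Arc::clone(&index_writer);
    let handle = thread::spawn(move || -> tantivy::Result<()> {
        for _ in 0..10 {
            // Adding documents only needs the read lock, so many threads can index concurrently.
            let opstamp = writer_clone
                .read()
                .unwrap()
                .add_document(doc!(body => "some text"))?;
            println!("added doc, opstamp {}", opstamp);
        }
        Ok(())
    });
    handle.join().unwrap()?;

    // Committing needs the write lock and therefore blocks the indexing threads.
    index_writer.write().unwrap().commit()?;
    Ok(())
}
```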
@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
     let old_man_doc = doc!(title => title_tok, body => body_tok);

     // ... now let's just add it to the IndexWriter
-    index_writer.add_document(old_man_doc)?;
+    index_writer.add_document(old_man_doc);

     // Pretokenized text can also be fed as JSON
     let short_man_json = r#"{
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {

     let short_man_doc = schema.parse_document(short_man_json)?;

-    index_writer.add_document(short_man_doc)?;
+    index_writer.add_document(short_man_doc);

     // Let's commit changes
     index_writer.commit()?;
@@ -106,7 +106,9 @@ fn main() -> tantivy::Result<()> {
         IndexRecordOption::Basic,
     );

-    let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
+    let (top_docs, count) = searcher
+        .search(&query, &(TopDocs::with_limit(2), Count))
+        .unwrap();

     assert_eq!(count, 2);

@@ -127,7 +129,9 @@ fn main() -> tantivy::Result<()> {
         IndexRecordOption::Basic,
     );

-    let (_top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
+    let (_top_docs, count) = searcher
+        .search(&query, &(TopDocs::with_limit(2), Count))
+        .unwrap();

     assert_eq!(count, 0);

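The two hunks above reshape the same call: `Searcher::search` accepts a tuple of collectors and returns the matching tuple of results, which is why `&(TopDocs::with_limit(2), Count)` yields both the top documents and the total hit count in a single pass. A minimal sketch of that tuple-collector call, assuming the newer fallible writer API (titles and query text are illustrative):

```rust
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(title => "Of Mice and Men"))?;
    index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("old man")?;

    // A tuple of collectors is itself a collector; the fruits come back as a tuple.
    let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count))?;
    println!("{} matching docs, {} returned", count, top_docs.len());
    Ok(())
}
```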
@@ -40,7 +40,7 @@ fn main() -> tantivy::Result<()> {
             fresh and green with every spring, carrying in their lower leaf junctures the \
             debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
             limbs and branches that arch over the pool"
-    ))?;
+    ));
     // ...
     index_writer.commit()?;

@@ -70,13 +70,13 @@ fn highlight(snippet: Snippet) -> String {
     let mut start_from = 0;

     for fragment_range in snippet.highlighted() {
-        result.push_str(&snippet.fragment()[start_from..fragment_range.start]);
+        result.push_str(&snippet.fragments()[start_from..fragment_range.start]);
         result.push_str(" --> ");
-        result.push_str(&snippet.fragment()[fragment_range.clone()]);
+        result.push_str(&snippet.fragments()[fragment_range.clone()]);
         result.push_str(" <-- ");
         start_from = fragment_range.end;
     }

-    result.push_str(&snippet.fragment()[start_from..]);
+    result.push_str(&snippet.fragments()[start_from..]);
     result
 }
@@ -68,7 +68,7 @@ fn main() -> tantivy::Result<()> {
         title => "The Old Man and the Sea",
         body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
         he had gone eighty-four days now without taking a fish."
-    ))?;
+    ));

     index_writer.add_document(doc!(
         title => "Of Mice and Men",
@@ -80,7 +80,7 @@ fn main() -> tantivy::Result<()> {
         fresh and green with every spring, carrying in their lower leaf junctures the \
         debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
         limbs and branches that arch over the pool"
-    ))?;
+    ));

     index_writer.add_document(doc!(
         title => "Frankenstein",
@@ -88,7 +88,7 @@ fn main() -> tantivy::Result<()> {
         enterprise which you have regarded with such evil forebodings. I arrived here \
         yesterday, and my first task is to assure my dear sister of my welfare and \
         increasing confidence in the success of my undertaking."
-    ))?;
+    ));

     index_writer.commit()?;

@@ -1,223 +0,0 @@
-use std::cmp::Reverse;
-use std::collections::{HashMap, HashSet};
-use std::sync::{Arc, RwLock, Weak};
-
-use tantivy::collector::TopDocs;
-use tantivy::fastfield::FastFieldReader;
-use tantivy::query::QueryParser;
-use tantivy::schema::{Field, Schema, FAST, TEXT};
-use tantivy::{doc, DocAddress, DocId, Index, IndexReader, SegmentReader, TrackedObject};
-use tantivy::{Opstamp, Searcher, SearcherGeneration, SegmentId, Warmer};
-
-// This example shows how warmers can be used to
-// load a values from an external sources using the Warmer API.
-//
-// In this example, we assume an e-commerce search engine.
-
-type ProductId = u64;
-
-/// Price
-type Price = u32;
-
-pub trait PriceFetcher: Send + Sync + 'static {
-    fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
-}
-
-struct DynamicPriceColumn {
-    field: Field,
-    price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
-    price_fetcher: Box<dyn PriceFetcher>,
-}
-
-impl DynamicPriceColumn {
-    pub fn with_product_id_field<T: PriceFetcher>(field: Field, price_fetcher: T) -> Self {
-        DynamicPriceColumn {
-            field,
-            price_cache: Default::default(),
-            price_fetcher: Box::new(price_fetcher),
-        }
-    }
-
-    pub fn price_for_segment(&self, segment_reader: &SegmentReader) -> Option<Arc<Vec<Price>>> {
-        let segment_key = (segment_reader.segment_id(), segment_reader.delete_opstamp());
-        self.price_cache.read().unwrap().get(&segment_key).cloned()
-    }
-}
-impl Warmer for DynamicPriceColumn {
-    fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
-        for segment in searcher.segment_readers() {
-            let key = (segment.segment_id(), segment.delete_opstamp());
-            let product_id_reader = segment.fast_fields().u64(self.field)?;
-            let product_ids: Vec<ProductId> = segment
-                .doc_ids_alive()
-                .map(|doc| product_id_reader.get(doc))
-                .collect();
-            let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
-            let mut price_vals: Vec<Price> = Vec::new();
-            for doc in 0..segment.max_doc() {
-                if segment.is_deleted(doc) {
-                    price_vals.push(0);
-                } else {
-                    price_vals.push(prices_it.next().unwrap())
-                }
-            }
-            self.price_cache
-                .write()
-                .unwrap()
-                .insert(key, Arc::new(price_vals));
-        }
-        Ok(())
-    }
-
-    fn garbage_collect(&self, live_generations: &[TrackedObject<SearcherGeneration>]) {
-        let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
-            live_generations
-                .iter()
-                .flat_map(|gen| gen.segments())
-                .map(|(&segment_id, &opstamp)| (segment_id, opstamp))
-                .collect();
-        let mut price_cache_wrt = self.price_cache.write().unwrap();
-        // let price_cache = std::mem::take(&mut *price_cache_wrt);
-        // Drain would be nicer here.
-        *price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
-            .into_iter()
-            .filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
-            .collect();
-    }
-}
-
-/// For the sake of this example, the table is just an editable HashMap behind a RwLock.
-/// This map represents a map (ProductId -> Price)
-///
-/// In practise, it could be fetching things from an external service, like a SQL table.
-///
-#[derive(Default, Clone)]
-pub struct ExternalPriceTable {
-    prices: Arc<RwLock<HashMap<ProductId, Price>>>,
-}
-
-impl ExternalPriceTable {
-    pub fn update_price(&self, product_id: ProductId, price: Price) {
-        let mut prices_wrt = self.prices.write().unwrap();
-        prices_wrt.insert(product_id, price);
-    }
-}
-
-impl PriceFetcher for ExternalPriceTable {
-    fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
-        let prices_read = self.prices.read().unwrap();
-        product_ids
-            .iter()
-            .map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
-            .collect()
-    }
-}
-
-fn main() -> tantivy::Result<()> {
-    // Declaring our schema.
-    let mut schema_builder = Schema::builder();
-    // The product id is assumed to be a primary id for our external price source.
-    let product_id = schema_builder.add_u64_field("product_id", FAST);
-    let text = schema_builder.add_text_field("text", TEXT);
-    let schema: Schema = schema_builder.build();
-
-    let price_table = ExternalPriceTable::default();
-    let price_dynamic_column = Arc::new(DynamicPriceColumn::with_product_id_field(
-        product_id,
-        price_table.clone(),
-    ));
-    price_table.update_price(OLIVE_OIL, 12);
-    price_table.update_price(GLOVES, 13);
-    price_table.update_price(SNEAKERS, 80);
-
-    const OLIVE_OIL: ProductId = 323423;
-    const GLOVES: ProductId = 3966623;
-    const SNEAKERS: ProductId = 23222;
-
-    let index = Index::create_in_ram(schema);
-    let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
-    writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
-    writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
-    writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
-    writer.commit()?;
-
-    let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
-        &(price_dynamic_column.clone() as Arc<dyn Warmer>),
-    )];
-    let reader: IndexReader = index
-        .reader_builder()
-        .warmers(warmers)
-        .num_searchers(1)
-        .try_into()?;
-    reader.reload()?;
-
-    let query_parser = QueryParser::for_index(&index, vec![text]);
-    let query = query_parser.parse_query("cooking")?;
-
-    let searcher = reader.searcher();
-    let score_by_price = move |segment_reader: &SegmentReader| {
-        let price = price_dynamic_column
-            .price_for_segment(segment_reader)
-            .unwrap();
-        move |doc_id: DocId| Reverse(price[doc_id as usize])
-    };
-
-    let most_expensive_first = TopDocs::with_limit(10).custom_score(score_by_price);
-
-    let hits = searcher.search(&query, &most_expensive_first)?;
-    assert_eq!(
-        &hits,
-        &[
-            (
-                Reverse(12u32),
-                DocAddress {
-                    segment_ord: 0,
-                    doc_id: 0u32
-                }
-            ),
-            (
-                Reverse(13u32),
-                DocAddress {
-                    segment_ord: 0,
-                    doc_id: 1u32
-                }
-            ),
-        ]
-    );
-
-    // Olive oil just got more expensive!
-    price_table.update_price(OLIVE_OIL, 15);
-
-    // The price update are directly reflected on `reload`.
-    //
-    // Be careful here though!...
-    // You may have spotted that we are still using the same `Searcher`.
-    //
-    // It is up to the `Warmer` implementer to decide how
-    // to control this behavior.
-
-    reader.reload()?;
-
-    let hits_with_new_prices = searcher.search(&query, &most_expensive_first)?;
-    assert_eq!(
-        &hits_with_new_prices,
-        &[
-            (
-                Reverse(13u32),
-                DocAddress {
-                    segment_ord: 0,
-                    doc_id: 1u32
-                }
-            ),
-            (
-                Reverse(15u32),
-                DocAddress {
-                    segment_ord: 0,
-                    doc_id: 0u32
-                }
-            ),
-        ]
-    );
-
-    Ok(())
-}
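The entire warmer example above exists only on the warming side of this compare; the older branch drops it. The core idea is that a `Warmer` is handed to the `IndexReaderBuilder` as a `Weak` trait object and gets a chance to (re)compute per-segment data on every `reload()`, keyed by `(SegmentId, Option<Opstamp>)`. A skeletal sketch of that registration, assuming the warming branch's `Warmer`/`warmers()` API (the `NoopWarmer` type and `build_reader` helper are illustrative, and this API is not part of released tantivy at this point in the diff):

```rust
use std::sync::{Arc, Weak};
use tantivy::{Index, IndexReader, Searcher, SearcherGeneration, TrackedObject, Warmer};

struct NoopWarmer;

impl Warmer for NoopWarmer {
    fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
        // Precompute any per-segment data here, keyed by
        // (segment_reader.segment_id(), segment_reader.delete_opstamp()).
        println!("warming {} segments", searcher.segment_readers().len());
        Ok(())
    }

    fn garbage_collect(&self, _live_generations: &[TrackedObject<SearcherGeneration>]) {
        // Drop cache entries whose (SegmentId, Option<Opstamp>) key is no longer live.
    }
}

// The caller keeps the strong Arc alive; the reader only holds a Weak reference,
// mirroring the Arc::downgrade pattern in the deleted example above.
fn build_reader(index: &Index, warmer: &Arc<dyn Warmer>) -> tantivy::Result<IndexReader> {
    let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(warmer)];
    index.reader_builder().warmers(warmers).try_into()
}
```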
@@ -1,7 +1,7 @@
 [package]
 authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
 name = "ownedbytes"
-version = "0.2.0"
+version = "0.1.0"
 edition = "2018"
 description = "Expose data as static slice"
 license = "MIT"
@@ -1,5 +1,3 @@
-#![allow(clippy::return_self_not_must_use)]
-
 use stable_deref_trait::StableDeref;
 use std::convert::TryInto;
 use std::mem;
@@ -37,8 +35,6 @@ impl OwnedBytes {
     }

     /// creates a fileslice that is just a view over a slice of the data.
-    #[must_use]
-    #[inline]
     pub fn slice(&self, range: Range<usize>) -> Self {
         OwnedBytes {
             data: &self.data[range],
@@ -67,8 +63,6 @@ impl OwnedBytes {
     /// On the other hand, both `left` and `right` retain a handle over
     /// the entire slice of memory. In other words, the memory will only
     /// be released when both left and right are dropped.
-    #[inline]
-    #[must_use]
     pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
         let right_box_stable_deref = self.box_stable_deref.clone();
         let left = OwnedBytes {
@@ -82,19 +76,6 @@ impl OwnedBytes {
         (left, right)
     }

-    /// Splits the right part of the `OwnedBytes` at the given offset.
-    ///
-    /// `self` is truncated to `split_len`, left with the remaining bytes.
-    pub fn split_off(&mut self, split_len: usize) -> OwnedBytes {
-        let right_box_stable_deref = self.box_stable_deref.clone();
-        let right_piece = OwnedBytes {
-            data: &self.data[split_len..],
-            box_stable_deref: right_box_stable_deref,
-        };
-        self.data = &self.data[..split_len];
-        right_piece
-    }
-
     /// Returns true iff this `OwnedBytes` is empty.
     #[inline]
     pub fn is_empty(&self) -> bool {
@@ -103,6 +84,7 @@ impl OwnedBytes {

     /// Drops the left most `advance_len` bytes.
     ///
+    /// See also [.clip(clip_len: usize))](#method.clip).
     #[inline]
     pub fn advance(&mut self, advance_len: usize) {
         self.data = &self.data[advance_len..]
@@ -142,35 +124,6 @@ impl fmt::Debug for OwnedBytes {
     }
 }

-impl PartialEq for OwnedBytes {
-    fn eq(&self, other: &OwnedBytes) -> bool {
-        self.as_slice() == other.as_slice()
-    }
-}
-
-impl Eq for OwnedBytes {}
-
-impl PartialEq<[u8]> for OwnedBytes {
-    fn eq(&self, other: &[u8]) -> bool {
-        self.as_slice() == other
-    }
-}
-
-impl PartialEq<str> for OwnedBytes {
-    fn eq(&self, other: &str) -> bool {
-        self.as_slice() == other.as_bytes()
-    }
-}
-
-impl<'a, T: ?Sized> PartialEq<&'a T> for OwnedBytes
-where
-    OwnedBytes: PartialEq<T>,
-{
-    fn eq(&self, other: &&'a T) -> bool {
-        *self == **other
-    }
-}
-
 impl Deref for OwnedBytes {
     type Target = [u8];

@@ -334,14 +287,4 @@ mod tests {
         assert_eq!(right.as_slice(), b"");
     }
 }
-
-    #[test]
-    fn test_split_off() {
-        let mut data = OwnedBytes::new(b"abcdef".as_ref());
-        assert_eq!(data, "abcdef");
-        assert_eq!(data.split_off(2), "cdef");
-        assert_eq!(data, "ab");
-        assert_eq!(data.split_off(1), "b");
-        assert_eq!(data, "a");
-    }
 }
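The substantive removals in `ownedbytes` are `split_off`, the `PartialEq` impls (which the removed test relies on to compare against `&str`), and the `#[inline]`/`#[must_use]` attributes: the older branch simply predates them. The removed `split_off` keeps the first `split_len` bytes in `self` and hands back the tail, with both halves sharing the underlying allocation. A small standalone sketch of the same observable contract, using `Vec::split_off` rather than `OwnedBytes` itself:

```rust
fn main() {
    // Same observable behaviour as the removed test_split_off above, shown with
    // Vec::split_off. OwnedBytes::split_off avoids the copy by cloning the shared
    // `box_stable_deref` handle and re-slicing, instead of moving bytes around.
    let mut data: Vec<u8> = b"abcdef".to_vec();
    assert_eq!(data.split_off(2), b"cdef");
    assert_eq!(data, b"ab");
    assert_eq!(data.split_off(1), b"b");
    assert_eq!(data, b"a");
}
```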
@@ -5,9 +5,9 @@ authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
 description = """Search engine library"""
-documentation = "https://quickwit-inc.github.io/tantivy/tantivy/index.html"
-homepage = "https://github.com/quickwit-inc/tantivy"
-repository = "https://github.com/quickwit-inc/tantivy"
+documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
+homepage = "https://github.com/tantivy-search/tantivy"
+repository = "https://github.com/tantivy-search/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2018"
@@ -91,7 +91,6 @@ pub enum UserInputAst {
 }

 impl UserInputAst {
-    #[must_use]
     pub fn unary(self, occur: Occur) -> UserInputAst {
         UserInputAst::Clause(vec![(Some(occur), self)])
     }
@@ -20,10 +20,10 @@ use crate::SegmentReader;
 /// let index = Index::create_in_ram(schema);
 ///
 /// let mut index_writer = index.writer(3_000_000).unwrap();
-/// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap();
-/// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap();
-/// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap();
-/// index_writer.add_document(doc!(title => "The Diary of a Young Girl")).unwrap();
+/// index_writer.add_document(doc!(title => "The Name of the Wind"));
+/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
+/// index_writer.add_document(doc!(title => "A Dairy Cow"));
+/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
 /// assert!(index_writer.commit().is_ok());
 ///
 /// let reader = index.reader().unwrap();
@@ -83,7 +83,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
 /// ```rust
 /// use tantivy::collector::FacetCollector;
 /// use tantivy::query::AllQuery;
-/// use tantivy::schema::{Facet, Schema, FacetOptions, TEXT};
+/// use tantivy::schema::{Facet, Schema, INDEXED, TEXT};
 /// use tantivy::{doc, Index};
 ///
 /// fn example() -> tantivy::Result<()> {
@@ -92,7 +92,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
 ///     // Facet have their own specific type.
 ///     // It is not a bad practise to put all of your
 ///     // facet information in the same field.
-///     let facet = schema_builder.add_facet_field("facet", FacetOptions::default());
+///     let facet = schema_builder.add_facet_field("facet", INDEXED);
 ///     let title = schema_builder.add_text_field("title", TEXT);
 ///     let schema = schema_builder.build();
 ///     let index = Index::create_in_ram(schema);
@@ -103,23 +103,23 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
 ///         title => "The Name of the Wind",
 ///         facet => Facet::from("/lang/en"),
 ///         facet => Facet::from("/category/fiction/fantasy")
-///     ))?;
+///     ));
 ///     index_writer.add_document(doc!(
 ///         title => "Dune",
 ///         facet => Facet::from("/lang/en"),
 ///         facet => Facet::from("/category/fiction/sci-fi")
-///     ))?;
+///     ));
 ///     index_writer.add_document(doc!(
 ///         title => "La Vénus d'Ille",
 ///         facet => Facet::from("/lang/fr"),
 ///         facet => Facet::from("/category/fiction/fantasy"),
 ///         facet => Facet::from("/category/fiction/horror")
-///     ))?;
+///     ));
 ///     index_writer.add_document(doc!(
 ///         title => "The Diary of a Young Girl",
 ///         facet => Facet::from("/lang/en"),
 ///         facet => Facet::from("/category/biography")
-///     ))?;
+///     ));
 ///     index_writer.commit()?;
 /// }
 /// let reader = index.reader()?;
@@ -400,7 +400,7 @@ impl<'a> Iterator for FacetChildIterator<'a> {

 impl FacetCounts {
     /// Returns an iterator over all of the facet count pairs inside this result.
-    /// See the documentation for [FacetCollector] for a usage example.
+    /// See the documentation for `FacetCollector` for a usage example.
     pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_>
     where
         Facet: From<T>,
@@ -421,7 +421,7 @@ impl FacetCounts {
     }

     /// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts.
-    /// See the documentation for [FacetCollector] for a usage example.
+    /// See the documentation for `FacetCollector` for a usage example.
     pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
     where
         Facet: From<T>,
@@ -462,7 +462,7 @@ mod tests {
     use crate::collector::Count;
     use crate::core::Index;
     use crate::query::{AllQuery, QueryParser, TermQuery};
-    use crate::schema::{Document, Facet, FacetOptions, Field, IndexRecordOption, Schema};
+    use crate::schema::{Document, Facet, Field, IndexRecordOption, Schema, INDEXED};
     use crate::Term;
     use rand::distributions::Uniform;
     use rand::prelude::SliceRandom;
@@ -470,13 +470,13 @@ mod tests {
     use std::iter;

     #[test]
-    fn test_facet_collector_drilldown() -> crate::Result<()> {
+    fn test_facet_collector_drilldown() {
         let mut schema_builder = Schema::builder();
-        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+        let facet_field = schema_builder.add_facet_field("facet", INDEXED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);

-        let mut index_writer = index.writer_for_tests()?;
+        let mut index_writer = index.writer_for_tests().unwrap();
         let num_facets: usize = 3 * 4 * 5;
         let facets: Vec<Facet> = (0..num_facets)
             .map(|mut n| {
@@ -491,14 +491,14 @@ mod tests {
         for i in 0..num_facets * 10 {
             let mut doc = Document::new();
             doc.add_facet(facet_field, facets[i % num_facets].clone());
-            index_writer.add_document(doc)?;
+            index_writer.add_document(doc);
         }
-        index_writer.commit()?;
-        let reader = index.reader()?;
+        index_writer.commit().unwrap();
+        let reader = index.reader().unwrap();
         let searcher = reader.searcher();
         let mut facet_collector = FacetCollector::for_field(facet_field);
         facet_collector.add_facet(Facet::from("/top1"));
-        let counts = searcher.search(&AllQuery, &facet_collector)?;
+        let counts = searcher.search(&AllQuery, &facet_collector).unwrap();

         {
             let facets: Vec<(String, u64)> = counts
@@ -518,7 +518,6 @@ mod tests {
                 .collect::<Vec<_>>()
             );
         }
-        Ok(())
     }

     #[test]
@@ -531,49 +530,48 @@ mod tests {
     }

     #[test]
-    fn test_doc_unsorted_multifacet() -> crate::Result<()> {
+    fn test_doc_unsorted_multifacet() {
         let mut schema_builder = Schema::builder();
-        let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default());
+        let facet_field = schema_builder.add_facet_field("facets", INDEXED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_for_tests()?;
+        let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
             facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
             facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
             facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
-        ))?;
-        index_writer.commit()?;
-        let reader = index.reader()?;
+        ));
+        index_writer.commit().unwrap();
+        let reader = index.reader().unwrap();
         let searcher = reader.searcher();
         assert_eq!(searcher.num_docs(), 1);
         let mut facet_collector = FacetCollector::for_field(facet_field);
         facet_collector.add_facet("/subjects");
-        let counts = searcher.search(&AllQuery, &facet_collector)?;
+        let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
         let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
         assert_eq!(facets[0].1, 1);
-        Ok(())
     }

     #[test]
     fn test_doc_search_by_facet() -> crate::Result<()> {
         let mut schema_builder = Schema::builder();
-        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+        let facet_field = schema_builder.add_facet_field("facet", INDEXED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_for_tests()?;
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/A/A").unwrap(),
-        ))?;
+        ));
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/A/B").unwrap(),
-        ))?;
+        ));
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/A/C/A").unwrap(),
-        ))?;
+        ));
         index_writer.add_document(doc!(
             facet_field => Facet::from_text(&"/D/C/A").unwrap(),
-        ))?;
+        ));
         index_writer.commit()?;
         let reader = index.reader()?;
         let searcher = reader.searcher();
@@ -615,7 +613,7 @@ mod tests {
     #[test]
     fn test_facet_collector_topk() {
         let mut schema_builder = Schema::builder();
-        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+        let facet_field = schema_builder.add_facet_field("facet", INDEXED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);

@@ -639,7 +637,7 @@ mod tests {

         let mut index_writer = index.writer_for_tests().unwrap();
         for doc in docs {
-            index_writer.add_document(doc).unwrap();
+            index_writer.add_document(doc);
         }
         index_writer.commit().unwrap();
         let searcher = index.reader().unwrap().searcher();
@@ -664,7 +662,7 @@ mod tests {
     #[test]
     fn test_facet_collector_topk_tie_break() -> crate::Result<()> {
         let mut schema_builder = Schema::builder();
-        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+        let facet_field = schema_builder.add_facet_field("facet", INDEXED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);

@@ -679,7 +677,7 @@ mod tests {

         let mut index_writer = index.writer_for_tests()?;
         for doc in docs {
-            index_writer.add_document(doc)?;
+            index_writer.add_document(doc);
         }
         index_writer.commit()?;

@@ -727,7 +725,7 @@ mod bench {

         let mut index_writer = index.writer_for_tests().unwrap();
         for doc in docs {
-            index_writer.add_document(doc).unwrap();
+            index_writer.add_document(doc);
        }
         index_writer.commit().unwrap();
         let reader = index.reader().unwrap();
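Every facet hunk above is the same rename: the newer side declares facet fields with `FacetOptions::default()`, while the older side still passes the `INDEXED` flag to `add_facet_field`. A minimal schema sketch in the newer style (field names are illustrative):

```rust
use tantivy::schema::{Facet, FacetOptions, Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // Newer API: facet fields take a FacetOptions value instead of the INDEXED flag.
    let category = schema_builder.add_facet_field("category", FacetOptions::default());
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        title => "Dune",
        category => Facet::from("/category/fiction/sci-fi")
    ))?;
    index_writer.commit()?;
    Ok(())
}
```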
@@ -16,7 +16,7 @@ use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
 use crate::schema::Field;
 use crate::{Score, SegmentReader, TantivyError};

-/// The `FilterCollector` filters docs using a fast field value and a predicate.
+/// The `FilterCollector` collector filters docs using a fast field value and a predicate.
 /// Only the documents for which the predicate returned "true" will be passed on to the next collector.
 ///
 /// ```rust
@@ -25,37 +25,34 @@ use crate::{Score, SegmentReader, TantivyError};
 /// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
 /// use tantivy::{doc, DocAddress, Index};
 ///
-/// # fn main() -> tantivy::Result<()> {
 /// let mut schema_builder = Schema::builder();
 /// let title = schema_builder.add_text_field("title", TEXT);
 /// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
 /// let schema = schema_builder.build();
 /// let index = Index::create_in_ram(schema);
 ///
-/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
-/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64))?;
-/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64))?;
-/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64))?;
-/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64))?;
-/// index_writer.commit()?;
+/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
+/// index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64));
+/// index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64));
+/// index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64));
+/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64));
+/// assert!(index_writer.commit().is_ok());
 ///
-/// let reader = index.reader()?;
+/// let reader = index.reader().unwrap();
 /// let searcher = reader.searcher();
 ///
 /// let query_parser = QueryParser::for_index(&index, vec![title]);
-/// let query = query_parser.parse_query("diary")?;
+/// let query = query_parser.parse_query("diary").unwrap();
 /// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
-/// let top_docs = searcher.search(&query, &no_filter_collector)?;
+/// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
 ///
 /// assert_eq!(top_docs.len(), 1);
 /// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
 ///
 /// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
-/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
+/// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
 ///
 /// assert_eq!(filtered_top_docs.len(), 0);
-/// # Ok(())
-/// # }
 /// ```
 pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
 where
@@ -226,10 +226,10 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
-        writer.add_document(doc!(val_field=>12i64))?;
-        writer.add_document(doc!(val_field=>-30i64))?;
-        writer.add_document(doc!(val_field=>-12i64))?;
-        writer.add_document(doc!(val_field=>-10i64))?;
+        writer.add_document(doc!(val_field=>12i64));
+        writer.add_document(doc!(val_field=>-30i64));
+        writer.add_document(doc!(val_field=>-12i64));
+        writer.add_document(doc!(val_field=>-10i64));
         writer.commit()?;
         let reader = index.reader()?;
         let searcher = reader.searcher();
@@ -247,13 +247,13 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
-        writer.add_document(doc!(val_field=>12i64))?;
+        writer.add_document(doc!(val_field=>12i64));
         writer.commit()?;
-        writer.add_document(doc!(val_field=>-30i64))?;
+        writer.add_document(doc!(val_field=>-30i64));
         writer.commit()?;
-        writer.add_document(doc!(val_field=>-12i64))?;
+        writer.add_document(doc!(val_field=>-12i64));
         writer.commit()?;
-        writer.add_document(doc!(val_field=>-10i64))?;
+        writer.add_document(doc!(val_field=>-10i64));
         writer.commit()?;
         let reader = index.reader()?;
         let searcher = reader.searcher();
@@ -271,9 +271,9 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
-        writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)))?;
-        writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)))?;
-        writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)))?;
+        writer.add_document(doc!(date_field=>Utc.ymd(1982, 9, 17).and_hms(0, 0,0)));
+        writer.add_document(doc!(date_field=>Utc.ymd(1986, 3, 9).and_hms(0, 0, 0)));
+        writer.add_document(doc!(date_field=>Utc.ymd(1983, 9, 27).and_hms(0, 0, 0)));
         writer.commit()?;
         let reader = index.reader()?;
         let searcher = reader.searcher();
@@ -48,10 +48,10 @@ use tantivy::collector::{Count, TopDocs};
 # let mut index_writer = index.writer(3_000_000)?;
 # index_writer.add_document(doc!(
 #     title => "The Name of the Wind",
-# ))?;
+# ));
 # index_writer.add_document(doc!(
 #     title => "The Diary of Muadib",
-# ))?;
+# ));
 # index_writer.commit()?;
 # let reader = index.reader()?;
 # let searcher = reader.searcher();
@@ -112,19 +112,19 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
 /// use tantivy::schema::{Schema, TEXT};
 /// use tantivy::{doc, Index};
 ///
-/// # fn main() -> tantivy::Result<()> {
 /// let mut schema_builder = Schema::builder();
 /// let title = schema_builder.add_text_field("title", TEXT);
 /// let schema = schema_builder.build();
 /// let index = Index::create_in_ram(schema);
-/// let mut index_writer = index.writer(3_000_000)?;
-/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
-/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
-/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
-/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
-/// index_writer.commit()?;
 ///
-/// let reader = index.reader()?;
+/// let mut index_writer = index.writer(3_000_000).unwrap();
+/// index_writer.add_document(doc!(title => "The Name of the Wind"));
+/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
+/// index_writer.add_document(doc!(title => "A Dairy Cow"));
+/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
+/// assert!(index_writer.commit().is_ok());
+///
+/// let reader = index.reader().unwrap();
 /// let searcher = reader.searcher();
 ///
 /// let mut collectors = MultiCollector::new();
@@ -139,8 +139,6 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
 ///
 /// assert_eq!(count, 2);
 /// assert_eq!(top_docs.len(), 2);
-/// # Ok(())
-/// # }
 /// ```
 #[allow(clippy::type_complexity)]
 #[derive(Default)]
@@ -254,24 +252,24 @@ mod tests {
     use crate::Term;

     #[test]
-    fn test_multi_collector() -> crate::Result<()> {
+    fn test_multi_collector() {
         let mut schema_builder = Schema::builder();
         let text = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();

         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests()?;
-            index_writer.add_document(doc!(text=>"abc"))?;
-            index_writer.add_document(doc!(text=>"abc abc abc"))?;
-            index_writer.add_document(doc!(text=>"abc abc"))?;
-            index_writer.commit()?;
-            index_writer.add_document(doc!(text=>""))?;
-            index_writer.add_document(doc!(text=>"abc abc abc abc"))?;
-            index_writer.add_document(doc!(text=>"abc"))?;
-            index_writer.commit()?;
+            let mut index_writer = index.writer_for_tests().unwrap();
+            index_writer.add_document(doc!(text=>"abc"));
+            index_writer.add_document(doc!(text=>"abc abc abc"));
+            index_writer.add_document(doc!(text=>"abc abc"));
+            index_writer.commit().unwrap();
+            index_writer.add_document(doc!(text=>""));
+            index_writer.add_document(doc!(text=>"abc abc abc abc"));
+            index_writer.add_document(doc!(text=>"abc"));
+            index_writer.commit().unwrap();
         }
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let term = Term::from_field_text(text, "abc");
         let query = TermQuery::new(term, IndexRecordOption::Basic);

@@ -282,6 +280,5 @@ mod tests {

         assert_eq!(count_handler.extract(&mut multifruits), 5);
         assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
-        Ok(())
     }
 }
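The recurring `# fn main() -> tantivy::Result<()> {` / `# Ok(())` / `# }` lines that the newer side adds to its doc examples are rustdoc hidden lines: a leading `# ` keeps a line out of the rendered documentation while the doctest still compiles and runs it, which is what lets the visible example lines use `?` instead of `.unwrap()`. A tiny sketch of the idiom on a hypothetical function outside tantivy:

```rust
/// Parses a port number.
///
/// ```
/// # fn main() -> Result<(), std::num::ParseIntError> {
/// let port: u16 = "8080".parse()?; // visible in the rendered docs, can use `?`
/// assert_eq!(port, 8080);
/// # Ok(())
/// # }
/// ```
pub fn parse_port(s: &str) -> Result<u16, std::num::ParseIntError> {
    s.parse()
}
```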
@@ -25,7 +25,7 @@ pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
 };

 #[test]
-pub fn test_filter_collector() -> crate::Result<()> {
+pub fn test_filter_collector() {
     let mut schema_builder = Schema::builder();
     let title = schema_builder.add_text_field("title", TEXT);
     let price = schema_builder.add_u64_field("price", FAST);
@@ -33,25 +33,25 @@ pub fn test_filter_collector() -> crate::Result<()> {
     let schema = schema_builder.build();
     let index = Index::create_in_ram(schema);

-    let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
-    index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()))?;
-    index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()))?;
-    index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()))?;
-    index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()))?;
-    index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()))?;
-    index_writer.commit()?;
+    let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
+    index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()));
+    index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()));
+    index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()));
+    index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()));
+    index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()));
+    assert!(index_writer.commit().is_ok());

-    let reader = index.reader()?;
+    let reader = index.reader().unwrap();
     let searcher = reader.searcher();

     let query_parser = QueryParser::for_index(&index, vec![title]);
-    let query = query_parser.parse_query("diary")?;
+    let query = query_parser.parse_query("diary").unwrap();
     let filter_some_collector = FilterCollector::new(
         price,
         &|value: u64| value > 20_120u64,
         TopDocs::with_limit(2),
     );
-    let top_docs = searcher.search(&query, &filter_some_collector)?;
+    let top_docs = searcher.search(&query, &filter_some_collector).unwrap();

     assert_eq!(top_docs.len(), 1);
     assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
@@ -67,10 +67,9 @@ pub fn test_filter_collector() -> crate::Result<()> {
     }

     let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
-    let filtered_date_docs = searcher.search(&query, &filter_dates_collector)?;
+    let filtered_date_docs = searcher.search(&query, &filter_dates_collector).unwrap();

     assert_eq!(filtered_date_docs.len(), 2);
-    Ok(())
 }

 /// Stores all of the doc ids.
@@ -275,8 +274,8 @@ fn make_test_searcher() -> crate::Result<crate::LeasedItem<Searcher>> {
     let schema = Schema::builder().build();
     let index = Index::create_in_ram(schema);
     let mut index_writer = index.writer_for_tests()?;
-    index_writer.add_document(Document::default())?;
-    index_writer.add_document(Document::default())?;
+    index_writer.add_document(Document::default());
+    index_writer.add_document(Document::default());
     index_writer.commit()?;
     Ok(index.reader()?.searcher())
 }
@@ -70,7 +70,9 @@ where
|
|||||||
/// # Panics
|
/// # Panics
|
||||||
/// The method panics if limit is 0
|
/// The method panics if limit is 0
|
||||||
pub fn with_limit(limit: usize) -> TopCollector<T> {
|
pub fn with_limit(limit: usize) -> TopCollector<T> {
|
||||||
assert!(limit >= 1, "Limit must be strictly greater than 0.");
|
if limit < 1 {
|
||||||
|
panic!("Limit must be strictly greater than 0.");
|
||||||
|
}
|
||||||
Self {
|
Self {
|
||||||
limit,
|
limit,
|
||||||
offset: 0,
|
offset: 0,
|
||||||
|
|||||||
@@ -94,30 +94,27 @@ where
 /// use tantivy::schema::{Schema, TEXT};
 /// use tantivy::{doc, DocAddress, Index};
 ///
-/// # fn main() -> tantivy::Result<()> {
 /// let mut schema_builder = Schema::builder();
 /// let title = schema_builder.add_text_field("title", TEXT);
 /// let schema = schema_builder.build();
 /// let index = Index::create_in_ram(schema);
 ///
-/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
-/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
-/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
-/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
-/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
-/// index_writer.commit()?;
+/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
+/// index_writer.add_document(doc!(title => "The Name of the Wind"));
+/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
+/// index_writer.add_document(doc!(title => "A Dairy Cow"));
+/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
+/// assert!(index_writer.commit().is_ok());
 ///
-/// let reader = index.reader()?;
+/// let reader = index.reader().unwrap();
 /// let searcher = reader.searcher();
 ///
 /// let query_parser = QueryParser::for_index(&index, vec![title]);
-/// let query = query_parser.parse_query("diary")?;
-/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
+/// let query = query_parser.parse_query("diary").unwrap();
+/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2)).unwrap();
 ///
 /// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
 /// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
-/// # Ok(())
-/// # }
 /// ```
 pub struct TopDocs(TopCollector<Score>);

@@ -183,34 +180,30 @@ impl TopDocs {
|
|||||||
/// use tantivy::schema::{Schema, TEXT};
|
/// use tantivy::schema::{Schema, TEXT};
|
||||||
/// use tantivy::{doc, DocAddress, Index};
|
/// use tantivy::{doc, DocAddress, Index};
|
||||||
///
|
///
|
||||||
/// # fn main() -> tantivy::Result<()> {
|
|
||||||
/// let mut schema_builder = Schema::builder();
|
/// let mut schema_builder = Schema::builder();
|
||||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||||
/// let schema = schema_builder.build();
|
/// let schema = schema_builder.build();
|
||||||
/// let index = Index::create_in_ram(schema);
|
/// let index = Index::create_in_ram(schema);
|
||||||
///
|
///
|
||||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
|
||||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"))?;
|
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
|
||||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"))?;
|
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
|
||||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"))?;
|
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
|
||||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"))?;
|
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
|
||||||
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"))?;
|
/// index_writer.add_document(doc!(title => "The Diary of Lena Mukhina"));
|
||||||
/// index_writer.commit()?;
|
/// assert!(index_writer.commit().is_ok());
|
||||||
///
|
///
|
||||||
/// let reader = index.reader()?;
|
/// let reader = index.reader().unwrap();
|
||||||
/// let searcher = reader.searcher();
|
/// let searcher = reader.searcher();
|
||||||
///
|
///
|
||||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||||
/// let query = query_parser.parse_query("diary")?;
|
/// let query = query_parser.parse_query("diary").unwrap();
|
||||||
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1))?;
|
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2).and_offset(1)).unwrap();
|
||||||
///
|
///
|
||||||
/// assert_eq!(top_docs.len(), 2);
|
/// assert_eq!(top_docs.len(), 2);
|
||||||
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 4));
|
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 4));
|
||||||
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
|
/// assert_eq!(top_docs[1].1, DocAddress::new(0, 3));
|
||||||
/// Ok(())
|
|
||||||
/// # }
|
|
||||||
/// ```
|
/// ```
|
||||||
#[must_use]
|
|
||||||
pub fn and_offset(self, offset: usize) -> TopDocs {
|
pub fn and_offset(self, offset: usize) -> TopDocs {
|
||||||
TopDocs(self.0.and_offset(offset))
|
TopDocs(self.0.and_offset(offset))
|
||||||
}
|
}
|
||||||
@@ -241,11 +234,11 @@ impl TopDocs {
|
|||||||
/// #
|
/// #
|
||||||
/// # let index = Index::create_in_ram(schema);
|
/// # let index = Index::create_in_ram(schema);
|
||||||
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||||
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64))?;
|
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
|
||||||
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64))?;
|
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
|
||||||
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64))?;
|
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
|
||||||
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64))?;
|
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
|
||||||
/// # index_writer.commit()?;
|
/// # assert!(index_writer.commit().is_ok());
|
||||||
/// # let reader = index.reader()?;
|
/// # let reader = index.reader()?;
|
||||||
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
|
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
|
||||||
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
|
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
|
||||||
@@ -323,9 +316,9 @@ impl TopDocs {
|
|||||||
/// #
|
/// #
|
||||||
/// # let index = Index::create_in_ram(schema);
|
/// # let index = Index::create_in_ram(schema);
|
||||||
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||||
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64))?;
|
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64));
|
||||||
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64))?;
|
/// # index_writer.add_document(doc!(title => "Zozo Cow KKK", rating => 119_000_000i64));
|
||||||
/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64))?;
|
/// # index_writer.add_document(doc!(title => "Declining Cow", rating => -63_000_000i64));
|
||||||
/// # assert!(index_writer.commit().is_ok());
|
/// # assert!(index_writer.commit().is_ok());
|
||||||
/// # let reader = index.reader()?;
|
/// # let reader = index.reader()?;
|
||||||
/// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?;
|
/// # let top_docs = docs_sorted_by_revenue(&reader.searcher(), &AllQuery, rating)?;
|
||||||
@@ -424,9 +417,9 @@ impl TopDocs {
|
|||||||
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||||
/// let product_name = index.schema().get_field("product_name").unwrap();
|
/// let product_name = index.schema().get_field("product_name").unwrap();
|
||||||
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
|
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
|
||||||
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64))?;
|
/// index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
|
||||||
/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64))?;
|
/// index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64));
|
||||||
/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64))?;
|
/// index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64));
|
||||||
/// index_writer.commit()?;
|
/// index_writer.commit()?;
|
||||||
/// Ok(index)
|
/// Ok(index)
|
||||||
/// }
|
/// }
|
||||||
@@ -534,9 +527,9 @@ impl TopDocs {
|
|||||||
/// #
|
/// #
|
||||||
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
|
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
|
||||||
/// let boosted: Field = index.schema().get_field("boosted").unwrap();
|
/// let boosted: Field = index.schema().get_field("boosted").unwrap();
|
||||||
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64))?;
|
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
|
||||||
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64))?;
|
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64));
|
||||||
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64))?;
|
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64));
|
||||||
/// # index_writer.commit()?;
|
/// # index_writer.commit()?;
|
||||||
/// // ...
|
/// // ...
|
||||||
/// # let user_query = "diary";
|
/// # let user_query = "diary";
|
||||||
@@ -720,18 +713,20 @@ mod tests {
     use crate::Score;
     use crate::{DocAddress, DocId, SegmentReader};

-    fn make_index() -> crate::Result<Index> {
+    fn make_index() -> Index {
         let mut schema_builder = Schema::builder();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        // writing the segment
-        let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
-        index_writer.add_document(doc!(text_field=>"Hello happy tax payer."))?;
-        index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"))?;
-        index_writer.add_document(doc!(text_field=>"I like Droopy"))?;
-        index_writer.commit()?;
-        Ok(index)
+        {
+            // writing the segment
+            let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
+            index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
+            index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
+            index_writer.add_document(doc!(text_field=>"I like Droopy"));
+            assert!(index_writer.commit().is_ok());
+        }
+        index
     }

     fn assert_results_equals(results: &[(Score, DocAddress)], expected: &[(Score, DocAddress)]) {
@@ -742,15 +737,17 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_top_collector_not_at_capacity_without_offset() -> crate::Result<()> {
|
fn test_top_collector_not_at_capacity_without_offset() {
|
||||||
let index = make_index()?;
|
let index = make_index();
|
||||||
let field = index.schema().get_field("text").unwrap();
|
let field = index.schema().get_field("text").unwrap();
|
||||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||||
let text_query = query_parser.parse_query("droopy tax")?;
|
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||||
let score_docs: Vec<(Score, DocAddress)> = index
|
let score_docs: Vec<(Score, DocAddress)> = index
|
||||||
.reader()?
|
.reader()
|
||||||
|
.unwrap()
|
||||||
.searcher()
|
.searcher()
|
||||||
.search(&text_query, &TopDocs::with_limit(4))?;
|
.search(&text_query, &TopDocs::with_limit(4))
|
||||||
|
.unwrap();
|
||||||
assert_results_equals(
|
assert_results_equals(
|
||||||
&score_docs,
|
&score_docs,
|
||||||
&[
|
&[
|
||||||
@@ -759,12 +756,11 @@ mod tests {
|
|||||||
(0.48527452, DocAddress::new(0, 0)),
|
(0.48527452, DocAddress::new(0, 0)),
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_top_collector_not_at_capacity_with_offset() {
|
fn test_top_collector_not_at_capacity_with_offset() {
|
||||||
let index = make_index().unwrap();
|
let index = make_index();
|
||||||
let field = index.schema().get_field("text").unwrap();
|
let field = index.schema().get_field("text").unwrap();
|
||||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||||
@@ -779,7 +775,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_top_collector_at_capacity() {
|
fn test_top_collector_at_capacity() {
|
||||||
let index = make_index().unwrap();
|
let index = make_index();
|
||||||
let field = index.schema().get_field("text").unwrap();
|
let field = index.schema().get_field("text").unwrap();
|
||||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||||
@@ -800,7 +796,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_top_collector_at_capacity_with_offset() {
|
fn test_top_collector_at_capacity_with_offset() {
|
||||||
let index = make_index().unwrap();
|
let index = make_index();
|
||||||
let field = index.schema().get_field("text").unwrap();
|
let field = index.schema().get_field("text").unwrap();
|
||||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||||
@@ -821,7 +817,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_top_collector_stable_sorting() {
|
fn test_top_collector_stable_sorting() {
|
||||||
let index = make_index().unwrap();
|
let index = make_index();
|
||||||
|
|
||||||
// using AllQuery to get a constant score
|
// using AllQuery to get a constant score
|
||||||
let searcher = index.reader().unwrap().searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
@@ -852,35 +848,29 @@ mod tests {
|
|||||||
const SIZE: &str = "size";
|
const SIZE: &str = "size";
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_top_field_collector_not_at_capacity() -> crate::Result<()> {
|
fn test_top_field_collector_not_at_capacity() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let (index, query) = index("beer", title, schema, |index_writer| {
|
let (index, query) = index("beer", title, schema, |index_writer| {
|
||||||
index_writer
|
index_writer.add_document(doc!(
|
||||||
.add_document(doc!(
|
title => "bottle of beer",
|
||||||
title => "bottle of beer",
|
size => 12u64,
|
||||||
size => 12u64,
|
));
|
||||||
))
|
index_writer.add_document(doc!(
|
||||||
.unwrap();
|
title => "growler of beer",
|
||||||
index_writer
|
size => 64u64,
|
||||||
.add_document(doc!(
|
));
|
||||||
title => "growler of beer",
|
index_writer.add_document(doc!(
|
||||||
size => 64u64,
|
title => "pint of beer",
|
||||||
))
|
size => 16u64,
|
||||||
.unwrap();
|
));
|
||||||
index_writer
|
|
||||||
.add_document(doc!(
|
|
||||||
title => "pint of beer",
|
|
||||||
size => 16u64,
|
|
||||||
))
|
|
||||||
.unwrap();
|
|
||||||
});
|
});
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
|
|
||||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
|
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
|
||||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
|
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&top_docs[..],
|
&top_docs[..],
|
||||||
&[
|
&[
|
||||||
@@ -889,7 +879,6 @@ mod tests {
|
|||||||
(12, DocAddress::new(0, 0))
|
(12, DocAddress::new(0, 0))
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -905,12 +894,12 @@ mod tests {
|
|||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
name => "Paul Robeson",
|
name => "Paul Robeson",
|
||||||
birthday => pr_birthday
|
birthday => pr_birthday
|
||||||
))?;
|
));
|
||||||
let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?;
|
let mr_birthday = crate::DateTime::from_str("1947-11-08T00:00:00+00:00")?;
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
name => "Minnie Riperton",
|
name => "Minnie Riperton",
|
||||||
birthday => mr_birthday
|
birthday => mr_birthday
|
||||||
))?;
|
));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader()?.searcher();
|
||||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
|
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
|
||||||
@@ -937,11 +926,11 @@ mod tests {
|
|||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
city => "georgetown",
|
city => "georgetown",
|
||||||
altitude => -1i64,
|
altitude => -1i64,
|
||||||
))?;
|
));
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
city => "tokyo",
|
city => "tokyo",
|
||||||
altitude => 40i64,
|
altitude => 40i64,
|
||||||
))?;
|
));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader()?.searcher();
|
||||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
|
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
|
||||||
@@ -967,11 +956,11 @@ mod tests {
|
|||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
city => "georgetown",
|
city => "georgetown",
|
||||||
altitude => -1.0f64,
|
altitude => -1.0f64,
|
||||||
))?;
|
));
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
city => "tokyo",
|
city => "tokyo",
|
||||||
altitude => 40f64,
|
altitude => 40f64,
|
||||||
))?;
|
));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader()?.searcher();
|
||||||
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
|
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
|
||||||
@@ -994,12 +983,10 @@ mod tests {
|
|||||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let (index, _) = index("beer", title, schema, |index_writer| {
|
let (index, _) = index("beer", title, schema, |index_writer| {
|
||||||
index_writer
|
index_writer.add_document(doc!(
|
||||||
.add_document(doc!(
|
title => "bottle of beer",
|
||||||
title => "bottle of beer",
|
size => 12u64,
|
||||||
size => 12u64,
|
));
|
||||||
))
|
|
||||||
.unwrap();
|
|
||||||
});
|
});
|
||||||
let searcher = index.reader().unwrap().searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
|
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
|
||||||
@@ -1016,7 +1003,7 @@ mod tests {
|
|||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests()?;
|
||||||
index_writer.add_document(doc!(size=>1u64))?;
|
index_writer.add_document(doc!(size=>1u64));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader()?.searcher();
|
||||||
let segment = searcher.segment_reader(0);
|
let segment = searcher.segment_reader(0);
|
||||||
@@ -1033,7 +1020,7 @@ mod tests {
|
|||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests()?;
|
||||||
index_writer.add_document(doc!(size=>1u64))?;
|
index_writer.add_document(doc!(size=>1u64));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader()?.searcher();
|
||||||
let segment = searcher.segment_reader(0);
|
let segment = searcher.segment_reader(0);
|
||||||
@@ -1046,26 +1033,30 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_tweak_score_top_collector_with_offset() -> crate::Result<()> {
|
fn test_tweak_score_top_collector_with_offset() {
|
||||||
let index = make_index()?;
|
let index = make_index();
|
||||||
let field = index.schema().get_field("text").unwrap();
|
let field = index.schema().get_field("text").unwrap();
|
||||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||||
let text_query = query_parser.parse_query("droopy tax")?;
|
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||||
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
|
let collector = TopDocs::with_limit(2).and_offset(1).tweak_score(
|
||||||
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
|
move |_segment_reader: &SegmentReader| move |doc: DocId, _original_score: Score| doc,
|
||||||
);
|
);
|
||||||
let score_docs: Vec<(u32, DocAddress)> =
|
let score_docs: Vec<(u32, DocAddress)> = index
|
||||||
index.reader()?.searcher().search(&text_query, &collector)?;
|
.reader()
|
||||||
|
.unwrap()
|
||||||
|
.searcher()
|
||||||
|
.search(&text_query, &collector)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
score_docs,
|
score_docs,
|
||||||
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
|
vec![(1, DocAddress::new(0, 1)), (0, DocAddress::new(0, 0)),]
|
||||||
);
|
);
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_custom_score_top_collector_with_offset() {
|
fn test_custom_score_top_collector_with_offset() {
|
||||||
let index = make_index().unwrap();
|
let index = make_index();
|
||||||
let field = index.schema().get_field("text").unwrap();
|
let field = index.schema().get_field("text").unwrap();
|
||||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
 use crossbeam::channel;
-use rayon::{ThreadPool, ThreadPoolBuilder};

 /// Search executor whether search request are single thread or multithread.
 ///
@@ -11,8 +10,6 @@ use rayon::{ThreadPool, ThreadPoolBuilder};
 pub enum Executor {
     /// Single thread variant of an Executor
     SingleThread,
-    /// Thread pool variant of an Executor
-    ThreadPool(ThreadPool),
 }

 impl Executor {
@@ -21,15 +18,6 @@ impl Executor {
         Executor::SingleThread
     }

-    /// Creates an Executor that dispatches the tasks in a thread pool.
-    pub fn multi_thread(num_threads: usize, prefix: &'static str) -> crate::Result<Executor> {
-        let pool = ThreadPoolBuilder::new()
-            .num_threads(num_threads)
-            .thread_name(move |num| format!("{}{}", prefix, num))
-            .build()?;
-        Ok(Executor::ThreadPool(pool))
-    }

     /// Perform a map in the thread pool.
     ///
     /// Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task
@@ -46,40 +34,6 @@ impl Executor {
     ) -> crate::Result<Vec<R>> {
         match self {
             Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
-            Executor::ThreadPool(pool) => {
-                let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
-                let num_fruits = args_with_indices.len();
-                let fruit_receiver = {
-                    let (fruit_sender, fruit_receiver) = channel::unbounded();
-                    pool.scope(|scope| {
-                        for arg_with_idx in args_with_indices {
-                            scope.spawn(|_| {
-                                let (idx, arg) = arg_with_idx;
-                                let fruit = f(arg);
-                                if let Err(err) = fruit_sender.send((idx, fruit)) {
-                                    error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
-                                }
-                            });
-                        }
-                    });
-                    fruit_receiver
-                    // This ends the scope of fruit_sender.
-                    // This is important as it makes it possible for the fruit_receiver iteration to
-                    // terminate.
-                };
-                // This is lame, but safe.
-                let mut results_with_position = Vec::with_capacity(num_fruits);
-                for (pos, fruit_res) in fruit_receiver {
-                    let fruit = fruit_res?;
-                    results_with_position.push((pos, fruit));
-                }
-                results_with_position.sort_by_key(|(pos, _)| *pos);
-                assert_eq!(results_with_position.len(), num_fruits);
-                Ok(results_with_position
-                    .into_iter()
-                    .map(|(_, fruit)| fruit)
-                    .collect::<Vec<_>>())
-            }
         }
     }
 }
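The removed `ThreadPool` arm above is essentially an order-preserving parallel map: each task is tagged with its input index so the results can be reassembled in input order. A standalone sketch of that idea follows; the free-function shape and crate versions (rayon 1.x, crossbeam 0.8) are assumptions for illustration, while the index-tagging trick is taken directly from the removed code.

```rust
use crossbeam::channel;
use rayon::ThreadPoolBuilder;

fn map_in_pool<A, R, F>(num_threads: usize, args: Vec<A>, f: F) -> Vec<R>
where
    A: Send,
    R: Send,
    F: Fn(A) -> R + Sync,
{
    let pool = ThreadPoolBuilder::new()
        .num_threads(num_threads)
        .build()
        .expect("failed to build thread pool");
    let num_items = args.len();
    let receiver = {
        let (sender, receiver) = channel::unbounded();
        pool.scope(|scope| {
            for (idx, arg) in args.into_iter().enumerate() {
                let sender = sender.clone();
                let f = &f;
                scope.spawn(move |_| {
                    // Tag each result with its original position so the caller
                    // gets results back in input order.
                    let _ = sender.send((idx, f(arg)));
                });
            }
        });
        receiver
        // The original `sender` is dropped at the end of this block, which lets
        // the receiving loop below terminate once all tasks are done.
    };
    let mut indexed: Vec<(usize, R)> = receiver.into_iter().collect();
    assert_eq!(indexed.len(), num_items);
    indexed.sort_by_key(|(idx, _)| *idx);
    indexed.into_iter().map(|(_, r)| r).collect()
}

fn main() {
    let inputs: Vec<u64> = (0u64..8).collect();
    let squares = map_in_pool(4, inputs, |x| x * x);
    assert_eq!(squares, vec![0, 1, 4, 9, 16, 25, 36, 49]);
}
```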
|||||||
@@ -123,8 +123,8 @@ impl IndexBuilder {
     /// If a previous index was in this directory, it returns an `IndexAlreadyExists` error.
     #[cfg(feature = "mmap")]
     pub fn create_in_dir<P: AsRef<Path>>(self, directory_path: P) -> crate::Result<Index> {
-        let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::open(directory_path)?);
-        if Index::exists(&*mmap_directory)? {
+        let mmap_directory = MmapDirectory::open(directory_path)?;
+        if Index::exists(&mmap_directory)? {
             return Err(TantivyError::IndexAlreadyExists);
         }
         self.create(mmap_directory)
@@ -139,7 +139,7 @@ impl IndexBuilder {
     /// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
     #[cfg(feature = "mmap")]
     pub fn create_from_tempdir(self) -> crate::Result<Index> {
-        let mmap_directory: Box<dyn Directory> = Box::new(MmapDirectory::create_from_tempdir()?);
+        let mmap_directory = MmapDirectory::create_from_tempdir()?;
         self.create(mmap_directory)
     }
     fn get_expect_schema(&self) -> crate::Result<Schema> {
@@ -149,9 +149,8 @@ impl IndexBuilder {
             .ok_or(TantivyError::IndexBuilderMissingArgument("schema"))
     }
     /// Opens or creates a new index in the provided directory
-    pub fn open_or_create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
-        let dir = dir.into();
-        if !Index::exists(&*dir)? {
+    pub fn open_or_create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
+        if !Index::exists(&dir)? {
             return self.create(dir);
         }
         let index = Index::open(dir)?;
@@ -166,8 +165,7 @@ impl IndexBuilder {
|
|||||||
/// Creates a new index given an implementation of the trait `Directory`.
|
/// Creates a new index given an implementation of the trait `Directory`.
|
||||||
///
|
///
|
||||||
/// If a directory previously existed, it will be erased.
|
/// If a directory previously existed, it will be erased.
|
||||||
fn create<T: Into<Box<dyn Directory>>>(self, dir: T) -> crate::Result<Index> {
|
fn create<Dir: Directory>(self, dir: Dir) -> crate::Result<Index> {
|
||||||
let dir = dir.into();
|
|
||||||
let directory = ManagedDirectory::wrap(dir)?;
|
let directory = ManagedDirectory::wrap(dir)?;
|
||||||
save_new_metas(
|
save_new_metas(
|
||||||
self.get_expect_schema()?,
|
self.get_expect_schema()?,
|
||||||
@@ -200,7 +198,7 @@ impl Index {
|
|||||||
/// Examines the directory to see if it contains an index.
|
/// Examines the directory to see if it contains an index.
|
||||||
///
|
///
|
||||||
/// Effectively, it only checks for the presence of the `meta.json` file.
|
/// Effectively, it only checks for the presence of the `meta.json` file.
|
||||||
pub fn exists(dir: &dyn Directory) -> Result<bool, OpenReadError> {
|
pub fn exists<Dir: Directory>(dir: &Dir) -> Result<bool, OpenReadError> {
|
||||||
dir.exists(&META_FILEPATH)
|
dir.exists(&META_FILEPATH)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -217,7 +215,7 @@ impl Index {
|
|||||||
/// Replace the default single thread search executor pool
|
/// Replace the default single thread search executor pool
|
||||||
/// by a thread pool with a given number of threads.
|
/// by a thread pool with a given number of threads.
|
||||||
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
|
pub fn set_multithread_executor(&mut self, num_threads: usize) -> crate::Result<()> {
|
||||||
self.executor = Arc::new(Executor::multi_thread(num_threads, "tantivy-search-")?);
|
self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-")?);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -252,11 +250,7 @@ impl Index {
     }

     /// Opens or creates a new index in the provided directory
-    pub fn open_or_create<T: Into<Box<dyn Directory>>>(
-        dir: T,
-        schema: Schema,
-    ) -> crate::Result<Index> {
-        let dir = dir.into();
+    pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> crate::Result<Index> {
         IndexBuilder::new().schema(schema).open_or_create(dir)
     }

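A rough usage sketch of the generic `open_or_create` signature on the right-hand side of this diff: a concrete directory is passed by value instead of being converted into a `Box<dyn Directory>`. The schema and field are made up for illustration.

```rust
use tantivy::directory::RamDirectory;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();

    // The first call creates the index, since the directory is empty.
    let directory = RamDirectory::create();
    let index = Index::open_or_create(directory.clone(), schema.clone())?;

    // Calling it again on the same directory opens the existing index instead.
    let reopened = Index::open_or_create(directory, schema)?;
    assert_eq!(
        index.schema().fields().count(),
        reopened.schema().fields().count()
    );
    Ok(())
}
```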
@@ -276,12 +270,11 @@ impl Index {
|
|||||||
/// Creates a new index given an implementation of the trait `Directory`.
|
/// Creates a new index given an implementation of the trait `Directory`.
|
||||||
///
|
///
|
||||||
/// If a directory previously existed, it will be erased.
|
/// If a directory previously existed, it will be erased.
|
||||||
pub fn create<T: Into<Box<dyn Directory>>>(
|
pub fn create<Dir: Directory>(
|
||||||
dir: T,
|
dir: Dir,
|
||||||
schema: Schema,
|
schema: Schema,
|
||||||
settings: IndexSettings,
|
settings: IndexSettings,
|
||||||
) -> crate::Result<Index> {
|
) -> crate::Result<Index> {
|
||||||
let dir: Box<dyn Directory> = dir.into();
|
|
||||||
let mut builder = IndexBuilder::new().schema(schema);
|
let mut builder = IndexBuilder::new().schema(schema);
|
||||||
builder = builder.settings(settings);
|
builder = builder.settings(settings);
|
||||||
builder.create(dir)
|
builder.create(dir)
|
||||||
@@ -372,8 +365,7 @@ impl Index {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Open the index using the provided directory
|
/// Open the index using the provided directory
|
||||||
pub fn open<T: Into<Box<dyn Directory>>>(directory: T) -> crate::Result<Index> {
|
pub fn open<D: Directory>(directory: D) -> crate::Result<Index> {
|
||||||
let directory = directory.into();
|
|
||||||
let directory = ManagedDirectory::wrap(directory)?;
|
let directory = ManagedDirectory::wrap(directory)?;
|
||||||
let inventory = SegmentMetaInventory::default();
|
let inventory = SegmentMetaInventory::default();
|
||||||
let metas = load_metas(&directory, &inventory)?;
|
let metas = load_metas(&directory, &inventory)?;
|
||||||
@@ -403,7 +395,9 @@ impl Index {
     ///
     /// # Errors
     /// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
-    /// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`
+    ///
+    /// # Panics
+    /// If the heap size per thread is too small, panics.
     pub fn writer_with_num_threads(
         &self,
         num_threads: usize,
@@ -445,13 +439,14 @@ impl Index {
     /// Creates a multithreaded writer
     ///
     /// Tantivy will automatically define the number of threads to use, but
-    /// no more than 8 threads.
+    /// no more than [`MAX_NUM_THREAD`] threads.
     /// `overall_heap_size_in_bytes` is the total target memory usage that will be split
     /// between a given number of threads.
     ///
     /// # Errors
     /// If the lockfile already exists, returns `Error::FileAlreadyExists`.
-    /// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`
+    /// # Panics
+    /// If the heap size per thread is too small, panics.
     pub fn writer(&self, overall_heap_size_in_bytes: usize) -> crate::Result<IndexWriter> {
         let mut num_threads = std::cmp::min(num_cpus::get(), MAX_NUM_THREAD);
         let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
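A minimal sketch of how the overall heap budget relates to the per-thread heap described above: the total is divided by the number of indexing threads tantivy picks. The field name is illustrative, and whether `add_document` returns a `Result` depends on which side of this diff you are on.

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // 60 MB total, split across however many indexing threads tantivy chooses
    // (at most MAX_NUM_THREAD). With e.g. 4 threads each gets 15 MB of heap;
    // if the per-thread share were too small, this call would fail or panic
    // depending on the branch.
    let mut writer = index.writer(60_000_000)?;
    writer.add_document(doc!(title => "The Diary of Muadib"));
    writer.commit()?;
    Ok(())
}
```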
@@ -582,15 +577,15 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_index_exists() {
|
fn test_index_exists() {
|
||||||
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
|
let directory = RamDirectory::create();
|
||||||
assert!(!Index::exists(directory.as_ref()).unwrap());
|
assert!(!Index::exists(&directory).unwrap());
|
||||||
assert!(Index::create(
|
assert!(Index::create(
|
||||||
directory.clone(),
|
directory.clone(),
|
||||||
throw_away_schema(),
|
throw_away_schema(),
|
||||||
IndexSettings::default()
|
IndexSettings::default()
|
||||||
)
|
)
|
||||||
.is_ok());
|
.is_ok());
|
||||||
assert!(Index::exists(directory.as_ref()).unwrap());
|
assert!(Index::exists(&directory).unwrap());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -603,27 +598,27 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn open_or_create_should_open() {
|
fn open_or_create_should_open() {
|
||||||
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
|
let directory = RamDirectory::create();
|
||||||
assert!(Index::create(
|
assert!(Index::create(
|
||||||
directory.clone(),
|
directory.clone(),
|
||||||
throw_away_schema(),
|
throw_away_schema(),
|
||||||
IndexSettings::default()
|
IndexSettings::default()
|
||||||
)
|
)
|
||||||
.is_ok());
|
.is_ok());
|
||||||
assert!(Index::exists(directory.as_ref()).unwrap());
|
assert!(Index::exists(&directory).unwrap());
|
||||||
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
|
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn create_should_wipeoff_existing() {
|
fn create_should_wipeoff_existing() {
|
||||||
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
|
let directory = RamDirectory::create();
|
||||||
assert!(Index::create(
|
assert!(Index::create(
|
||||||
directory.clone(),
|
directory.clone(),
|
||||||
throw_away_schema(),
|
throw_away_schema(),
|
||||||
IndexSettings::default()
|
IndexSettings::default()
|
||||||
)
|
)
|
||||||
.is_ok());
|
.is_ok());
|
||||||
assert!(Index::exists(directory.as_ref()).unwrap());
|
assert!(Index::exists(&directory).unwrap());
|
||||||
assert!(Index::create(
|
assert!(Index::create(
|
||||||
directory,
|
directory,
|
||||||
Schema::builder().build(),
|
Schema::builder().build(),
|
||||||
@@ -657,7 +652,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_index_on_commit_reload_policy() -> crate::Result<()> {
|
fn test_index_on_commit_reload_policy() {
|
||||||
let schema = throw_away_schema();
|
let schema = throw_away_schema();
|
||||||
let field = schema.get_field("num_likes").unwrap();
|
let field = schema.get_field("num_likes").unwrap();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -667,7 +662,7 @@ mod tests {
|
|||||||
.try_into()
|
.try_into()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(reader.searcher().num_docs(), 0);
|
assert_eq!(reader.searcher().num_docs(), 0);
|
||||||
test_index_on_commit_reload_policy_aux(field, &index, &reader)
|
test_index_on_commit_reload_policy_aux(field, &index, &reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "mmap")]
|
#[cfg(feature = "mmap")]
|
||||||
@@ -679,7 +674,7 @@ mod tests {
|
|||||||
use tempfile::TempDir;
|
use tempfile::TempDir;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_index_on_commit_reload_policy_mmap() -> crate::Result<()> {
|
fn test_index_on_commit_reload_policy_mmap() {
|
||||||
let schema = throw_away_schema();
|
let schema = throw_away_schema();
|
||||||
let field = schema.get_field("num_likes").unwrap();
|
let field = schema.get_field("num_likes").unwrap();
|
||||||
let tempdir = TempDir::new().unwrap();
|
let tempdir = TempDir::new().unwrap();
|
||||||
@@ -691,7 +686,7 @@ mod tests {
|
|||||||
.try_into()
|
.try_into()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(reader.searcher().num_docs(), 0);
|
assert_eq!(reader.searcher().num_docs(), 0);
|
||||||
test_index_on_commit_reload_policy_aux(field, &index, &reader)
|
test_index_on_commit_reload_policy_aux(field, &index, &reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -706,7 +701,7 @@ mod tests {
|
|||||||
.reload_policy(ReloadPolicy::Manual)
|
.reload_policy(ReloadPolicy::Manual)
|
||||||
.try_into()?;
|
.try_into()?;
|
||||||
assert_eq!(reader.searcher().num_docs(), 0);
|
assert_eq!(reader.searcher().num_docs(), 0);
|
||||||
writer.add_document(doc!(field=>1u64))?;
|
writer.add_document(doc!(field=>1u64));
|
||||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||||
let _handle = index.directory_mut().watch(WatchCallback::new(move || {
|
let _handle = index.directory_mut().watch(WatchCallback::new(move || {
|
||||||
let _ = sender.send(());
|
let _ = sender.send(());
|
||||||
@@ -720,7 +715,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_index_on_commit_reload_policy_different_directories() -> crate::Result<()> {
|
fn test_index_on_commit_reload_policy_different_directories() {
|
||||||
let schema = throw_away_schema();
|
let schema = throw_away_schema();
|
||||||
let field = schema.get_field("num_likes").unwrap();
|
let field = schema.get_field("num_likes").unwrap();
|
||||||
let tempdir = TempDir::new().unwrap();
|
let tempdir = TempDir::new().unwrap();
|
||||||
@@ -733,14 +728,10 @@ mod tests {
|
|||||||
.try_into()
|
.try_into()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(reader.searcher().num_docs(), 0);
|
assert_eq!(reader.searcher().num_docs(), 0);
|
||||||
test_index_on_commit_reload_policy_aux(field, &write_index, &reader)
|
test_index_on_commit_reload_policy_aux(field, &write_index, &reader);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn test_index_on_commit_reload_policy_aux(
|
fn test_index_on_commit_reload_policy_aux(field: Field, index: &Index, reader: &IndexReader) {
|
||||||
field: Field,
|
|
||||||
index: &Index,
|
|
||||||
reader: &IndexReader,
|
|
||||||
) -> crate::Result<()> {
|
|
||||||
let mut reader_index = reader.index();
|
let mut reader_index = reader.index();
|
||||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||||
let _watch_handle = reader_index
|
let _watch_handle = reader_index
|
||||||
@@ -748,9 +739,9 @@ mod tests {
|
|||||||
.watch(WatchCallback::new(move || {
|
.watch(WatchCallback::new(move || {
|
||||||
let _ = sender.send(());
|
let _ = sender.send(());
|
||||||
}));
|
}));
|
||||||
let mut writer = index.writer_for_tests()?;
|
let mut writer = index.writer_for_tests().unwrap();
|
||||||
assert_eq!(reader.searcher().num_docs(), 0);
|
assert_eq!(reader.searcher().num_docs(), 0);
|
||||||
writer.add_document(doc!(field=>1u64))?;
|
writer.add_document(doc!(field=>1u64));
|
||||||
writer.commit().unwrap();
|
writer.commit().unwrap();
|
||||||
// We need a loop here because it is possible for notify to send more than
|
// We need a loop here because it is possible for notify to send more than
|
||||||
// one modify event. It was observed on CI on MacOS.
|
// one modify event. It was observed on CI on MacOS.
|
||||||
@@ -760,7 +751,7 @@ mod tests {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
writer.add_document(doc!(field=>2u64))?;
|
writer.add_document(doc!(field=>2u64));
|
||||||
writer.commit().unwrap();
|
writer.commit().unwrap();
|
||||||
// ... Same as above
|
// ... Same as above
|
||||||
loop {
|
loop {
|
||||||
@@ -769,37 +760,37 @@ mod tests {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// This test will not pass on windows, because windows
|
// This test will not pass on windows, because windows
|
||||||
// prevent deleting files that are MMapped.
|
// prevent deleting files that are MMapped.
|
||||||
#[cfg(not(target_os = "windows"))]
|
#[cfg(not(target_os = "windows"))]
|
||||||
#[test]
|
#[test]
|
||||||
fn garbage_collect_works_as_intended() -> crate::Result<()> {
|
fn garbage_collect_works_as_intended() {
|
||||||
let directory = RamDirectory::create();
|
let directory = RamDirectory::create();
|
||||||
let schema = throw_away_schema();
|
let schema = throw_away_schema();
|
||||||
let field = schema.get_field("num_likes").unwrap();
|
let field = schema.get_field("num_likes").unwrap();
|
||||||
let index = Index::create(directory.clone(), schema, IndexSettings::default())?;
|
let index = Index::create(directory.clone(), schema, IndexSettings::default()).unwrap();
|
||||||
|
|
||||||
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
|
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
|
||||||
for i in 0u64..8_000u64 {
|
for i in 0u64..8_000u64 {
|
||||||
writer.add_document(doc!(field => i))?;
|
writer.add_document(doc!(field => i));
|
||||||
}
|
}
|
||||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||||
let _handle = directory.watch(WatchCallback::new(move || {
|
let _handle = directory.watch(WatchCallback::new(move || {
|
||||||
let _ = sender.send(());
|
let _ = sender.send(());
|
||||||
}));
|
}));
|
||||||
writer.commit()?;
|
writer.commit().unwrap();
|
||||||
let mem_right_after_commit = directory.total_mem_usage();
|
let mem_right_after_commit = directory.total_mem_usage();
|
||||||
assert!(receiver.recv().is_ok());
|
assert!(receiver.recv().is_ok());
|
||||||
let reader = index
|
let reader = index
|
||||||
.reader_builder()
|
.reader_builder()
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
.reload_policy(ReloadPolicy::Manual)
|
||||||
.try_into()?;
|
.try_into()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
assert_eq!(reader.searcher().num_docs(), 8_000);
|
assert_eq!(reader.searcher().num_docs(), 8_000);
|
||||||
writer.wait_merging_threads()?;
|
writer.wait_merging_threads().unwrap();
|
||||||
let mem_right_after_merge_finished = directory.total_mem_usage();
|
let mem_right_after_merge_finished = directory.total_mem_usage();
|
||||||
|
|
||||||
reader.reload().unwrap();
|
reader.reload().unwrap();
|
||||||
@@ -811,6 +802,5 @@ mod tests {
|
|||||||
mem_right_after_merge_finished,
|
mem_right_after_merge_finished,
|
||||||
mem_right_after_commit
|
mem_right_after_commit
|
||||||
);
|
);
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use super::SegmentComponent;
 use crate::schema::Schema;
 use crate::Opstamp;
 use crate::{core::SegmentId, store::Compressor};
-use crate::{Inventory, TrackedObject};
+use census::{Inventory, TrackedObject};
 use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
 use std::{collections::HashSet, sync::atomic::AtomicBool};
@@ -189,10 +189,6 @@ impl SegmentMeta {

     #[doc(hidden)]
     pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
-        assert!(
-            num_deleted_docs <= self.max_doc(),
-            "There cannot be more deleted docs than there are docs."
-        );
         let delete_meta = DeleteMeta {
             num_deleted_docs,
             opstamp,
@@ -398,7 +394,7 @@ mod tests {
         let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
         assert_eq!(
             json,
-            r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
+            r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
         );
     }
 }
|
|||||||
@@ -14,7 +14,7 @@ pub use self::index_meta::{
     IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory,
 };
 pub use self::inverted_index_reader::InvertedIndexReader;
-pub use self::searcher::{Searcher, SearcherGeneration};
+pub use self::searcher::Searcher;
 pub use self::segment::Segment;
 pub use self::segment_component::SegmentComponent;
 pub use self::segment_id::SegmentId;
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use crate::collector::Collector;
|
use crate::collector::Collector;
|
||||||
use crate::core::Executor;
|
use crate::core::Executor;
|
||||||
|
|
||||||
use crate::core::SegmentReader;
|
use crate::core::SegmentReader;
|
||||||
use crate::query::Query;
|
use crate::query::Query;
|
||||||
use crate::schema::Document;
|
use crate::schema::Document;
|
||||||
@@ -9,62 +10,9 @@ use crate::space_usage::SearcherSpaceUsage;
|
|||||||
use crate::store::StoreReader;
|
use crate::store::StoreReader;
|
||||||
use crate::DocAddress;
|
use crate::DocAddress;
|
||||||
use crate::Index;
|
use crate::Index;
|
||||||
use crate::Opstamp;
|
|
||||||
use crate::SegmentId;
|
|
||||||
use crate::TrackedObject;
|
|
||||||
|
|
||||||
use std::collections::BTreeMap;
|
|
||||||
use std::{fmt, io};
|
use std::{fmt, io};
|
||||||
|
|
||||||
/// Identifies the searcher generation accessed by a [Searcher].
|
|
||||||
///
|
|
||||||
/// While this might seem redundant, a [SearcherGeneration] contains
|
|
||||||
/// both a `generation_id` AND a list of `(SegmentId, DeleteOpstamp)`.
|
|
||||||
///
|
|
||||||
/// This is on purpose. This object is used by the `Warmer` API.
|
|
||||||
/// Having both information makes it possible to identify which
|
|
||||||
/// artifact should be refreshed or garbage collected.
|
|
||||||
///
|
|
||||||
/// Depending on the use case, `Warmer`'s implementers can decide to
|
|
||||||
/// produce artifacts per:
|
|
||||||
/// - `generation_id` (e.g. some searcher level aggregates)
|
|
||||||
/// - `(segment_id, delete_opstamp)` (e.g. segment level aggregates)
|
|
||||||
/// - `segment_id` (e.g. for immutable document level information)
|
|
||||||
/// - `(generation_id, segment_id)` (e.g. for consistent dynamic column)
|
|
||||||
/// - ...
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
|
||||||
pub struct SearcherGeneration {
|
|
||||||
segments: BTreeMap<SegmentId, Option<Opstamp>>,
|
|
||||||
generation_id: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SearcherGeneration {
|
|
||||||
pub(crate) fn from_segment_readers(
|
|
||||||
segment_readers: &[SegmentReader],
|
|
||||||
generation_id: u64,
|
|
||||||
) -> Self {
|
|
||||||
let mut segment_id_to_del_opstamp = BTreeMap::new();
|
|
||||||
for segment_reader in segment_readers {
|
|
||||||
segment_id_to_del_opstamp
|
|
||||||
.insert(segment_reader.segment_id(), segment_reader.delete_opstamp());
|
|
||||||
}
|
|
||||||
Self {
|
|
||||||
segments: segment_id_to_del_opstamp,
|
|
||||||
generation_id,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the searcher generation id.
|
|
||||||
pub fn generation_id(&self) -> u64 {
|
|
||||||
self.generation_id
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return a `(SegmentId -> DeleteOpstamp)` mapping.
|
|
||||||
pub fn segments(&self) -> &BTreeMap<SegmentId, Option<Opstamp>> {
|
|
||||||
&self.segments
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Holds a list of `SegmentReader`s ready for search.
|
/// Holds a list of `SegmentReader`s ready for search.
|
||||||
///
|
///
|
||||||
/// It guarantees that the `Segment` will not be removed before
|
/// It guarantees that the `Segment` will not be removed before
|
||||||
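The removed `SearcherGeneration` documentation above describes keying warming artifacts either by searcher generation or by `(segment_id, delete_opstamp)`, so that per-segment work survives across generations while stale entries can be garbage collected. A hypothetical, tantivy-independent sketch of that caching idea; the names and types below are invented for illustration and are not part of this diff.

```rust
use std::collections::HashMap;

// Key an artifact by (segment id, delete opstamp), mirroring the granularity
// suggested by the removed doc comment.
type SegmentKey = (u64 /* segment id */, Option<u64> /* delete opstamp */);

#[derive(Default)]
struct PerSegmentCache {
    aggregates: HashMap<SegmentKey, f64>,
}

impl PerSegmentCache {
    /// Reuse a cached per-segment aggregate, computing it only on a cache miss.
    fn get_or_compute(&mut self, key: SegmentKey, compute: impl FnOnce() -> f64) -> f64 {
        *self.aggregates.entry(key).or_insert_with(compute)
    }

    /// Drop entries whose key no longer appears in the live searcher generation.
    fn garbage_collect(&mut self, live: &[SegmentKey]) {
        self.aggregates.retain(|key, _| live.contains(key));
    }
}

fn main() {
    let mut cache = PerSegmentCache::default();
    let value = cache.get_or_compute((1, None), || 42.0);
    assert_eq!(value, 42.0);

    // Once segment 1 is gone from the live generation, its entry is collected.
    cache.garbage_collect(&[(2, Some(7))]);
    assert!(cache.aggregates.is_empty());
}
```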
@@ -75,7 +23,6 @@ pub struct Searcher {
|
|||||||
index: Index,
|
index: Index,
|
||||||
segment_readers: Vec<SegmentReader>,
|
segment_readers: Vec<SegmentReader>,
|
||||||
store_readers: Vec<StoreReader>,
|
store_readers: Vec<StoreReader>,
|
||||||
generation: TrackedObject<SearcherGeneration>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Searcher {
|
impl Searcher {
|
||||||
@@ -84,7 +31,6 @@ impl Searcher {
|
|||||||
schema: Schema,
|
schema: Schema,
|
||||||
index: Index,
|
index: Index,
|
||||||
segment_readers: Vec<SegmentReader>,
|
segment_readers: Vec<SegmentReader>,
|
||||||
generation: TrackedObject<SearcherGeneration>,
|
|
||||||
) -> io::Result<Searcher> {
|
) -> io::Result<Searcher> {
|
||||||
let store_readers: Vec<StoreReader> = segment_readers
|
let store_readers: Vec<StoreReader> = segment_readers
|
||||||
.iter()
|
.iter()
|
||||||
@@ -95,7 +41,6 @@ impl Searcher {
|
|||||||
index,
|
index,
|
||||||
segment_readers,
|
segment_readers,
|
||||||
store_readers,
|
store_readers,
|
||||||
generation,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -104,11 +49,6 @@ impl Searcher {
|
|||||||
&self.index
|
&self.index
|
||||||
}
|
}
|
||||||
|
|
||||||
/// [SearcherGeneration] which identifies the version of the snapshot held by this `Searcher`.
|
|
||||||
pub fn generation(&self) -> &SearcherGeneration {
|
|
||||||
self.generation.as_ref()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||||
///
|
///
|
||||||
/// The searcher uses the segment ordinal to route the
|
/// The searcher uses the segment ordinal to route the
|
||||||
@@ -148,7 +88,7 @@ impl Searcher {
|
|||||||
&self.segment_readers
|
&self.segment_readers
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the segment_reader associated with the given segment_ord
|
/// Returns the segment_reader associated with the given segment_ordinal
|
||||||
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
|
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
|
||||||
&self.segment_readers[segment_ord as usize]
|
&self.segment_readers[segment_ord as usize]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ use crate::core::SegmentId;
 use crate::directory::CompositeFile;
 use crate::directory::FileSlice;
 use crate::error::DataCorruption;
-use crate::fastfield::intersect_alive_bitsets;
 use crate::fastfield::AliveBitSet;
 use crate::fastfield::FacetReader;
 use crate::fastfield::FastFieldReaders;
@@ -17,7 +16,6 @@ use crate::space_usage::SegmentSpaceUsage;
 use crate::store::StoreReader;
 use crate::termdict::TermDictionary;
 use crate::DocId;
-use crate::Opstamp;
 use fail::fail_point;
 use std::fmt;
 use std::sync::Arc;
@@ -39,8 +37,6 @@ pub struct SegmentReader {
 inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,

 segment_id: SegmentId,
-delete_opstamp: Option<Opstamp>,
-
 max_doc: DocId,
 num_docs: DocId,

@@ -103,7 +99,7 @@ impl SegmentReader {
 let field_entry = self.schema.get_field_entry(field);

 match field_entry.field_type() {
-FieldType::Facet(_) => {
+FieldType::HierarchicalFacet(_) => {
 let term_ords_reader = self.fast_fields().u64s(field)?;
 let termdict = self
 .termdict_composite
@@ -130,17 +126,13 @@ impl SegmentReader {
 self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
 let field_name = self.schema.get_field_name(field);
 let err_msg = format!(
-"Field norm not found for field {:?}. Was the field set to record norm during indexing?",
+"Field norm not found for field {:?}. Was it marked as indexed during indexing?",
 field_name
 );
 crate::TantivyError::SchemaError(err_msg)
 })
 }

-pub(crate) fn fieldnorms_readers(&self) -> &FieldNormReaders {
-&self.fieldnorm_readers
-}
-
 /// Accessor to the segment's `StoreReader`.
 pub fn get_store_reader(&self) -> io::Result<StoreReader> {
 StoreReader::open(self.store_file.clone())
@@ -148,14 +140,6 @@ impl SegmentReader {

 /// Open a new segment for reading.
 pub fn open(segment: &Segment) -> crate::Result<SegmentReader> {
-Self::open_with_custom_alive_set(segment, None)
-}
-
-/// Open a new segment for reading.
-pub fn open_with_custom_alive_set(
-segment: &Segment,
-custom_bitset: Option<AliveBitSet>,
-) -> crate::Result<SegmentReader> {
 let termdict_file = segment.open_read(SegmentComponent::Terms)?;
 let termdict_composite = CompositeFile::open(&termdict_file)?;

@@ -180,35 +164,27 @@ impl SegmentReader {
 let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
 let fast_field_readers =
 Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));

 let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
 let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;

-let original_bitset = if segment.meta().has_deletes() {
-let delete_file_slice = segment.open_read(SegmentComponent::Delete)?;
-let delete_data = delete_file_slice.read_bytes()?;
-Some(AliveBitSet::open(delete_data))
+let alive_bitset_opt = if segment.meta().has_deletes() {
+let alive_bitset_bytes = segment.open_read(SegmentComponent::Delete)?.read_bytes()?;
+let alive_bitset = AliveBitSet::open(alive_bitset_bytes);
+Some(alive_bitset)
 } else {
 None
 };

-let alive_bitset_opt = intersect_alive_bitset(original_bitset, custom_bitset);
-
-let max_doc = segment.meta().max_doc();
-let num_docs = alive_bitset_opt
-.as_ref()
-.map(|alive_bitset| alive_bitset.num_alive_docs() as u32)
-.unwrap_or(max_doc);
-
 Ok(SegmentReader {
 inv_idx_reader_cache: Default::default(),
-num_docs,
-max_doc,
+max_doc: segment.meta().max_doc(),
+num_docs: segment.meta().num_docs(),
 termdict_composite,
 postings_composite,
 fast_fields_readers: fast_field_readers,
 fieldnorm_readers,
 segment_id: segment.id(),
-delete_opstamp: segment.meta().delete_opstamp(),
 store_file,
 alive_bitset_opt,
 positions_composite,
@@ -294,11 +270,6 @@ impl SegmentReader {
 self.segment_id
 }

-/// Returns the delete opstamp
-pub fn delete_opstamp(&self) -> Option<Opstamp> {
-self.delete_opstamp
-}
-
 /// Returns the bitset representing
 /// the documents that have been deleted.
 pub fn alive_bitset(&self) -> Option<&AliveBitSet> {
@@ -340,21 +311,6 @@ impl SegmentReader {
 }
 }

-fn intersect_alive_bitset(
-left_opt: Option<AliveBitSet>,
-right_opt: Option<AliveBitSet>,
-) -> Option<AliveBitSet> {
-match (left_opt, right_opt) {
-(Some(left), Some(right)) => {
-assert_eq!(left.bitset().max_value(), right.bitset().max_value());
-Some(intersect_alive_bitsets(left, right))
-}
-(Some(left), None) => Some(left),
-(None, Some(right)) => Some(right),
-(None, None) => None,
-}
-}
-
 impl fmt::Debug for SegmentReader {
 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 write!(f, "SegmentReader({:?})", self.segment_id)
@@ -377,10 +333,10 @@ mod test {

 {
 let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(name => "tantivy"))?;
-index_writer.add_document(doc!(name => "horse"))?;
-index_writer.add_document(doc!(name => "jockey"))?;
-index_writer.add_document(doc!(name => "cap"))?;
+index_writer.add_document(doc!(name => "tantivy"));
+index_writer.add_document(doc!(name => "horse"));
+index_writer.add_document(doc!(name => "jockey"));
+index_writer.add_document(doc!(name => "cap"));
 // we should now have one segment with two docs
 index_writer.delete_term(Term::from_field_text(name, "horse"));
 index_writer.delete_term(Term::from_field_text(name, "cap"));
@@ -403,10 +359,10 @@ mod test {

 {
 let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(name => "tantivy"))?;
-index_writer.add_document(doc!(name => "horse"))?;
-index_writer.add_document(doc!(name => "jockey"))?;
-index_writer.add_document(doc!(name => "cap"))?;
+index_writer.add_document(doc!(name => "tantivy"));
+index_writer.add_document(doc!(name => "horse"));
+index_writer.add_document(doc!(name => "jockey"));
+index_writer.add_document(doc!(name => "cap"));
 // we should now have one segment with two docs
 index_writer.commit()?;
 }
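The removed `intersect_alive_bitset` helper above merges the on-disk delete bitset with an optional caller-provided one: when both are present it asserts equal `max_value` and intersects them, otherwise whichever side exists wins. The control flow is the usual combine-two-Options pattern; here is a self-contained sketch with plain std types, where the `merge` closure and the `u8` masks merely stand in for `intersect_alive_bitsets` and the real bitsets:

    // Standalone sketch of the Option-combining control flow used by the removed helper.
    fn combine<T>(left: Option<T>, right: Option<T>, merge: impl Fn(T, T) -> T) -> Option<T> {
        match (left, right) {
            (Some(l), Some(r)) => Some(merge(l, r)),
            (Some(l), None) => Some(l),
            (None, Some(r)) => Some(r),
            (None, None) => None,
        }
    }

    fn main() {
        // Two "alive" sets over 8 documents, encoded as u8 masks for illustration.
        let from_deletes: Option<u8> = Some(0b1110_1101);
        let custom: Option<u8> = Some(0b1111_0011);
        // A document survives only if it is alive in both sets.
        assert_eq!(combine(from_deletes, custom, |a, b| a & b), Some(0b1110_0001));
    }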
@@ -43,8 +43,10 @@ impl RetryPolicy {
 }

 /// The `DirectoryLock` is an object that represents a file lock.
+/// See [`LockType`](struct.LockType.html)
 ///
-/// It is associated to a lock file, that gets deleted on `Drop.`
+/// It is transparently associated to a lock file, that gets deleted
+/// on `Drop.` The lock is released automatically on `Drop`.
 pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);

 struct DirectoryLockGuard {
@@ -140,16 +142,10 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
 /// Opens a writer for the *virtual file* associated with
 /// a Path.
 ///
-/// Right after this call, for the span of the execution of the program
-/// the file should be created and any subsequent call to `open_read` for the
+/// Right after this call, the file should be created
+/// and any subsequent call to `open_read` for the
 /// same path should return a `FileSlice`.
 ///
-/// However, depending on the directory implementation,
-/// it might be required to call `sync_directory` to ensure
-/// that the file is durably created.
-/// (The semantics here are the same when dealing with
-/// a posix filesystem.)
-///
 /// Write operations may be aggressively buffered.
 /// The client of this trait is responsible for calling flush
 /// to ensure that subsequent `read` operations
@@ -180,12 +176,6 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
 /// The file may or may not previously exist.
 fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()>;

-/// Sync the directory.
-///
-/// This call is required to ensure that newly created files are
-/// effectively stored durably.
-fn sync_directory(&self) -> io::Result<()>;
-
 /// Acquire a lock in the given directory.
 ///
 /// The method is blocking or not depending on the `Lock` object.
@@ -240,15 +230,3 @@
 Box::new(self.clone())
 }
 }

-impl Clone for Box<dyn Directory> {
-fn clone(&self) -> Self {
-self.box_clone()
-}
-}
-
-impl<T: Directory + 'static> From<T> for Box<dyn Directory> {
-fn from(t: T) -> Self {
-Box::new(t)
-}
-}
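The `sync_directory` method removed from the trait above exists because, on a POSIX filesystem, durably creating a file requires fsyncing the parent directory as well as the file itself. A minimal Unix-flavored sketch of what a filesystem-backed implementation does (the Windows variant additionally needs `FILE_FLAG_BACKUP_SEMANTICS`, as the MmapDirectory hunks further down show); this is an illustration, not the trait's actual default:

    use std::fs::OpenOptions;
    use std::io;
    use std::path::Path;

    // Fsync a directory so that files created inside it survive a crash.
    fn sync_directory(dir: &Path) -> io::Result<()> {
        // Read access is enough on Linux; requesting write fails with EISDIR.
        let dir_file = OpenOptions::new().read(true).open(dir)?;
        dir_file.sync_all()
    }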
@@ -7,8 +7,8 @@ use std::path::PathBuf;
 /// [`LockParams`](./enum.LockParams.html).
 /// Tantivy itself uses only two locks but client application
 /// can use the directory facility to define their own locks.
-/// - [INDEX_WRITER_LOCK]
-/// - [META_LOCK]
+/// - [INDEX_WRITER_LOCK](./struct.INDEX_WRITER_LOCK.html)
+/// - [META_LOCK](./struct.META_LOCK.html)
 ///
 /// Check out these locks documentation for more information.
 ///
@@ -39,16 +39,6 @@ pub enum OpenDirectoryError {
 },
 }

-impl OpenDirectoryError {
-/// Wraps an io error.
-pub fn wrap_io_error(io_error: io::Error, directory_path: PathBuf) -> Self {
-Self::IoError {
-io_error,
-directory_path,
-}
-}
-}
-
 /// Error that may occur when starting to write in a file
 #[derive(Debug, Error)]
 pub enum OpenWriteError {
@@ -66,7 +66,6 @@ impl FileSlice {

 /// Wraps a FileHandle.
 #[doc(hidden)]
-#[must_use]
 pub fn new_with_num_bytes(file_handle: Box<dyn FileHandle>, num_bytes: usize) -> Self {
 FileSlice {
 data: Arc::from(file_handle),
@@ -43,16 +43,14 @@ impl FileWatcher {
 thread::Builder::new()
 .name("thread-tantivy-meta-file-watcher".to_string())
 .spawn(move || {
-let mut current_checksum_opt = None;
+let mut current_checksum = None;

 while state.load(Ordering::SeqCst) == 1 {
 if let Ok(checksum) = FileWatcher::compute_checksum(&path) {
-let metafile_has_changed = current_checksum_opt
-.map(|current_checksum| current_checksum != checksum)
-.unwrap_or(true);
-if metafile_has_changed {
+// `None.unwrap_or_else(|| !checksum) != checksum` evaluates to `true`
+if current_checksum.unwrap_or_else(|| !checksum) != checksum {
 info!("Meta file {:?} was modified", path);
-current_checksum_opt = Some(checksum);
+current_checksum = Some(checksum);
 futures::executor::block_on(callbacks.broadcast());
 }
 }
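Both sides of the FileWatcher hunk poll the meta file and fire the callbacks when its checksum differs from the last one observed; the right-hand version folds the "first observation always counts as a change" case into `unwrap_or_else(|| !checksum)`, since a value can never equal its bitwise negation. A self-contained sketch showing that the two formulations agree:

    // The explicit formulation (left column) and the compact one (right column).
    fn changed_explicit(current: Option<u64>, checksum: u64) -> bool {
        current.map(|c| c != checksum).unwrap_or(true)
    }

    fn changed_compact(current: Option<u64>, checksum: u64) -> bool {
        // `None` turns into `!checksum`, which never equals `checksum`.
        current.unwrap_or_else(|| !checksum) != checksum
    }

    fn main() {
        for &(current, checksum) in &[(None, 42u64), (Some(42), 42), (Some(41), 42)] {
            assert_eq!(changed_explicit(current, checksum), changed_compact(current, checksum));
        }
    }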
@@ -64,7 +64,7 @@ fn save_managed_paths(

 impl ManagedDirectory {
 /// Wraps a directory as managed directory.
-pub fn wrap(directory: Box<dyn Directory>) -> crate::Result<ManagedDirectory> {
+pub fn wrap<Dir: Directory>(directory: Dir) -> crate::Result<ManagedDirectory> {
 match directory.atomic_read(&MANAGED_FILEPATH) {
 Ok(data) => {
 let managed_files_json = String::from_utf8_lossy(&data);
@@ -76,14 +76,14 @@ impl ManagedDirectory {
 )
 })?;
 Ok(ManagedDirectory {
-directory,
+directory: Box::new(directory),
 meta_informations: Arc::new(RwLock::new(MetaInformation {
 managed_paths: managed_files,
 })),
 })
 }
 Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
-directory,
+directory: Box::new(directory),
 meta_informations: Arc::default(),
 }),
 io_err @ Err(OpenReadError::IoError { .. }) => Err(io_err.err().unwrap().into()),
@@ -192,7 +192,6 @@ impl ManagedDirectory {
 for delete_file in &deleted_files {
 managed_paths_write.remove(delete_file);
 }
-self.directory.sync_directory()?;
 save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?;
 }

@@ -223,22 +222,9 @@ impl ManagedDirectory {
 .write()
 .expect("Managed file lock poisoned");
 let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
-if !has_changed {
-return Ok(());
+if has_changed {
+save_managed_paths(self.directory.as_ref(), &meta_wlock)?;
 }
-save_managed_paths(self.directory.as_ref(), &meta_wlock)?;
-// This is not the first file we add.
-// Therefore, we are sure that `.managed.json` has been already
-// properly created and we do not need to sync its parent directory.
-//
-// (It might seem like a nicer solution to create the managed_json on the
-// creation of the ManagedDirectory instance but it would actually
-// prevent the use of read-only directories..)
-let managed_file_definitely_already_exists = meta_wlock.managed_paths.len() > 1;
-if managed_file_definitely_already_exists {
-return Ok(());
-}
-self.directory.sync_directory()?;
 Ok(())
 }

@@ -324,11 +310,6 @@ impl Directory for ManagedDirectory {
 fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
 self.directory.watch(watch_callback)
 }
-
-fn sync_directory(&self) -> io::Result<()> {
-self.directory.sync_directory()?;
-Ok(())
-}
 }

 impl Clone for ManagedDirectory {
@@ -359,7 +340,7 @@ mod tests_mmap_specific {
 let test_path2: &'static Path = Path::new("some_path_for_test_2");
 {
 let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
+let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
 let write_file = managed_directory.open_write(test_path1).unwrap();
 write_file.terminate().unwrap();
 managed_directory
@@ -374,7 +355,7 @@ mod tests_mmap_specific {
 }
 {
 let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
+let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
 assert!(managed_directory.exists(test_path1).unwrap());
 assert!(!managed_directory.exists(test_path2).unwrap());
 let living_files: HashSet<PathBuf> = HashSet::new();
@@ -393,7 +374,7 @@ mod tests_mmap_specific {
 let living_files = HashSet::new();

 let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-let mut managed_directory = ManagedDirectory::wrap(Box::new(mmap_directory)).unwrap();
+let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
 let mut write = managed_directory.open_write(test_path1).unwrap();
 write.write_all(&[0u8, 1u8]).unwrap();
 write.terminate().unwrap();
@@ -74,12 +74,20 @@ pub struct CacheInfo {
 pub mmapped: Vec<PathBuf>,
 }

-#[derive(Default)]
 struct MmapCache {
 counters: CacheCounters,
 cache: HashMap<PathBuf, WeakArcBytes>,
 }

+impl Default for MmapCache {
+fn default() -> MmapCache {
+MmapCache {
+counters: CacheCounters::default(),
+cache: HashMap::new(),
+}
+}
+}
+
 impl MmapCache {
 fn get_info(&self) -> CacheInfo {
 let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
@@ -193,19 +201,16 @@ impl MmapDirectory {
 pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
 let directory_path: &Path = directory_path.as_ref();
 if !directory_path.exists() {
-return Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
+Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
 directory_path,
-)));
-}
-let canonical_path: PathBuf = directory_path.canonicalize().map_err(|io_err| {
-OpenDirectoryError::wrap_io_error(io_err, PathBuf::from(directory_path))
-})?;
-if !canonical_path.is_dir() {
-return Err(OpenDirectoryError::NotADirectory(PathBuf::from(
+)))
+} else if !directory_path.is_dir() {
+Err(OpenDirectoryError::NotADirectory(PathBuf::from(
 directory_path,
-)));
+)))
+} else {
+Ok(MmapDirectory::new(PathBuf::from(directory_path), None))
 }
-Ok(MmapDirectory::new(canonical_path, None))
 }

 /// Joins a relative_path to the directory `root_path`
@@ -214,6 +219,33 @@ impl MmapDirectory {
 self.inner.root_path.join(relative_path)
 }

+/// Sync the root directory.
+/// In certain FS, this is required to persistently create
+/// a file.
+fn sync_directory(&self) -> Result<(), io::Error> {
+let mut open_opts = OpenOptions::new();
+
+// Linux needs read to be set, otherwise returns EINVAL
+// write must not be set, or it fails with EISDIR
+open_opts.read(true);
+
+// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
+// and calling sync_all() only works if write access is requested.
+#[cfg(windows)]
+{
+use std::os::windows::fs::OpenOptionsExt;
+use winapi::um::winbase;
+
+open_opts
+.write(true)
+.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
+}
+
+let fd = open_opts.open(&self.inner.root_path)?;
+fd.sync_all()?;
+Ok(())
+}
+
 /// Returns some statistical information
 /// about the Mmap cache.
 ///
@@ -264,7 +296,8 @@ impl Write for SafeFileWriter {
 }

 fn flush(&mut self) -> io::Result<()> {
-Ok(())
+self.0.flush()?;
+self.0.sync_all()
 }
 }

@@ -276,9 +309,7 @@ impl Seek for SafeFileWriter {

 impl TerminatingWrite for SafeFileWriter {
 fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
-self.0.flush()?;
-self.0.sync_data()?;
-Ok(())
+self.flush()
 }
 }

@@ -308,7 +339,6 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
 let mut tempfile = tempfile::Builder::new().tempfile_in(&parent_path)?;
 tempfile.write_all(content)?;
 tempfile.flush()?;
-tempfile.as_file_mut().sync_data()?;
 tempfile.into_temp_path().persist(path)?;
 Ok(())
 }
@@ -343,17 +373,22 @@ impl Directory for MmapDirectory {
 /// removed before the file is deleted.
 fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
 let full_path = self.resolve_path(path);
-fs::remove_file(&full_path).map_err(|e| {
-if e.kind() == io::ErrorKind::NotFound {
-DeleteError::FileDoesNotExist(path.to_owned())
-} else {
-DeleteError::IoError {
-io_error: e,
-filepath: path.to_path_buf(),
+match fs::remove_file(&full_path) {
+Ok(_) => self.sync_directory().map_err(|e| DeleteError::IoError {
+io_error: e,
+filepath: path.to_path_buf(),
+}),
+Err(e) => {
+if e.kind() == io::ErrorKind::NotFound {
+Err(DeleteError::FileDoesNotExist(path.to_owned()))
+} else {
+Err(DeleteError::IoError {
+io_error: e,
+filepath: path.to_path_buf(),
+})
 }
 }
-})?;
-Ok(())
+}
 }

 fn exists(&self, path: &Path) -> Result<bool, OpenReadError> {
@@ -382,13 +417,10 @@ impl Directory for MmapDirectory {
 file.flush()
 .map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?;

-// Note we actually do not sync the parent directory here.
-//
-// A newly created file, may, in some case, be created and even flushed to disk.
-// and then lost...
-//
-// The file will only be durably written after we terminate AND
-// sync_directory() is called.
+// Apparetntly, on some filesystem syncing the parent
+// directory is required.
+self.sync_directory()
+.map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?;

 let writer = SafeFileWriter::new(file);
 Ok(BufWriter::new(Box::new(writer)))
@@ -418,7 +450,7 @@ impl Directory for MmapDirectory {
 debug!("Atomic Write {:?}", path);
 let full_path = self.resolve_path(path);
 atomic_write(&full_path, content)?;
-Ok(())
+self.sync_directory()
 }

 fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
@@ -444,30 +476,6 @@ impl Directory for MmapDirectory {
 fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
 Ok(self.inner.watch(watch_callback))
 }
-
-fn sync_directory(&self) -> Result<(), io::Error> {
-let mut open_opts = OpenOptions::new();
-
-// Linux needs read to be set, otherwise returns EINVAL
-// write must not be set, or it fails with EISDIR
-open_opts.read(true);
-
-// On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
-// and calling sync_all() only works if write access is requested.
-#[cfg(windows)]
-{
-use std::os::windows::fs::OpenOptionsExt;
-use winapi::um::winbase;
-
-open_opts
-.write(true)
-.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
-}
-
-let fd = open_opts.open(&self.inner.root_path)?;
-fd.sync_data()?;
-Ok(())
-}
 }

 #[cfg(test)]
@@ -574,8 +582,8 @@ mod tests {
 }

 #[test]
-fn test_mmap_released() -> crate::Result<()> {
-let mmap_directory = MmapDirectory::create_from_tempdir()?;
+fn test_mmap_released() {
+let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
 let mut schema_builder: SchemaBuilder = Schema::builder();
 let text_field = schema_builder.add_text_field("text", TEXT);
 let schema = schema_builder.build();
@@ -584,30 +592,31 @@ mod tests {
 let index =
 Index::create(mmap_directory.clone(), schema, IndexSettings::default()).unwrap();

-let mut index_writer = index.writer_for_tests()?;
+let mut index_writer = index.writer_for_tests().unwrap();
 let mut log_merge_policy = LogMergePolicy::default();
 log_merge_policy.set_min_num_segments(3);
 index_writer.set_merge_policy(Box::new(log_merge_policy));
 for _num_commits in 0..10 {
 for _ in 0..10 {
-index_writer.add_document(doc!(text_field=>"abc"))?;
+index_writer.add_document(doc!(text_field=>"abc"));
 }
-index_writer.commit()?;
+index_writer.commit().unwrap();
 }

 let reader = index
 .reader_builder()
 .reload_policy(ReloadPolicy::Manual)
-.try_into()?;
+.try_into()
+.unwrap();

 for _ in 0..4 {
-index_writer.add_document(doc!(text_field=>"abc"))?;
-index_writer.commit()?;
-reader.reload()?;
+index_writer.add_document(doc!(text_field=>"abc"));
+index_writer.commit().unwrap();
+reader.reload().unwrap();
 }
-index_writer.wait_merging_threads()?;
+index_writer.wait_merging_threads().unwrap();

-reader.reload()?;
+reader.reload().unwrap();
 let num_segments = reader.searcher().segment_readers().len();
 assert!(num_segments <= 4);
 let num_components_except_deletes_and_tempstore =
@@ -618,6 +627,5 @@ mod tests {
 );
 }
 assert!(mmap_directory.get_cache_info().mmapped.is_empty());
-Ok(())
 }
 }
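The free-standing `atomic_write` in the hunks above follows the classic write-temp-then-rename recipe: write the payload to a temporary file in the destination's directory, flush it, and atomically rename it over the target. A sketch of that recipe using the `tempfile` crate; the extra `sync_data` call is the durability step that the left-hand column keeps before the rename, and the `?` on `persist` assumes its error converts into `io::Error` as in the diffed code:

    use std::io::{self, Write};
    use std::path::Path;

    // Write-temp-then-rename: readers see either the old content or the new, never a mix.
    fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
        let parent = path.parent().expect("destination must have a parent directory");
        let mut tmp = tempfile::Builder::new().tempfile_in(parent)?;
        tmp.write_all(content)?;
        tmp.flush()?;
        tmp.as_file_mut().sync_data()?; // make the bytes durable before the rename
        tmp.into_temp_path().persist(path)?; // atomic rename onto the destination
        Ok(())
    }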
@@ -1,6 +1,6 @@
 /*!

-WORM (Write Once Read Many) directory abstraction.
+WORM directory abstraction.

 */

@@ -18,6 +13,13 @@ use super::FileHandle;
 /// Writer associated with the `RamDirectory`
 ///
 /// The Writer just writes a buffer.
+///
+/// # Panics
+///
+/// On drop, if the writer was left in a *dirty* state.
+/// That is, if flush was not called after the last call
+/// to write.
+///
 struct VecWriter {
 path: PathBuf,
 shared_directory: RamDirectory,
@@ -39,7 +46,7 @@ impl VecWriter {
 impl Drop for VecWriter {
 fn drop(&mut self) {
 if !self.is_flushed {
-warn!(
+panic!(
 "You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
 self.path
 )
@@ -214,8 +221,14 @@ impl Directory for RamDirectory {
 }

 fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()> {
+fail_point!("RamDirectory::atomic_write", |msg| Err(io::Error::new(
+io::ErrorKind::Other,
+msg.unwrap_or_else(|| "Undefined".to_string())
+)));
 let path_buf = PathBuf::from(path);

 self.fs.write().unwrap().write(path_buf, data);

 if path == *META_FILEPATH {
 let _ = self.fs.write().unwrap().watch_router.broadcast();
 }
@@ -225,10 +238,6 @@ impl Directory for RamDirectory {
 fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
 Ok(self.fs.write().unwrap().watch(watch_callback))
 }
-
-fn sync_directory(&self) -> io::Result<()> {
-Ok(())
-}
 }

 #[cfg(test)]
@@ -118,6 +118,15 @@ mod ram_directory_tests {
 }
 }

+#[test]
+#[should_panic]
+fn ram_directory_panics_if_flush_forgotten() {
+let test_path: &'static Path = Path::new("some_path_for_test");
+let ram_directory = RamDirectory::create();
+let mut write_file = ram_directory.open_write(test_path).unwrap();
+assert!(write_file.write_all(&[4]).is_ok());
+}
+
 fn test_simple(directory: &dyn Directory) -> crate::Result<()> {
 let test_path: &'static Path = Path::new("some_path_for_test");
 let mut write_file = directory.open_write(test_path)?;
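The VecWriter hunk turns the old forgot-to-flush warning into a panic and adds a `#[should_panic]` test for it, so callers must flush or terminate a writer before letting it drop. A small sketch of the correct usage, built only from calls that appear in the hunks above (`RamDirectory::create`, `open_write`, `terminate`); treat it as an illustration of the contract rather than a snippet from either branch:

    use std::io::Write;
    use std::path::Path;
    use tantivy::directory::{Directory, RamDirectory, TerminatingWrite};

    fn write_and_terminate() -> tantivy::Result<()> {
        let directory = RamDirectory::create();
        let mut writer = directory.open_write(Path::new("some_file"))?;
        writer.write_all(&[1u8, 2, 3])?;
        // Without this, dropping the writer now panics (previously it only logged a warning).
        writer.terminate()?;
        Ok(())
    }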
@@ -1,8 +1,7 @@
 use crate::space_usage::ByteCount;
 use crate::DocId;
-use common::intersect_bitsets;
 use common::BitSet;
-use common::ReadOnlyBitSet;
+use common::ReadSerializedBitSet;
 use ownedbytes::OwnedBytes;
 use std::io;
 use std::io::Write;
@@ -21,19 +20,8 @@ pub fn write_alive_bitset<T: Write>(alive_bitset: &BitSet, writer: &mut T) -> io
 #[derive(Clone)]
 pub struct AliveBitSet {
 num_alive_docs: usize,
-bitset: ReadOnlyBitSet,
-}
-
-/// Intersects two AliveBitSets in a new one.
-/// The two bitsets need to have the same max_value.
-pub fn intersect_alive_bitsets(left: AliveBitSet, right: AliveBitSet) -> AliveBitSet {
-assert_eq!(left.bitset().max_value(), right.bitset().max_value());
-let bitset = intersect_bitsets(left.bitset(), right.bitset());
-let num_alive_docs = bitset.len();
-AliveBitSet {
-num_alive_docs,
-bitset,
-}
+bitset: ReadSerializedBitSet,
+num_bytes: ByteCount,
 }

 impl AliveBitSet {
@@ -50,15 +38,15 @@ impl AliveBitSet {
 Self::open(alive_bitset_bytes)
 }

-pub(crate) fn from_bitset(bitset: &BitSet) -> AliveBitSet {
-let readonly_bitset = ReadOnlyBitSet::from(bitset);
-AliveBitSet::from(readonly_bitset)
-}
-
 /// Opens a delete bitset given its file.
 pub fn open(bytes: OwnedBytes) -> AliveBitSet {
-let bitset = ReadOnlyBitSet::open(bytes);
-AliveBitSet::from(bitset)
+let num_bytes = bytes.len();
+let bitset = ReadSerializedBitSet::open(bytes);
+AliveBitSet {
+num_alive_docs: bitset.len(),
+bitset,
+num_bytes,
+}
 }

 /// Returns true iff the document is still "alive". In other words, if it has not been deleted.
@@ -73,7 +61,7 @@ impl AliveBitSet {
 !self.is_alive(doc)
 }

-/// Iterate over the alive doc_ids.
+/// Iterate over the alive docids.
 #[inline]
 pub fn iter_alive(&self) -> impl Iterator<Item = DocId> + '_ {
 self.bitset.iter()
@@ -81,7 +69,7 @@ impl AliveBitSet {

 /// Get underlying bitset
 #[inline]
-pub fn bitset(&self) -> &ReadOnlyBitSet {
+pub fn bitset(&self) -> &ReadSerializedBitSet {
 &self.bitset
 }

@@ -92,17 +80,7 @@ impl AliveBitSet {

 /// Summarize total space usage of this bitset.
 pub fn space_usage(&self) -> ByteCount {
-self.bitset().num_bytes()
-}
-}
-
-impl From<ReadOnlyBitSet> for AliveBitSet {
-fn from(bitset: ReadOnlyBitSet) -> AliveBitSet {
-let num_alive_docs = bitset.len();
-AliveBitSet {
-num_alive_docs,
-bitset,
-}
-}
+self.num_bytes
 }
 }
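The removed `intersect_alive_bitsets` function (and the `From<ReadOnlyBitSet>` impl) boil down to ANDing the two alive sets and recounting the surviving documents. A self-contained sketch of that semantics over plain `u64` words, which stand in here for the crate's read-only bitset type:

    // AND the alive sets word by word, then recount the documents that survive.
    fn intersect(left: &[u64], right: &[u64]) -> Vec<u64> {
        assert_eq!(left.len(), right.len(), "bitsets must cover the same doc range");
        left.iter().zip(right).map(|(l, r)| l & r).collect()
    }

    fn num_alive_docs(words: &[u64]) -> u32 {
        words.iter().map(|w| w.count_ones()).sum()
    }

    fn main() {
        let from_deletes = vec![0b1011u64, u64::MAX];
        let custom = vec![0b1110u64, 0];
        let both = intersect(&from_deletes, &custom);
        assert_eq!(num_alive_docs(&both), 2); // only documents alive in both sets remain
    }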
@@ -18,11 +18,11 @@ mod tests {
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);
 let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]))?;
-index_writer.add_document(doc!(bytes_field=>vec![]))?;
-index_writer.add_document(doc!(bytes_field=>vec![255u8]))?;
-index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]))?;
-index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]))?;
+index_writer.add_document(doc!(bytes_field=>vec![0u8, 1, 2, 3]));
+index_writer.add_document(doc!(bytes_field=>vec![]));
+index_writer.add_document(doc!(bytes_field=>vec![255u8]));
+index_writer.add_document(doc!(bytes_field=>vec![1u8, 3, 5, 7, 9]));
+index_writer.add_document(doc!(bytes_field=>vec![0u8; 1000]));
 index_writer.commit()?;
 let searcher = index.reader()?.searcher();
 let segment_reader = searcher.segment_reader(0);
@@ -47,7 +47,7 @@ mod tests {
 index_writer.add_document(doc!(
 field => b"tantivy".as_ref(),
 field => b"lucene".as_ref()
-))?;
+));
 index_writer.commit()?;
 Ok(index.reader()?.searcher())
 }
@@ -84,18 +84,18 @@ impl FacetReader {
 mod tests {
 use crate::Index;
 use crate::{
-schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED},
+schema::{Facet, FacetOptions, SchemaBuilder, Value, INDEXED, STORED},
 DocAddress, Document,
 };

 #[test]
 fn test_facet_only_indexed() -> crate::Result<()> {
 let mut schema_builder = SchemaBuilder::default();
-let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+let facet_field = schema_builder.add_facet_field("facet", INDEXED);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);
 let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
+index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
 index_writer.commit()?;
 let searcher = index.reader()?.searcher();
 let facet_reader = searcher
@@ -106,19 +106,42 @@ mod tests {
 facet_reader.facet_ords(0u32, &mut facet_ords);
 assert_eq!(&facet_ords, &[2u64]);
 let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
-let value = doc.get_first(facet_field).and_then(Value::facet);
+let value = doc.get_first(facet_field).and_then(Value::path);
 assert_eq!(value, None);
 Ok(())
 }

+#[test]
+fn test_facet_only_stored() -> crate::Result<()> {
+let mut schema_builder = SchemaBuilder::default();
+let facet_field = schema_builder.add_facet_field("facet", STORED);
+let schema = schema_builder.build();
+let index = Index::create_in_ram(schema);
+let mut index_writer = index.writer_for_tests()?;
+index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
+index_writer.commit()?;
+let searcher = index.reader()?.searcher();
+let facet_reader = searcher
+.segment_reader(0u32)
+.facet_reader(facet_field)
+.unwrap();
+let mut facet_ords = Vec::new();
+facet_reader.facet_ords(0u32, &mut facet_ords);
+assert!(facet_ords.is_empty());
+let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
+let value = doc.get_first(facet_field).and_then(Value::path);
+assert_eq!(value, Some("/a/b".to_string()));
+Ok(())
+}
+
 #[test]
 fn test_facet_stored_and_indexed() -> crate::Result<()> {
 let mut schema_builder = SchemaBuilder::default();
-let facet_field = schema_builder.add_facet_field("facet", STORED);
+let facet_field = schema_builder.add_facet_field("facet", STORED | INDEXED);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);
 let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
+index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
 index_writer.commit()?;
 let searcher = index.reader()?.searcher();
 let facet_reader = searcher
@@ -129,20 +152,43 @@ mod tests {
 facet_reader.facet_ords(0u32, &mut facet_ords);
 assert_eq!(&facet_ords, &[2u64]);
 let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
-let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::facet);
-assert_eq!(value, Facet::from_text("/a/b").ok().as_ref());
+let value = doc.get_first(facet_field).and_then(Value::path);
+assert_eq!(value, Some("/a/b".to_string()));
+Ok(())
+}
+
+#[test]
+fn test_facet_neither_stored_and_indexed() -> crate::Result<()> {
+let mut schema_builder = SchemaBuilder::default();
+let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+let schema = schema_builder.build();
+let index = Index::create_in_ram(schema);
+let mut index_writer = index.writer_for_tests()?;
+index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
+index_writer.commit()?;
+let searcher = index.reader()?.searcher();
+let facet_reader = searcher
+.segment_reader(0u32)
+.facet_reader(facet_field)
+.unwrap();
+let mut facet_ords = Vec::new();
+facet_reader.facet_ords(0u32, &mut facet_ords);
+assert!(facet_ords.is_empty());
+let doc = searcher.doc(DocAddress::new(0u32, 0u32))?;
+let value = doc.get_first(facet_field).and_then(Value::path);
+assert_eq!(value, None);
 Ok(())
 }

 #[test]
 fn test_facet_not_populated_for_all_docs() -> crate::Result<()> {
 let mut schema_builder = SchemaBuilder::default();
-let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+let facet_field = schema_builder.add_facet_field("facet", INDEXED);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);
 let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?;
-index_writer.add_document(Document::default())?;
+index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
+index_writer.add_document(Document::default());
 index_writer.commit()?;
 let searcher = index.reader()?.searcher();
 let facet_reader = searcher
@@ -160,12 +206,12 @@ mod tests {
 #[test]
 fn test_facet_not_populated_for_any_docs() -> crate::Result<()> {
 let mut schema_builder = SchemaBuilder::default();
-let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+let facet_field = schema_builder.add_facet_field("facet", INDEXED);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);
 let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(Document::default())?;
-index_writer.add_document(Document::default())?;
+index_writer.add_document(Document::default());
+index_writer.add_document(Document::default());
 index_writer.commit()?;
 let searcher = index.reader()?.searcher();
 let facet_reader = searcher
@@ -23,7 +23,6 @@ values stored.
|
|||||||
Read access performance is comparable to that of an array lookup.
|
Read access performance is comparable to that of an array lookup.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
pub use self::alive_bitset::intersect_alive_bitsets;
|
|
||||||
pub use self::alive_bitset::write_alive_bitset;
|
pub use self::alive_bitset::write_alive_bitset;
|
||||||
pub use self::alive_bitset::AliveBitSet;
|
pub use self::alive_bitset::AliveBitSet;
|
||||||
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
|
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
|
||||||
@@ -110,7 +109,7 @@ impl FastValue for u64 {
|
|||||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||||
match *field_type {
|
match *field_type {
|
||||||
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||||
FieldType::Facet(_) => Some(Cardinality::MultiValues),
|
FieldType::HierarchicalFacet(_) => Some(Cardinality::MultiValues),
|
||||||
_ => None,
|
_ => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -497,18 +496,18 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
-fn test_merge_missing_date_fast_field() -> crate::Result<()> {
+fn test_merge_missing_date_fast_field() {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
-index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()))?;
-index_writer.commit()?;
-index_writer.add_document(doc!())?;
-index_writer.commit()?;
-let reader = index.reader()?;
+index_writer.add_document(doc!(date_field =>crate::chrono::prelude::Utc::now()));
+index_writer.commit().unwrap();
+index_writer.add_document(doc!());
+index_writer.commit().unwrap();
+let reader = index.reader().unwrap();
let segment_ids: Vec<SegmentId> = reader
.searcher()
.segment_readers()
@@ -517,10 +516,10 @@ mod tests {
.collect();
assert_eq!(segment_ids.len(), 2);
let merge_future = index_writer.merge(&segment_ids[..]);
-futures::executor::block_on(merge_future)?;
-reader.reload()?;
+let merge_res = futures::executor::block_on(merge_future);
+assert!(merge_res.is_ok());
+assert!(reader.reload().is_ok());
assert_eq!(reader.searcher().segment_readers().len(), 1);
-Ok(())
}

#[test]
@@ -529,7 +528,7 @@ mod tests {
}

#[test]
-fn test_datefastfield() -> crate::Result<()> {
+fn test_datefastfield() {
use crate::fastfield::FastValue;
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", FAST);
@@ -539,22 +538,22 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
-let mut index_writer = index.writer_for_tests()?;
+let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(1i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(2i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(3i64.to_u64())
-))?;
+));
index_writer.add_document(doc!(
date_field => crate::DateTime::from_u64(4i64.to_u64())
-))?;
+));
index_writer.add_document(doc!(
multi_date_field => crate::DateTime::from_u64(5i64.to_u64()),
multi_date_field => crate::DateTime::from_u64(6i64.to_u64())
-))?;
-index_writer.commit()?;
-let reader = index.reader()?;
+));
+index_writer.commit().unwrap();
+let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
@@ -581,7 +580,6 @@ mod tests {
assert_eq!(dates[0].timestamp(), 5i64);
assert_eq!(dates[1].timestamp(), 6i64);
}
-Ok(())
}
}
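The date tests above build fast-field values with `crate::DateTime::from_u64(1i64.to_u64())`, i.e. a signed value is converted to an order-preserving unsigned representation before it is stored. A minimal sketch of that kind of sign-bit-flip mapping, assuming only the standard trick and not tantivy's exact `FastValue` implementation:

// Order-preserving mapping between i64 and u64: flipping the sign bit keeps
// numeric ordering intact when values are compared as unsigned integers.
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

fn u64_to_i64(val: u64) -> i64 {
    (val ^ (1u64 << 63)) as i64
}

fn main() {
    assert!(i64_to_u64(-1) < i64_to_u64(0));
    assert!(i64_to_u64(0) < i64_to_u64(1));
    assert_eq!(u64_to_i64(i64_to_u64(42)), 42); // round trip is lossless
}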
@@ -12,9 +12,9 @@ mod tests {
use crate::query::QueryParser;
use crate::schema::Cardinality;
use crate::schema::Facet;
-use crate::schema::FacetOptions;
use crate::schema::IntOptions;
use crate::schema::Schema;
+use crate::schema::INDEXED;
use crate::Document;
use crate::Index;
use crate::Term;
@@ -23,10 +23,10 @@ mod tests {
use proptest::prop_oneof;
use proptest::proptest;
use proptest::strategy::Strategy;
-use test_log::test;
+use test_env_log::test;

#[test]
-fn test_multivalued_u64() -> crate::Result<()> {
+fn test_multivalued_u64() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field(
"multifield",
@@ -34,17 +34,17 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
-let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(field=>1u64, field=>3u64))?;
-index_writer.add_document(doc!())?;
-index_writer.add_document(doc!(field=>4u64))?;
-index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64))?;
-index_writer.commit()?;
+let mut index_writer = index.writer_for_tests().unwrap();
+index_writer.add_document(doc!(field=>1u64, field=>3u64));
+index_writer.add_document(doc!());
+index_writer.add_document(doc!(field=>4u64));
+index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
+assert!(index_writer.commit().is_ok());

-let searcher = index.reader()?.searcher();
+let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let mut vals = Vec::new();
-let multi_value_reader = segment_reader.fast_fields().u64s(field)?;
+let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
{
multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4u64]);
@@ -57,55 +57,56 @@ mod tests {
multi_value_reader.get_vals(1, &mut vals);
assert!(vals.is_empty());
}
-Ok(())
}

#[test]
-fn test_multivalued_date() -> crate::Result<()> {
+fn test_multivalued_date() {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field(
"multi_date_field",
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed()
-.set_fieldnorm()
.set_stored(),
);
let time_i =
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
-let mut index_writer = index.writer_for_tests()?;
+let mut index_writer = index.writer_for_tests().unwrap();
let first_time_stamp = chrono::Utc::now();
index_writer.add_document(
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
-)?;
-index_writer.add_document(doc!(time_i=>0i64))?;
+);
+index_writer.add_document(doc!(time_i=>0i64));
// add one second
-index_writer.add_document(
-doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64),
-)?;
+index_writer
+.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
// add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
-index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64))?;
+index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
// add three seconds
-index_writer.add_document(
-doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64),
-)?;
-index_writer.commit()?;
+index_writer
+.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(3), time_i=>4i64));
+assert!(index_writer.commit().is_ok());

-let reader = index.reader()?;
+let reader = index.reader().unwrap();
let searcher = reader.searcher();
let reader = searcher.segment_reader(0);
assert_eq!(reader.num_docs(), 5);

{
let parser = QueryParser::for_index(&index, vec![date_field]);
-let query = parser.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))?;
-let results = searcher.search(&query, &TopDocs::with_limit(5))?;
+let query = parser
+.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()))
+.expect("could not parse query");
+let results = searcher
+.search(&query, &TopDocs::with_limit(5))
+.expect("could not query index");

assert_eq!(results.len(), 1);
for (_score, doc_address) in results {
-let retrieved_doc = searcher.doc(doc_address)?;
+let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
assert_eq!(
retrieved_doc
.get_first(date_field)
@@ -127,8 +128,12 @@ mod tests {

{
let parser = QueryParser::for_index(&index, vec![date_field]);
-let query = parser.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))?;
-let results = searcher.search(&query, &TopDocs::with_limit(5))?;
+let query = parser
+.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()))
+.expect("could not parse query");
+let results = searcher
+.search(&query, &TopDocs::with_limit(5))
+.expect("could not query index");

assert_eq!(results.len(), 1);

@@ -160,8 +165,10 @@ mod tests {
(first_time_stamp + Duration::seconds(1)).to_rfc3339(),
(first_time_stamp + Duration::seconds(3)).to_rfc3339()
);
-let query = parser.parse_query(&range_q)?;
-let results = searcher.search(&query, &TopDocs::with_limit(5))?;
+let query = parser.parse_query(&range_q).expect("could not parse query");
+let results = searcher
+.search(&query, &TopDocs::with_limit(5))
+.expect("could not query index");

assert_eq!(results.len(), 2);
for (i, doc_pair) in results.iter().enumerate() {
@@ -189,16 +196,16 @@ mod tests {
retrieved_doc
.get_first(time_i)
.expect("cannot find value")
-.i64_value(),
-Some(time_i_val)
+.i64_value()
+.expect("value not of i64 type"),
+time_i_val
);
}
}
-Ok(())
}

#[test]
-fn test_multivalued_i64() -> crate::Result<()> {
+fn test_multivalued_i64() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_i64_field(
"multifield",
@@ -206,14 +213,14 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
-let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(field=> 1i64, field => 3i64))?;
-index_writer.add_document(doc!())?;
-index_writer.add_document(doc!(field=> -4i64))?;
-index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64))?;
-index_writer.commit()?;
+let mut index_writer = index.writer_for_tests().unwrap();
+index_writer.add_document(doc!(field=> 1i64, field => 3i64));
+index_writer.add_document(doc!());
+index_writer.add_document(doc!(field=> -4i64));
+index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
+assert!(index_writer.commit().is_ok());

-let searcher = index.reader()?.searcher();
+let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let mut vals = Vec::new();
let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
@@ -225,10 +232,9 @@ mod tests {
assert!(vals.is_empty());
multi_value_reader.get_vals(3, &mut vals);
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
-Ok(())
}

-fn test_multivalued_no_panic(ops: &[IndexingOp]) -> crate::Result<()> {
+fn test_multivalued_no_panic(ops: &[IndexingOp]) {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field(
"multifield",
@@ -238,7 +244,7 @@ mod tests {
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
-let mut index_writer = index.writer_for_tests()?;
+let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));

for &op in ops {
@@ -246,19 +252,19 @@ mod tests {
IndexingOp::AddDoc { id } => {
match id % 3 {
0 => {
-index_writer.add_document(doc!())?;
+index_writer.add_document(doc!());
}
1 => {
let mut doc = Document::new();
for _ in 0..5001 {
doc.add_u64(field, id as u64);
}
-index_writer.add_document(doc)?;
+index_writer.add_document(doc);
}
_ => {
let mut doc = Document::new();
doc.add_u64(field, id as u64);
-index_writer.add_document(doc)?;
+index_writer.add_document(doc);
}
};
}
@@ -269,16 +275,18 @@ mod tests {
index_writer.commit().unwrap();
}
IndexingOp::Merge => {
-let segment_ids = index.searchable_segment_ids()?;
+let segment_ids = index
+.searchable_segment_ids()
+.expect("Searchable segments failed.");
if segment_ids.len() >= 2 {
-block_on(index_writer.merge(&segment_ids))?;
-index_writer.segment_updater().wait_merging_thread()?;
+block_on(index_writer.merge(&segment_ids)).unwrap();
+assert!(index_writer.segment_updater().wait_merging_thread().is_ok());
}
}
}
}

-index_writer.commit()?;
+assert!(index_writer.commit().is_ok());

// Merging the segments
{
@@ -290,7 +298,6 @@ mod tests {
assert!(index_writer.wait_merging_threads().is_ok());
}
}
-Ok(())
}

#[derive(Debug, Clone, Copy)]
@@ -313,7 +320,7 @@ mod tests {
proptest! {
#[test]
fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
-assert!(test_multivalued_no_panic(&ops[..]).is_ok());
+test_multivalued_no_panic(&ops[..]);
}
}

@@ -328,22 +335,20 @@ mod tests {
Merge,
];

-assert!(test_multivalued_no_panic(&ops[..]).is_ok());
+test_multivalued_no_panic(&ops[..]);
}

#[test]
#[ignore]
-fn test_many_facets() -> crate::Result<()> {
+fn test_many_facets() {
let mut schema_builder = Schema::builder();
-let field = schema_builder.add_facet_field("facetfield", FacetOptions::default());
+let field = schema_builder.add_facet_field("facetfield", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
-let mut index_writer = index.writer_for_tests()?;
+let mut index_writer = index.writer_for_tests().unwrap();
for i in 0..100_000 {
-index_writer
-.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())))?;
+index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
}
-index_writer.commit()?;
-Ok(())
+assert!(index_writer.commit().is_ok());
}
}
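The multivalued tests above all follow the same flow: declare a fast field with `Cardinality::MultiValues`, index a few documents, then read back all values of a document with `get_vals`. A condensed sketch of that flow, using the same API names the tests call (exact signatures differ slightly between the two branches compared here, so treat this as a sketch rather than a reference):

use tantivy::schema::{Cardinality, IntOptions, Schema};
use tantivy::{doc, Index};

fn multivalued_roundtrip() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // A u64 field that keeps every value of a document in the fast-field store.
    let field = schema_builder.add_u64_field(
        "multifield",
        IntOptions::default().set_fast(Cardinality::MultiValues),
    );
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(field => 1u64, field => 3u64));
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let multi_value_reader = searcher.segment_reader(0).fast_fields().u64s(field)?;
    let mut vals = Vec::new();
    multi_value_reader.get_vals(0, &mut vals); // all values of doc 0, in insertion order
    assert_eq!(&vals, &[1u64, 3u64]);
    Ok(())
}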
@@ -91,25 +91,27 @@ impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
mod tests {

use crate::core::Index;
-use crate::schema::{Cardinality, Facet, FacetOptions, IntOptions, Schema};
+use crate::schema::{Cardinality, Facet, IntOptions, Schema, INDEXED};

#[test]
-fn test_multifastfield_reader() -> crate::Result<()> {
+fn test_multifastfield_reader() {
let mut schema_builder = Schema::builder();
-let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default());
+let facet_field = schema_builder.add_facet_field("facets", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
-let mut index_writer = index.writer_for_tests()?;
+let mut index_writer = index
+.writer_for_tests()
+.expect("Failed to create index writer.");
index_writer.add_document(doc!(
facet_field => Facet::from("/category/cat2"),
facet_field => Facet::from("/category/cat1"),
-))?;
-index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")))?;
-index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")))?;
-index_writer.commit()?;
-let searcher = index.reader()?.searcher();
+));
+index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
+index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
+index_writer.commit().expect("Commit failed");
+let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
-let mut facet_reader = segment_reader.facet_reader(facet_field)?;
+let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();

let mut facet = Facet::root();
{
@@ -143,11 +145,10 @@ mod tests {
facet_reader.facet_ords(2, &mut vals);
assert_eq!(&vals[..], &[4]);
}
-Ok(())
}

#[test]
-fn test_multifastfield_reader_min_max() -> crate::Result<()> {
+fn test_multifastfield_reader_min_max() {
let mut schema_builder = Schema::builder();
let field_options = IntOptions::default()
.set_indexed()
@@ -162,16 +163,15 @@ mod tests {
item_field => 2i64,
item_field => 3i64,
item_field => -2i64,
-))?;
-index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64))?;
-index_writer.add_document(doc!(item_field => 4i64))?;
-index_writer.commit()?;
-let searcher = index.reader()?.searcher();
+));
+index_writer.add_document(doc!(item_field => 6i64, item_field => 3i64));
+index_writer.add_document(doc!(item_field => 4i64));
+index_writer.commit().expect("Commit failed");
+let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
-let field_reader = segment_reader.fast_fields().i64s(item_field)?;
+let field_reader = segment_reader.fast_fields().i64s(item_field).unwrap();

assert_eq!(field_reader.min_value(), -2);
assert_eq!(field_reader.max_value(), 6);
-Ok(())
}
}

@@ -40,7 +40,7 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality
FieldType::Date(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::Date, cardinality)),
-FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
+FieldType::HierarchicalFacet(_) => Some((FastType::U64, Cardinality::MultiValues)),
_ => None,
}
}

@@ -54,7 +54,7 @@ impl FastFieldsWriter {
None => {}
}
}
-FieldType::Facet(_) => {
+FieldType::HierarchicalFacet(_) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
multi_values_writers.push(fast_field_writer);
}
@@ -26,137 +26,3 @@ pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter;

use self::code::{fieldnorm_to_id, id_to_fieldnorm};

-#[cfg(test)]
-mod tests {
-use crate::directory::CompositeFile;
-use crate::directory::{Directory, RamDirectory, WritePtr};
-use crate::fieldnorm::FieldNormReader;
-use crate::fieldnorm::FieldNormsSerializer;
-use crate::fieldnorm::FieldNormsWriter;
-use crate::query::Query;
-use crate::query::TermQuery;
-use crate::schema::IndexRecordOption;
-use crate::schema::TextFieldIndexing;
-use crate::schema::TextOptions;
-use crate::schema::TEXT;
-use crate::Index;
-use crate::Term;
-use crate::TERMINATED;
-use once_cell::sync::Lazy;
-use std::path::Path;
-
-use crate::schema::{Field, Schema, STORED};
-
-pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
-let mut schema_builder = Schema::builder();
-schema_builder.add_text_field("field", STORED);
-schema_builder.add_text_field("txt_field", TEXT);
-schema_builder.add_text_field(
-"str_field",
-TextOptions::default().set_indexing_options(
-TextFieldIndexing::default()
-.set_index_option(IndexRecordOption::Basic)
-.set_fieldnorms(false),
-),
-);
-schema_builder.build()
-});
-
-pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
-pub static TXT_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap());
-pub static STR_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("str_field").unwrap());
-
-#[test]
-#[should_panic(expected = "Cannot register a given fieldnorm twice")]
-pub fn test_should_panic_when_recording_fieldnorm_twice_for_same_doc() {
-let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
-fieldnorm_writers.record(0u32, *TXT_FIELD, 5);
-fieldnorm_writers.record(0u32, *TXT_FIELD, 3);
-}
-
-#[test]
-pub fn test_fieldnorm() -> crate::Result<()> {
-let path = Path::new("test");
-let directory: RamDirectory = RamDirectory::create();
-{
-let write: WritePtr = directory.open_write(Path::new("test"))?;
-let serializer = FieldNormsSerializer::from_write(write)?;
-let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
-fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
-fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
-fieldnorm_writers.serialize(serializer, None)?;
-}
-let file = directory.open_read(&path)?;
-{
-let fields_composite = CompositeFile::open(&file)?;
-assert!(fields_composite.open_read(*FIELD).is_none());
-assert!(fields_composite.open_read(*STR_FIELD).is_none());
-let data = fields_composite.open_read(*TXT_FIELD).unwrap();
-let fieldnorm_reader = FieldNormReader::open(data)?;
-assert_eq!(fieldnorm_reader.fieldnorm(0u32), 0u32);
-assert_eq!(fieldnorm_reader.fieldnorm(1u32), 0u32);
-assert_eq!(fieldnorm_reader.fieldnorm(2u32), 5u32);
-assert_eq!(fieldnorm_reader.fieldnorm(3u32), 3u32);
-}
-Ok(())
-}
-
-#[test]
-fn test_fieldnorm_disabled() -> crate::Result<()> {
-let mut schema_builder = Schema::builder();
-let text_options = TextOptions::default()
-.set_indexing_options(TextFieldIndexing::default().set_fieldnorms(false));
-let text = schema_builder.add_text_field("text", text_options);
-let schema = schema_builder.build();
-let index = Index::create_in_ram(schema);
-let mut writer = index.writer_for_tests()?;
-writer.add_document(doc!(text=>"hello"))?;
-writer.add_document(doc!(text=>"hello hello hello"))?;
-writer.commit()?;
-let reader = index.reader()?;
-let searcher = reader.searcher();
-let query = TermQuery::new(
-Term::from_field_text(text, "hello"),
-IndexRecordOption::WithFreqs,
-);
-let weight = query.weight(&*searcher, true)?;
-let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
-assert_eq!(scorer.doc(), 0);
-assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
-assert_eq!(scorer.advance(), 1);
-assert_eq!(scorer.doc(), 1);
-assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
-assert_eq!(scorer.advance(), TERMINATED);
-Ok(())
-}
-
-#[test]
-fn test_fieldnorm_enabled() -> crate::Result<()> {
-let mut schema_builder = Schema::builder();
-let text_options = TextOptions::default()
-.set_indexing_options(TextFieldIndexing::default().set_fieldnorms(true));
-let text = schema_builder.add_text_field("text", text_options);
-let schema = schema_builder.build();
-let index = Index::create_in_ram(schema);
-let mut writer = index.writer_for_tests()?;
-writer.add_document(doc!(text=>"hello"))?;
-writer.add_document(doc!(text=>"hello hello hello"))?;
-writer.commit()?;
-let reader = index.reader()?;
-let searcher = reader.searcher();
-let query = TermQuery::new(
-Term::from_field_text(text, "hello"),
-IndexRecordOption::WithFreqs,
-);
-let weight = query.weight(&*searcher, true)?;
-let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
-assert_eq!(scorer.doc(), 0);
-assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
-assert_eq!(scorer.advance(), 1);
-assert_eq!(scorer.doc(), 1);
-assert!((scorer.score() - 0.15136132).abs() < 0.001f32);
-assert_eq!(scorer.advance(), TERMINATED);
-Ok(())
-}
-}
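The removed test module above exercises two invariants of fieldnorm tracking: every document gets exactly one fieldnorm byte per field (unrecorded documents default to 0), and recording the same document twice panics. A minimal standalone sketch of that contract, not tantivy's actual implementation:

// One fieldnorm byte slot per document, filled in increasing doc id order.
struct NormTable {
    norms: Vec<u8>,
}

impl NormTable {
    fn new() -> Self {
        NormTable { norms: Vec::new() }
    }

    fn record(&mut self, doc: u32, fieldnorm: u8) {
        assert!(
            self.norms.len() <= doc as usize,
            "Cannot register a given fieldnorm twice"
        );
        // Documents that were skipped implicitly get a fieldnorm of 0.
        self.norms.resize(doc as usize, 0u8);
        self.norms.push(fieldnorm);
    }

    fn fieldnorm(&self, doc: u32) -> u8 {
        self.norms.get(doc as usize).copied().unwrap_or(0)
    }
}

fn main() {
    let mut table = NormTable::new();
    table.record(2, 5);
    table.record(3, 3);
    assert_eq!(table.fieldnorm(0), 0); // never recorded => 0
    assert_eq!(table.fieldnorm(2), 5);
}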
@@ -4,7 +4,6 @@ use super::fieldnorm_to_id
use super::FieldNormsSerializer;
use crate::schema::Field;
use crate::schema::Schema;
-use std::cmp::Ordering;
use std::{io, iter};

/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
@@ -13,7 +12,8 @@ use std::{io, iter};
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
/// byte per document per field.
pub struct FieldNormsWriter {
-fieldnorms_buffers: Vec<Option<Vec<u8>>>,
+fields: Vec<Field>,
+fieldnorms_buffer: Vec<Vec<u8>>,
}

impl FieldNormsWriter {
@@ -23,7 +23,7 @@ impl FieldNormsWriter {
schema
.fields()
.filter_map(|(field, field_entry)| {
-if field_entry.is_indexed() && field_entry.has_fieldnorms() {
+if field_entry.is_indexed() {
Some(field)
} else {
None
@@ -35,20 +35,25 @@ impl FieldNormsWriter {
/// Initialize with state for tracking the field norm fields
/// specified in the schema.
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
-let mut fieldnorms_buffers: Vec<Option<Vec<u8>>> = iter::repeat_with(|| None)
-.take(schema.num_fields())
-.collect();
-for field in FieldNormsWriter::fields_with_fieldnorm(schema) {
-fieldnorms_buffers[field.field_id() as usize] = Some(Vec::with_capacity(1_000));
+let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
+let max_field = fields
+.iter()
+.map(Field::field_id)
+.max()
+.map(|max_field_id| max_field_id as usize + 1)
+.unwrap_or(0);
+FieldNormsWriter {
+fields,
+fieldnorms_buffer: iter::repeat_with(Vec::new)
+.take(max_field)
+.collect::<Vec<_>>(),
}
-FieldNormsWriter { fieldnorms_buffers }
}

/// The memory used inclusive childs
pub fn mem_usage(&self) -> usize {
-self.fieldnorms_buffers
+self.fieldnorms_buffer
.iter()
-.flatten()
.map(|buf| buf.capacity())
.sum()
}
@@ -57,10 +62,8 @@ impl FieldNormsWriter {
///
/// Will extend with 0-bytes for documents that have not been seen.
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
-for fieldnorms_buffer_opt in self.fieldnorms_buffers.iter_mut() {
-if let Some(fieldnorms_buffer) = fieldnorms_buffer_opt.as_mut() {
-fieldnorms_buffer.resize(max_doc as usize, 0u8);
-}
+for field in self.fields.iter() {
+self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
}
}

@@ -73,23 +76,14 @@ impl FieldNormsWriter {
/// * field - the field being set
/// * fieldnorm - the number of terms present in document `doc` in field `field`
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
-if let Some(fieldnorm_buffer) = self
-.fieldnorms_buffers
-.get_mut(field.field_id() as usize)
-.and_then(Option::as_mut)
-{
-match fieldnorm_buffer.len().cmp(&(doc as usize)) {
-Ordering::Less => {
-// we fill intermediary `DocId` as having a fieldnorm of 0.
-fieldnorm_buffer.resize(doc as usize, 0u8);
-}
-Ordering::Equal => {}
-Ordering::Greater => {
-panic!("Cannot register a given fieldnorm twice")
-}
-}
-fieldnorm_buffer.push(fieldnorm_to_id(fieldnorm));
-}
+let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
+assert!(
+fieldnorm_buffer.len() <= doc as usize,
+"Cannot register a given fieldnorm twice"
+);
+// we fill intermediary `DocId` as having a fieldnorm of 0.
+fieldnorm_buffer.resize(doc as usize + 1, 0u8);
+fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
}

/// Serialize the seen fieldnorm values to the serializer for all fields.
@@ -98,18 +92,17 @@ impl FieldNormsWriter {
mut fieldnorms_serializer: FieldNormsSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
-for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map(
-|(field_id, fieldnorms_buffer_opt)| {
-fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
-(Field::from_field_id(field_id as u32), fieldnorms_buffer)
-})
-},
-) {
+for &field in self.fields.iter() {
+let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
if let Some(doc_id_map) = doc_id_map {
-let remapped_fieldnorm_buffer = doc_id_map.remap(fieldnorms_buffer);
-fieldnorms_serializer.serialize_field(field, &remapped_fieldnorm_buffer)?;
+let mut mapped_fieldnorm_values = vec![];
+mapped_fieldnorm_values.resize(fieldnorm_values.len(), 0u8);
+for (new_doc_id, old_doc_id) in doc_id_map.iter_old_doc_ids().enumerate() {
+mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[old_doc_id as usize];
+}
+fieldnorms_serializer.serialize_field(field, &mapped_fieldnorm_values)?;
} else {
-fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
+fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
}
}
fieldnorms_serializer.close()?;
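Both sides of the serialize() change above apply the same permutation before writing: for each new doc id position, take the fieldnorm of the old doc id it maps to (one side via `doc_id_map.remap`, the other via an explicit loop). A standalone sketch of that remap, independent of tantivy's DocIdMapping type:

fn remap<T: Copy>(new_to_old: &[u32], old_values: &[T]) -> Vec<T> {
    new_to_old
        .iter()
        .map(|&old_doc| old_values[old_doc as usize]) // value of the old doc at this new position
        .collect()
}

fn main() {
    let new_to_old = [2u32, 0, 1]; // new doc 0 was old doc 2, etc.
    let old_values = [b'a', b'b', b'c'];
    assert_eq!(remap(&new_to_old, &old_values), vec![b'c', b'a', b'b']);
}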
@@ -49,7 +49,7 @@ fn test_functional_store() -> crate::Result<()> {
}
for _ in 0..num_docs {
doc_set.push(doc_id);
-index_writer.add_document(doc!(id_field=>doc_id))?;
+index_writer.add_document(doc!(id_field=>doc_id));
doc_id += 1;
}
index_writer.commit()?;
@@ -124,7 +124,7 @@ fn test_functional_indexing_sorted() -> crate::Result<()> {
doc.add_u64(multiples_field, random_val * i);
}
doc.add_text(text_field, get_text());
-index_writer.add_document(doc)?;
+index_writer.add_document(doc);
}
}
Ok(())
@@ -201,7 +201,7 @@ fn test_functional_indexing_unsorted() -> crate::Result<()> {
doc.add_u64(multiples_field, random_val * i);
}
doc.add_text(text_field, get_text());
-index_writer.add_document(doc)?;
+index_writer.add_document(doc);
}
}
Ok(())
@@ -1,324 +0,0 @@
-use common::BitSet;
-use itertools::Itertools;
-
-use crate::fastfield::AliveBitSet;
-use crate::{merge_filtered_segments, Directory, Index, IndexSettings, Segment, SegmentOrdinal};
-/// DemuxMapping can be used to reorganize data from multiple segments.
-///
-/// DemuxMapping is useful in a multitenant settings, in which each document might actually belong to a different tenant.
-/// It allows to reorganize documents as follows:
-///
-/// e.g. if you have two tenant ids TENANT_A and TENANT_B and two segments with
-/// the documents (simplified)
-/// Seg 1 [TENANT_A, TENANT_B]
-/// Seg 2 [TENANT_A, TENANT_B]
-///
-/// You may want to group your documents to
-/// Seg 1 [TENANT_A, TENANT_A]
-/// Seg 2 [TENANT_B, TENANT_B]
-///
-/// Demuxing is the tool for that.
-/// Semantically you can define a mapping from [old segment ordinal, old doc_id] -> [new segment ordinal].
-#[derive(Debug, Default)]
-pub struct DemuxMapping {
-/// [index old segment ordinal] -> [index doc_id] = new segment ordinal
-mapping: Vec<DocIdToSegmentOrdinal>,
-}
-
-/// DocIdToSegmentOrdinal maps from doc_id within a segment to the new segment ordinal for demuxing.
-///
-/// For every source segment there is a `DocIdToSegmentOrdinal` to distribute its doc_ids.
-#[derive(Debug, Default)]
-pub struct DocIdToSegmentOrdinal {
-doc_id_index_to_segment_ord: Vec<SegmentOrdinal>,
-}
-
-impl DocIdToSegmentOrdinal {
-/// Creates a new DocIdToSegmentOrdinal with size of num_doc_ids.
-/// Initially all doc_ids point to segment ordinal 0 and need to be set
-/// the via `set` method.
-pub fn with_max_doc(max_doc: usize) -> Self {
-DocIdToSegmentOrdinal {
-doc_id_index_to_segment_ord: vec![0; max_doc],
-}
-}
-
-/// Returns the number of documents in this mapping.
-/// It should be equal to the `max_doc` of the segment it targets.
-pub fn max_doc(&self) -> u32 {
-self.doc_id_index_to_segment_ord.len() as u32
-}
-
-/// Associates a doc_id with an output `SegmentOrdinal`.
-pub fn set(&mut self, doc_id: u32, segment_ord: SegmentOrdinal) {
-self.doc_id_index_to_segment_ord[doc_id as usize] = segment_ord;
-}
-
-/// Iterates over the new SegmentOrdinal in the order of the doc_id.
-pub fn iter(&self) -> impl Iterator<Item = SegmentOrdinal> + '_ {
-self.doc_id_index_to_segment_ord.iter().cloned()
-}
-}
-
-impl DemuxMapping {
-/// Adds a DocIdToSegmentOrdinal. The order of the pus calls
-/// defines the old segment ordinal. e.g. first push = ordinal 0.
-pub fn add(&mut self, segment_mapping: DocIdToSegmentOrdinal) {
-self.mapping.push(segment_mapping);
-}
-
-/// Returns the old number of segments.
-pub fn get_old_num_segments(&self) -> usize {
-self.mapping.len()
-}
-}
-
-fn docs_for_segment_ord(
-doc_id_to_segment_ord: &DocIdToSegmentOrdinal,
-target_segment_ord: SegmentOrdinal,
-) -> AliveBitSet {
-let mut bitset = BitSet::with_max_value(doc_id_to_segment_ord.max_doc());
-for doc_id in doc_id_to_segment_ord
-.iter()
-.enumerate()
-.filter(|(_doc_id, new_segment_ord)| *new_segment_ord == target_segment_ord)
-.map(|(doc_id, _)| doc_id)
-{
-// add document if segment ordinal = target segment ordinal
-bitset.insert(doc_id as u32);
-}
-AliveBitSet::from_bitset(&bitset)
-}
-
-fn get_alive_bitsets(
-demux_mapping: &DemuxMapping,
-target_segment_ord: SegmentOrdinal,
-) -> Vec<AliveBitSet> {
-demux_mapping
-.mapping
-.iter()
-.map(|doc_id_to_segment_ord| {
-docs_for_segment_ord(doc_id_to_segment_ord, target_segment_ord)
-})
-.collect_vec()
-}
-
-/// Demux the segments according to `demux_mapping`. See `DemuxMapping`.
-/// The number of output_directories need to match max new segment ordinal from `demux_mapping`.
-///
-/// The ordinal of `segments` need to match the ordinals provided in `demux_mapping`.
-pub fn demux(
-segments: &[Segment],
-demux_mapping: &DemuxMapping,
-target_settings: IndexSettings,
-output_directories: Vec<Box<dyn Directory>>,
-) -> crate::Result<Vec<Index>> {
-let mut indices = vec![];
-for (target_segment_ord, output_directory) in output_directories.into_iter().enumerate() {
-let delete_bitsets = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
-.into_iter()
-.map(Some)
-.collect_vec();
-let index = merge_filtered_segments(
-segments,
-target_settings.clone(),
-delete_bitsets,
-output_directory,
-)?;
-indices.push(index);
-}
-Ok(indices)
-}
-
-#[cfg(test)]
-mod tests {
-use crate::{
-collector::TopDocs,
-directory::RamDirectory,
-query::QueryParser,
-schema::{Schema, TEXT},
-DocAddress, Term,
-};
-
-use super::*;
-
-#[test]
-fn test_demux_map_to_deletebitset() {
-let max_value = 2;
-let mut demux_mapping = DemuxMapping::default();
-//segment ordinal 0 mapping
-let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
-doc_id_to_segment.set(0, 1);
-doc_id_to_segment.set(1, 0);
-demux_mapping.add(doc_id_to_segment);
-
-//segment ordinal 1 mapping
-let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
-doc_id_to_segment.set(0, 1);
-doc_id_to_segment.set(1, 1);
-demux_mapping.add(doc_id_to_segment);
-{
-let bit_sets_for_demuxing_to_segment_ord_0 = get_alive_bitsets(&demux_mapping, 0);
-
-assert_eq!(
-bit_sets_for_demuxing_to_segment_ord_0[0].is_deleted(0),
-true
-);
-assert_eq!(
-bit_sets_for_demuxing_to_segment_ord_0[0].is_deleted(1),
-false
-);
-assert_eq!(
-bit_sets_for_demuxing_to_segment_ord_0[1].is_deleted(0),
-true
-);
-assert_eq!(
-bit_sets_for_demuxing_to_segment_ord_0[1].is_deleted(1),
-true
-);
-}
-
-{
-let bit_sets_for_demuxing_to_segment_ord_1 = get_alive_bitsets(&demux_mapping, 1);
-
-assert_eq!(
-bit_sets_for_demuxing_to_segment_ord_1[0].is_deleted(0),
-false
-);
-assert_eq!(
-bit_sets_for_demuxing_to_segment_ord_1[0].is_deleted(1),
-true
-);
-assert_eq!(
-bit_sets_for_demuxing_to_segment_ord_1[1].is_deleted(0),
-false
-);
-assert_eq!(
-bit_sets_for_demuxing_to_segment_ord_1[1].is_deleted(1),
-false
-);
-}
-}
-
-#[test]
-fn test_demux_segments() -> crate::Result<()> {
-let first_index = {
-let mut schema_builder = Schema::builder();
-let text_field = schema_builder.add_text_field("text", TEXT);
-let index = Index::create_in_ram(schema_builder.build());
-let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(text_field=>"texto1"))?;
-index_writer.add_document(doc!(text_field=>"texto2"))?;
-index_writer.commit()?;
-index
-};
-
-let second_index = {
-let mut schema_builder = Schema::builder();
-let text_field = schema_builder.add_text_field("text", TEXT);
-let index = Index::create_in_ram(schema_builder.build());
-let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(text_field=>"texto3"))?;
-index_writer.add_document(doc!(text_field=>"texto4"))?;
-index_writer.delete_term(Term::from_field_text(text_field, "4"));
-
-index_writer.commit()?;
-index
-};
-
-let mut segments: Vec<Segment> = Vec::new();
-segments.extend(first_index.searchable_segments()?);
-segments.extend(second_index.searchable_segments()?);
-
-let target_settings = first_index.settings().clone();
-
-let mut demux_mapping = DemuxMapping::default();
-{
-let max_value = 2;
-//segment ordinal 0 mapping
-let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
-doc_id_to_segment.set(0, 1);
-doc_id_to_segment.set(1, 0);
-demux_mapping.add(doc_id_to_segment);
-
-//segment ordinal 1 mapping
-let mut doc_id_to_segment = DocIdToSegmentOrdinal::with_max_doc(max_value);
-doc_id_to_segment.set(0, 1);
-doc_id_to_segment.set(1, 1);
-demux_mapping.add(doc_id_to_segment);
-}
-assert_eq!(demux_mapping.get_old_num_segments(), 2);
-
-let demuxed_indices = demux(
-&segments,
-&demux_mapping,
-target_settings,
-vec![
-Box::new(RamDirectory::default()),
-Box::new(RamDirectory::default()),
-],
-)?;
-
-{
-let index = &demuxed_indices[0];
-
-let segments = index.searchable_segments()?;
-assert_eq!(segments.len(), 1);
-
-let segment_metas = segments[0].meta();
-assert_eq!(segment_metas.num_deleted_docs(), 0);
-assert_eq!(segment_metas.num_docs(), 1);
-
-let searcher = index.reader().unwrap().searcher();
-{
-let text_field = index.schema().get_field("text").unwrap();
-
-let do_search = |term: &str| {
-let query = QueryParser::for_index(&index, vec![text_field])
-.parse_query(term)
-.unwrap();
-let top_docs: Vec<(f32, DocAddress)> =
-searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
-
-top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
-};
-
-assert_eq!(do_search("texto1"), vec![] as Vec<u32>);
-assert_eq!(do_search("texto2"), vec![0]);
-}
-}
-
-{
-let index = &demuxed_indices[1];
-
-let segments = index.searchable_segments()?;
-assert_eq!(segments.len(), 1);
-
-let segment_metas = segments[0].meta();
-assert_eq!(segment_metas.num_deleted_docs(), 0);
-assert_eq!(segment_metas.num_docs(), 3);
-
-let searcher = index.reader().unwrap().searcher();
-{
-let text_field = index.schema().get_field("text").unwrap();
-
-let do_search = |term: &str| {
-let query = QueryParser::for_index(&index, vec![text_field])
-.parse_query(term)
-.unwrap();
-let top_docs: Vec<(f32, DocAddress)> =
-searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
-
-top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
-};
-
-assert_eq!(do_search("texto1"), vec![0]);
-assert_eq!(do_search("texto2"), vec![] as Vec<u32>);
-assert_eq!(do_search("texto3"), vec![1]);
-assert_eq!(do_search("texto4"), vec![2]);
-}
-}
-
-Ok(())
-}
-}
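The deleted module above describes demuxing as a mapping from [old segment ordinal, old doc_id] to a new segment ordinal. A minimal sketch of how a caller would assemble that mapping, using the API exactly as it appears in the removed code and assuming `segments` and `target_settings` are prepared as in the deleted test:

let mut demux_mapping = DemuxMapping::default();

// Old segment 0: doc 0 goes to new segment 1, doc 1 stays in new segment 0.
let mut seg0 = DocIdToSegmentOrdinal::with_max_doc(2);
seg0.set(0, 1);
seg0.set(1, 0);
demux_mapping.add(seg0);

// Old segment 1: both documents are routed to new segment 1.
let mut seg1 = DocIdToSegmentOrdinal::with_max_doc(2);
seg1.set(0, 1);
seg1.set(1, 1);
demux_mapping.add(seg1);

// One output directory per new segment ordinal; demux() writes one index each.
let demuxed_indices = demux(
    &segments,
    &demux_mapping,
    target_settings,
    vec![
        Box::new(RamDirectory::default()),
        Box::new(RamDirectory::default()),
    ],
)?;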
@@ -11,12 +11,12 @@ use std::{cmp::Reverse, ops::Index};

/// Struct to provide mapping from new doc_id to old doc_id and segment.
#[derive(Clone)]
-pub(crate) struct SegmentDocIdMapping {
+pub(crate) struct SegmentDocidMapping {
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
is_trivial: bool,
}

-impl SegmentDocIdMapping {
+impl SegmentDocidMapping {
pub(crate) fn new(
new_doc_id_to_old_and_segment: Vec<(DocId, SegmentOrdinal)>,
is_trivial: bool,
@@ -40,14 +40,14 @@ impl SegmentDocIdMapping {
self.is_trivial
}
}
-impl Index<usize> for SegmentDocIdMapping {
+impl Index<usize> for SegmentDocidMapping {
type Output = (DocId, SegmentOrdinal);

fn index(&self, idx: usize) -> &Self::Output {
&self.new_doc_id_to_old_and_segment[idx]
}
}
-impl IntoIterator for SegmentDocIdMapping {
+impl IntoIterator for SegmentDocidMapping {
type Item = (DocId, SegmentOrdinal);
type IntoIter = std::vec::IntoIter<Self::Item>;

@@ -63,24 +63,6 @@ pub struct DocIdMapping {
}

impl DocIdMapping {
-pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
-let max_doc = new_doc_id_to_old.len();
-let old_max_doc = new_doc_id_to_old
-.iter()
-.cloned()
-.max()
-.map(|n| n + 1)
-.unwrap_or(0);
-let mut old_doc_id_to_new = vec![0; old_max_doc as usize];
-for i in 0..max_doc {
-old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
-}
-DocIdMapping {
-new_doc_id_to_old,
-old_doc_id_to_new,
-}
-}
-
/// returns the new doc_id for the old doc_id
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
self.old_doc_id_to_new[doc_id as usize]
@@ -93,13 +75,6 @@ impl DocIdMapping {
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
self.new_doc_id_to_old.iter().cloned()
}
-/// Remaps a given array to the new doc ids.
-pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
-self.new_doc_id_to_old
-.iter()
-.map(|old_doc| els[*old_doc as usize])
-.collect()
-}
}

pub(crate) fn expect_field_id_for_sort_field(
@@ -147,13 +122,23 @@ pub(crate) fn get_doc_id_mapping_from_field(
.into_iter()
.map(|el| el.0)
.collect::<Vec<_>>();
-Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
+// create old doc_id to new doc_id index (used in posting recorder)
+let max_doc = new_doc_id_to_old.len();
+let mut old_doc_id_to_new = vec![0; max_doc];
+for i in 0..max_doc {
+old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
+}
+let doc_id_map = DocIdMapping {
+new_doc_id_to_old,
+old_doc_id_to_new,
+};
+Ok(doc_id_map)
}

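Both versions of the hunk above compute the inverse of the new-to-old permutation so the posting recorder can look up a document's new id from its old id. A minimal standalone version of that inversion, independent of the DocIdMapping type:

fn invert(new_to_old: &[u32]) -> Vec<u32> {
    // old_to_new[old_id] = new_id for every position in the new ordering.
    let mut old_to_new = vec![0u32; new_to_old.len()];
    for (new_id, &old_id) in new_to_old.iter().enumerate() {
        old_to_new[old_id as usize] = new_id as u32;
    }
    old_to_new
}

fn main() {
    let new_to_old = [2u32, 0, 1];
    assert_eq!(invert(&new_to_old), vec![1, 2, 0]);
}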
#[cfg(test)]
mod tests_indexsorting {
use crate::fastfield::FastFieldReader;
-use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::{collector::TopDocs, query::QueryParser, schema::*};
use crate::{schema::Schema, DocAddress};
use crate::{Index, IndexSettings, IndexSortByField, Order};
@@ -161,7 +146,7 @@ mod tests_indexsorting {
fn create_test_index(
index_settings: Option<IndexSettings>,
text_field_options: TextOptions,
-) -> crate::Result<Index> {
+) -> Index {
let mut schema_builder = Schema::builder();

let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
@@ -181,20 +166,19 @@ mod tests_indexsorting {
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
-let index = index_builder.create_in_ram()?;
+let index = index_builder.create_in_ram().unwrap();

-let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(my_number=>40_u64))?;
-index_writer.add_document(
-doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64),
-)?;
-index_writer.add_document(doc!(my_number=>100_u64))?;
+let mut index_writer = index.writer_for_tests().unwrap();
+index_writer.add_document(doc!(my_number=>40_u64));
+index_writer
+.add_document(doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64));
+index_writer.add_document(doc!(my_number=>100_u64));
index_writer.add_document(
doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"),
-)?;
-index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ))?;
-index_writer.commit()?;
-Ok(index)
+);
+index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ));
+index_writer.commit().unwrap();
+index
}
fn get_text_options() -> TextOptions {
TextOptions::default().set_indexing_options(
@@ -219,7 +203,7 @@ mod tests_indexsorting {
for option in options {
//let options = get_text_options();
// no index_sort
-let index = create_test_index(None, option.clone())?;
+let index = create_test_index(None, option.clone());
let my_text_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher();

@@ -241,7 +225,7 @@ mod tests_indexsorting {
..Default::default()
}),
option.clone(),
-)?;
+);
let my_text_field = index.schema().get_field("text_field").unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -273,7 +257,7 @@ mod tests_indexsorting {
..Default::default()
}),
option.clone(),
-)?;
+);
let my_string_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher();

@@ -303,7 +287,7 @@ mod tests_indexsorting {
#[test]
fn test_sort_index_get_documents() -> crate::Result<()> {
// default baseline
-let index = create_test_index(None, get_text_options())?;
+let index = create_test_index(None, get_text_options());
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
@@ -332,7 +316,7 @@ mod tests_indexsorting {
..Default::default()
}),
get_text_options(),
-)?;
+);
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
@@ -357,7 +341,7 @@ mod tests_indexsorting {
..Default::default()
}),
get_text_options(),
-)?;
+);
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
@@ -372,7 +356,7 @@ mod tests_indexsorting {

#[test]
fn test_sort_index_test_string_field() -> crate::Result<()> {
let index = create_test_index(None, get_text_options())?;
|
let index = create_test_index(None, get_text_options());
|
||||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader()?.searcher();
|
||||||
|
|
||||||
@@ -392,7 +376,7 @@ mod tests_indexsorting {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
}),
|
}),
|
||||||
get_text_options(),
|
get_text_options(),
|
||||||
)?;
|
);
|
||||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||||
let reader = index.reader()?;
|
let reader = index.reader()?;
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
@@ -423,7 +407,7 @@ mod tests_indexsorting {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
}),
|
}),
|
||||||
get_text_options(),
|
get_text_options(),
|
||||||
)?;
|
);
|
||||||
let my_string_field = index.schema().get_field("string_field").unwrap();
|
let my_string_field = index.schema().get_field("string_field").unwrap();
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader()?.searcher();
|
||||||
|
|
||||||
@@ -459,7 +443,7 @@ mod tests_indexsorting {
|
|||||||
..Default::default()
|
..Default::default()
|
||||||
}),
|
}),
|
||||||
get_text_options(),
|
get_text_options(),
|
||||||
)?;
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
index.settings().sort_by_field.as_ref().unwrap().field,
|
index.settings().sort_by_field.as_ref().unwrap().field,
|
||||||
"my_number".to_string()
|
"my_number".to_string()
|
||||||
@@ -490,27 +474,4 @@ mod tests_indexsorting {
|
|||||||
assert_eq!(vals, &[3]);
|
assert_eq!(vals, &[3]);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_doc_mapping() {
|
|
||||||
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![3, 2, 5]);
|
|
||||||
assert_eq!(doc_mapping.get_old_doc_id(0), 3);
|
|
||||||
assert_eq!(doc_mapping.get_old_doc_id(1), 2);
|
|
||||||
assert_eq!(doc_mapping.get_old_doc_id(2), 5);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(0), 0);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(1), 0);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(2), 1);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(3), 0);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(4), 0);
|
|
||||||
assert_eq!(doc_mapping.get_new_doc_id(5), 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_doc_mapping_remap() {
|
|
||||||
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]);
|
|
||||||
assert_eq!(
|
|
||||||
&doc_mapping.remap(&[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]),
|
|
||||||
&[2000, 8000, 3000]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,7 +14,6 @@ use crate::error::TantivyError;
|
|||||||
use crate::fastfield::write_alive_bitset;
|
use crate::fastfield::write_alive_bitset;
|
||||||
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||||
use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||||
use crate::indexer::index_writer_status::IndexWriterStatus;
|
|
||||||
use crate::indexer::operation::DeleteOperation;
|
use crate::indexer::operation::DeleteOperation;
|
||||||
use crate::indexer::stamper::Stamper;
|
use crate::indexer::stamper::Stamper;
|
||||||
use crate::indexer::MergePolicy;
|
use crate::indexer::MergePolicy;
|
||||||
@@ -29,13 +28,16 @@ use crossbeam::channel;
|
|||||||
use futures::executor::block_on;
|
use futures::executor::block_on;
|
||||||
use futures::future::Future;
|
use futures::future::Future;
|
||||||
use smallvec::smallvec;
|
use smallvec::smallvec;
|
||||||
|
use smallvec::SmallVec;
|
||||||
|
use wasm_mt_pool::pool_exec;
|
||||||
|
use wasm_mt::prelude::*;
|
||||||
|
use std::mem;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use wasm_mt_pool::prelude::*;
|
||||||
use std::thread;
|
use std::thread;
|
||||||
use std::thread::JoinHandle;
|
use std::thread::JoinHandle;
|
||||||
|
|
||||||
use super::{AddBatch, AddBatchReceiver, AddBatchSender};
|
|
||||||
|
|
||||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||||
// in the heap goes below MARGIN_IN_BYTES.
|
// in the heap goes below MARGIN_IN_BYTES.
|
||||||
pub const MARGIN_IN_BYTES: usize = 1_000_000;
|
pub const MARGIN_IN_BYTES: usize = 1_000_000;
|
||||||
@@ -51,12 +53,15 @@ pub const MAX_NUM_THREAD: usize = 8;
|
|||||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||||
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||||
|
|
||||||
fn error_in_index_worker_thread(context: &str) -> TantivyError {
|
// Group of operations.
|
||||||
TantivyError::ErrorInThread(format!(
|
// Most of the time, users will send operations one-by-one, but it can be useful to
|
||||||
"{}. A worker thread encounterred an error (io::Error most likely) or panicked.",
|
// send them as a small block to ensure that
|
||||||
context
|
// - all docs in the operation will land in the same segment, with contiguous doc_ids.
|
||||||
))
|
// - all operations in the group are committed at the same time, making the group
|
||||||
}
|
// atomic.
|
||||||
|
type OperationGroup = SmallVec<[AddOperation; 4]>;
|
||||||
|
type OperationSender = channel::Sender<OperationGroup>;
|
||||||
|
type OperationReceiver = channel::Receiver<OperationGroup>;
|
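A hedged sketch of the grouping idea described in the comment above, using the `smallvec` and `crossbeam` crates; `AddOperation` is simplified here to a plain struct rather than the crate's real type:

```rust
use crossbeam::channel;
use smallvec::{smallvec, SmallVec};

#[derive(Debug)]
struct AddOperation {
    opstamp: u64,
    text: String,
}

// A small batch of operations travels through the channel as one unit,
// so its documents end up in the same segment with contiguous doc ids.
type OperationGroup = SmallVec<[AddOperation; 4]>;

fn main() {
    let (sender, receiver) = channel::bounded::<OperationGroup>(16);
    let group: OperationGroup = smallvec![
        AddOperation { opstamp: 1, text: "doc a".into() },
        AddOperation { opstamp: 2, text: "doc b".into() },
    ];
    sender.send(group).unwrap();
    drop(sender);
    for batch in receiver {
        // The worker sees the whole group at once and commits it atomically.
        println!("received a group of {} operations", batch.len());
    }
}
```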
||||||
|
|
||||||
/// `IndexWriter` is the user entry-point to add documents to an index.
|
/// `IndexWriter` is the user entry-point to add documents to an index.
|
||||||
///
|
///
|
||||||
@@ -73,10 +78,10 @@ pub struct IndexWriter {
|
|||||||
|
|
||||||
heap_size_in_bytes_per_thread: usize,
|
heap_size_in_bytes_per_thread: usize,
|
||||||
|
|
||||||
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,
|
workers_join_handle: Vec<JoinHandle<Result<JsValue, JsValue>>>,
|
||||||
|
|
||||||
index_writer_status: IndexWriterStatus,
|
operation_receiver: OperationReceiver,
|
||||||
operation_sender: AddBatchSender,
|
operation_sender: OperationSender,
|
||||||
|
|
||||||
segment_updater: SegmentUpdater,
|
segment_updater: SegmentUpdater,
|
||||||
|
|
||||||
@@ -88,6 +93,8 @@ pub struct IndexWriter {
|
|||||||
|
|
||||||
stamper: Stamper,
|
stamper: Stamper,
|
||||||
committed_opstamp: Opstamp,
|
committed_opstamp: Opstamp,
|
||||||
|
|
||||||
|
worker_pool: wasm_mt_pool::ThreadPool,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn compute_deleted_bitset(
|
fn compute_deleted_bitset(
|
||||||
@@ -162,8 +169,15 @@ pub(crate) fn advance_deletes(
|
|||||||
target_opstamp,
|
target_opstamp,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
// TODO optimize
|
||||||
|
// It should be possible to do something smarter by manipulating bitsets directly
|
||||||
|
// to compute this union.
|
||||||
if let Some(seg_alive_bitset) = segment_reader.alive_bitset() {
|
if let Some(seg_alive_bitset) = segment_reader.alive_bitset() {
|
||||||
alive_bitset.intersect_update(seg_alive_bitset.bitset());
|
for doc in 0u32..max_doc {
|
||||||
|
if seg_alive_bitset.is_deleted(doc) {
|
||||||
|
alive_bitset.remove(doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
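One side of this hunk replaces the per-document loop with a single bitset intersection. A small illustration of why the two are equivalent (word-level AND instead of one `remove` per deleted doc); this is a plain-`u64` sketch, not the crate's `BitSet` API:

```rust
// ANDing the backing words keeps exactly the docs that are alive in both sets.
fn intersect_update(dst: &mut [u64], other: &[u64]) {
    for (d, o) in dst.iter_mut().zip(other) {
        *d &= *o;
    }
}

fn main() {
    let mut alive = vec![0b1111u64]; // docs 0..=3 alive
    let other = vec![0b1011u64];     // doc 2 deleted in the other set
    intersect_update(&mut alive, &other);
    assert_eq!(alive[0], 0b1011);    // doc 2 is now removed here as well
}
```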
||||||
|
|
||||||
let num_alive_docs: u32 = alive_bitset.len() as u32;
|
let num_alive_docs: u32 = alive_bitset.len() as u32;
|
||||||
@@ -183,10 +197,10 @@ pub(crate) fn advance_deletes(
|
|||||||
fn index_documents(
|
fn index_documents(
|
||||||
memory_budget: usize,
|
memory_budget: usize,
|
||||||
segment: Segment,
|
segment: Segment,
|
||||||
grouped_document_iterator: &mut dyn Iterator<Item = AddBatch>,
|
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
|
||||||
segment_updater: &mut SegmentUpdater,
|
segment_updater: &mut SegmentUpdater,
|
||||||
mut delete_cursor: DeleteCursor,
|
mut delete_cursor: DeleteCursor,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<bool> {
|
||||||
let schema = segment.schema();
|
let schema = segment.schema();
|
||||||
|
|
||||||
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
|
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
|
||||||
@@ -205,7 +219,7 @@ fn index_documents(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if !segment_updater.is_alive() {
|
if !segment_updater.is_alive() {
|
||||||
return Ok(());
|
return Ok(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
let max_doc = segment_writer.max_doc();
|
let max_doc = segment_writer.max_doc();
|
||||||
@@ -225,13 +239,13 @@ fn index_documents(
|
|||||||
// update segment_updater inventory to remove tempstore
|
// update segment_updater inventory to remove tempstore
|
||||||
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
|
let segment_entry = SegmentEntry::new(meta, delete_cursor, alive_bitset_opt);
|
||||||
block_on(segment_updater.schedule_add_segment(segment_entry))?;
|
block_on(segment_updater.schedule_add_segment(segment_entry))?;
|
||||||
Ok(())
|
Ok(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// `doc_opstamps` is required to be non-empty.
|
/// `doc_opstamps` is required to be non-empty.
|
||||||
fn apply_deletes(
|
fn apply_deletes(
|
||||||
segment: &Segment,
|
segment: &Segment,
|
||||||
delete_cursor: &mut DeleteCursor,
|
mut delete_cursor: &mut DeleteCursor,
|
||||||
doc_opstamps: &[Opstamp],
|
doc_opstamps: &[Opstamp],
|
||||||
) -> crate::Result<Option<BitSet>> {
|
) -> crate::Result<Option<BitSet>> {
|
||||||
if delete_cursor.get().is_none() {
|
if delete_cursor.get().is_none() {
|
||||||
@@ -254,7 +268,7 @@ fn apply_deletes(
|
|||||||
let may_have_deletes = compute_deleted_bitset(
|
let may_have_deletes = compute_deleted_bitset(
|
||||||
&mut deleted_bitset,
|
&mut deleted_bitset,
|
||||||
&segment_reader,
|
&segment_reader,
|
||||||
delete_cursor,
|
&mut delete_cursor,
|
||||||
&doc_to_opstamps,
|
&doc_to_opstamps,
|
||||||
max_doc_opstamp,
|
max_doc_opstamp,
|
||||||
)?;
|
)?;
|
||||||
@@ -278,7 +292,8 @@ impl IndexWriter {
|
|||||||
/// should work at the same time.
|
/// should work at the same time.
|
||||||
/// # Errors
|
/// # Errors
|
||||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||||
/// If the heap size per thread is too small or too big, returns `TantivyError::InvalidArgument`
|
/// # Panics
|
||||||
|
/// If the heap size per thread is too small, panics.
|
||||||
pub(crate) fn new(
|
pub(crate) fn new(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
num_threads: usize,
|
num_threads: usize,
|
||||||
@@ -296,7 +311,7 @@ impl IndexWriter {
|
|||||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||||
return Err(TantivyError::InvalidArgument(err_msg));
|
return Err(TantivyError::InvalidArgument(err_msg));
|
||||||
}
|
}
|
||||||
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
|
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
|
||||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||||
|
|
||||||
let delete_queue = DeleteQueue::new();
|
let delete_queue = DeleteQueue::new();
|
||||||
@@ -308,13 +323,14 @@ impl IndexWriter {
|
|||||||
let segment_updater =
|
let segment_updater =
|
||||||
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||||
|
|
||||||
|
let worker_pool = block_on(wasm_mt_pool::ThreadPool::new(num_threads, crate::PKG_JS).and_init()).unwrap();
|
||||||
let mut index_writer = IndexWriter {
|
let mut index_writer = IndexWriter {
|
||||||
_directory_lock: Some(directory_lock),
|
_directory_lock: Some(directory_lock),
|
||||||
|
|
||||||
heap_size_in_bytes_per_thread,
|
heap_size_in_bytes_per_thread,
|
||||||
index: index.clone(),
|
index: index.clone(),
|
||||||
|
|
||||||
index_writer_status: IndexWriterStatus::from(document_receiver),
|
operation_receiver: document_receiver,
|
||||||
operation_sender: document_sender,
|
operation_sender: document_sender,
|
||||||
|
|
||||||
segment_updater,
|
segment_updater,
|
||||||
@@ -328,6 +344,7 @@ impl IndexWriter {
|
|||||||
stamper,
|
stamper,
|
||||||
|
|
||||||
worker_id: 0,
|
worker_id: 0,
|
||||||
|
worker_pool,
|
||||||
};
|
};
|
||||||
index_writer.start_workers()?;
|
index_writer.start_workers()?;
|
||||||
Ok(index_writer)
|
Ok(index_writer)
|
||||||
@@ -354,14 +371,16 @@ impl IndexWriter {
|
|||||||
for join_handle in former_workers_handles {
|
for join_handle in former_workers_handles {
|
||||||
join_handle
|
join_handle
|
||||||
.join()
|
.join()
|
||||||
.map_err(|_| error_in_index_worker_thread("Worker thread panicked."))?
|
.expect("Indexing Worker thread panicked")
|
||||||
.map_err(|_| error_in_index_worker_thread("Worker thread failed."))?;
|
.map_err(|_| {
|
||||||
|
TantivyError::ErrorInThread("Error in indexing worker thread.".into())
|
||||||
|
})?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let result = self
|
let result = self
|
||||||
.segment_updater
|
.segment_updater
|
||||||
.wait_merging_thread()
|
.wait_merging_thread()
|
||||||
.map_err(|_| error_in_index_worker_thread("Failed to join merging thread."));
|
.map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
|
||||||
|
|
||||||
if let Err(ref e) = result {
|
if let Err(ref e) = result {
|
||||||
error!("Some merging thread failed {:?}", e);
|
error!("Some merging thread failed {:?}", e);
|
||||||
@@ -389,53 +408,45 @@ impl IndexWriter {
|
|||||||
self.index.new_segment()
|
self.index.new_segment()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn operation_receiver(&self) -> crate::Result<AddBatchReceiver> {
|
|
||||||
self.index_writer_status
|
|
||||||
.operation_receiver()
|
|
||||||
.ok_or_else(|| crate::TantivyError::ErrorInThread("The index writer was killed. It can happen if an indexing worker encountered an io::Error for instance.".to_string()))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Spawns a new worker thread for indexing.
|
/// Spawns a new worker thread for indexing.
|
||||||
/// The thread consumes documents from the pipeline.
|
/// The thread consumes documents from the pipeline.
|
||||||
fn add_indexing_worker(&mut self) -> crate::Result<()> {
|
fn add_indexing_worker(&mut self) -> crate::Result<()> {
|
||||||
let document_receiver_clone = self.operation_receiver()?;
|
let document_receiver_clone = self.operation_receiver.clone();
|
||||||
let index_writer_bomb = self.index_writer_status.create_bomb();
|
|
||||||
|
|
||||||
let mut segment_updater = self.segment_updater.clone();
|
let mut segment_updater = self.segment_updater.clone();
|
||||||
|
|
||||||
let mut delete_cursor = self.delete_queue.cursor();
|
let mut delete_cursor = self.delete_queue.cursor();
|
||||||
|
|
||||||
let mem_budget = self.heap_size_in_bytes_per_thread;
|
let mem_budget = self.heap_size_in_bytes_per_thread;
|
||||||
let index = self.index.clone();
|
let index = self.index.clone();
|
||||||
let join_handle: JoinHandle<crate::Result<()>> = thread::Builder::new()
|
let join_handle: JoinHandle<crate::Result<_>> = pool_exec!(self.worker_pool,
|
||||||
.name(format!("thrd-tantivy-index{}", self.worker_id))
|
move || {
|
||||||
.spawn(move || {
|
|
||||||
loop {
|
loop {
|
||||||
let mut document_iterator = document_receiver_clone
|
let mut document_iterator =
|
||||||
.clone()
|
document_receiver_clone.clone().into_iter().peekable();
|
||||||
.into_iter()
|
|
||||||
.filter(|batch| !batch.is_empty())
|
|
||||||
.peekable();
|
|
||||||
|
|
||||||
// The peeking here is to avoid creating a new segment's files
|
// the peeking here is to avoid
|
||||||
|
// creating a new segment's files
|
||||||
// if no documents are available.
|
// if no documents are available.
|
||||||
//
|
//
|
||||||
// This is a valid guarantee as the peeked document now belongs to
|
// this is a valid guarantee as the
|
||||||
|
// peeked document now belongs to
|
||||||
// our local iterator.
|
// our local iterator.
|
||||||
if let Some(batch) = document_iterator.peek() {
|
if let Some(operations) = document_iterator.peek() {
|
||||||
assert!(!batch.is_empty());
|
if let Some(first) = operations.first() {
|
||||||
delete_cursor.skip_to(batch[0].opstamp);
|
delete_cursor.skip_to(first.opstamp);
|
||||||
|
} else {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// No more documents.
|
// No more documents.
|
||||||
// It happens when there is a commit, or if the `IndexWriter`
|
// Happens when there is a commit, or if the `IndexWriter`
|
||||||
// was dropped.
|
// was dropped.
|
||||||
index_writer_bomb.defuse();
|
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
let segment = index.new_segment();
|
||||||
index_documents(
|
index_documents(
|
||||||
mem_budget,
|
mem_budget,
|
||||||
index.new_segment(),
|
segment,
|
||||||
&mut document_iterator,
|
&mut document_iterator,
|
||||||
&mut segment_updater,
|
&mut segment_updater,
|
||||||
delete_cursor.clone(),
|
delete_cursor.clone(),
|
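Both sides of this hunk spawn a worker that peeks the operation channel before opening a segment. A standalone sketch of that pattern with `std::sync::mpsc` (illustrative only; the real worker also applies deletes and reports to the segment updater):

```rust
use std::sync::mpsc::channel;
use std::thread;

fn main() {
    let (sender, receiver) = channel::<Vec<&'static str>>();
    sender.send(vec!["doc a", "doc b"]).unwrap();
    drop(sender); // closing the channel lets the worker terminate

    let handle = thread::Builder::new()
        .name("thrd-tantivy-index0".into())
        .spawn(move || {
            let mut batches = receiver.into_iter().peekable();
            // Peek first: a segment is only "created" if a batch is available.
            while batches.peek().is_some() {
                let batch = batches.next().unwrap();
                println!("indexing {} docs into a fresh segment", batch.len());
            }
        })
        .unwrap();
    handle.join().unwrap();
}
```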
||||||
@@ -465,8 +476,10 @@ impl IndexWriter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Detects and removes the files that are not used by the index anymore.
|
/// Detects and removes the files that are not used by the index anymore.
|
||||||
pub async fn garbage_collect_files(&self) -> crate::Result<GarbageCollectionResult> {
|
pub fn garbage_collect_files(
|
||||||
self.segment_updater.schedule_garbage_collect().await
|
&self,
|
||||||
|
) -> impl Future<Output = crate::Result<GarbageCollectionResult>> {
|
||||||
|
self.segment_updater.schedule_garbage_collect()
|
||||||
}
|
}
|
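The change above is only in how the future is expressed: an `async fn` on one side, a method returning `impl Future` on the other. A tiny sketch (using the `futures` crate; names are illustrative) showing that callers drive both the same way:

```rust
use futures::executor::block_on;
use std::future::Future;

async fn gc_async() -> usize {
    3 // e.g. number of files removed
}

fn gc_future() -> impl Future<Output = usize> {
    async { 3 }
}

fn main() {
    assert_eq!(block_on(gc_async()), 3);
    assert_eq!(block_on(gc_future()), 3);
}
```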
||||||
|
|
||||||
/// Deletes all documents from the index
|
/// Deletes all documents from the index
|
||||||
@@ -489,7 +502,7 @@ impl IndexWriter {
|
|||||||
/// let index = Index::create_in_ram(schema.clone());
|
/// let index = Index::create_in_ram(schema.clone());
|
||||||
///
|
///
|
||||||
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
|
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
|
||||||
/// index_writer.add_document(doc!(title => "The modern Promotheus"))?;
|
/// index_writer.add_document(doc!(title => "The modern Promotheus"));
|
||||||
/// index_writer.commit()?;
|
/// index_writer.commit()?;
|
||||||
///
|
///
|
||||||
/// let clear_res = index_writer.delete_all_documents().unwrap();
|
/// let clear_res = index_writer.delete_all_documents().unwrap();
|
||||||
@@ -533,11 +546,12 @@ impl IndexWriter {
|
|||||||
/// when no documents are remaining.
|
/// when no documents are remaining.
|
||||||
///
|
///
|
||||||
/// Returns the former segment_ready channel.
|
/// Returns the former segment_ready channel.
|
||||||
fn recreate_document_channel(&mut self) {
|
#[allow(unused_must_use)]
|
||||||
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
|
fn recreate_document_channel(&mut self) -> OperationReceiver {
|
||||||
|
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
|
||||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||||
self.operation_sender = document_sender;
|
mem::replace(&mut self.operation_sender, document_sender);
|
||||||
self.index_writer_status = IndexWriterStatus::from(document_receiver);
|
mem::replace(&mut self.operation_receiver, document_receiver)
|
||||||
}
|
}
|
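One side of this hunk uses `mem::replace` to swap in a fresh channel while handing the previous receiver back to the caller, so any in-flight documents can still be drained. A small standalone illustration with `std::sync::mpsc`:

```rust
use std::mem;
use std::sync::mpsc::channel;

fn main() {
    let (old_sender, old_receiver) = channel::<u32>();
    let (_new_sender, new_receiver) = channel::<u32>();

    old_sender.send(7).unwrap();
    drop(old_sender); // close the old channel so draining terminates

    let mut current = old_receiver;
    // Swap in the new receiver; `previous` is the former one, still holding 7.
    let previous = mem::replace(&mut current, new_receiver);
    assert_eq!(previous.into_iter().collect::<Vec<_>>(), vec![7]);
}
```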
||||||
|
|
||||||
/// Rollback to the last commit
|
/// Rollback to the last commit
|
||||||
@@ -553,7 +567,7 @@ impl IndexWriter {
|
|||||||
// marks the segment updater as killed. From now on, all
|
// marks the segment updater as killed. From now on, all
|
||||||
// segment updates will be ignored.
|
// segment updates will be ignored.
|
||||||
self.segment_updater.kill();
|
self.segment_updater.kill();
|
||||||
let document_receiver_res = self.operation_receiver();
|
let document_receiver = self.operation_receiver.clone();
|
||||||
|
|
||||||
// take the directory lock to create a new index_writer.
|
// take the directory lock to create a new index_writer.
|
||||||
let directory_lock = self
|
let directory_lock = self
|
||||||
@@ -579,9 +593,7 @@ impl IndexWriter {
|
|||||||
//
|
//
|
||||||
// This will reach an end as the only document_sender
|
// This will reach an end as the only document_sender
|
||||||
// was dropped with the index_writer.
|
// was dropped with the index_writer.
|
||||||
if let Ok(document_receiver) = document_receiver_res {
|
for _ in document_receiver {}
|
||||||
for _ in document_receiver {}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(self.committed_opstamp)
|
Ok(self.committed_opstamp)
|
||||||
}
|
}
|
||||||
@@ -695,10 +707,14 @@ impl IndexWriter {
|
|||||||
/// The opstamp is an increasing `u64` that can
|
/// The opstamp is an increasing `u64` that can
|
||||||
/// be used by the client to align commits with its own
|
/// be used by the client to align commits with its own
|
||||||
/// document queue.
|
/// document queue.
|
||||||
pub fn add_document(&self, document: Document) -> crate::Result<Opstamp> {
|
pub fn add_document(&self, document: Document) -> Opstamp {
|
||||||
let opstamp = self.stamper.stamp();
|
let opstamp = self.stamper.stamp();
|
||||||
self.send_add_documents_batch(smallvec![AddOperation { opstamp, document }])?;
|
let add_operation = AddOperation { opstamp, document };
|
||||||
Ok(opstamp)
|
let send_result = self.operation_sender.send(smallvec![add_operation]);
|
||||||
|
if let Err(e) = send_result {
|
||||||
|
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
|
||||||
|
}
|
||||||
|
opstamp
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Gets a range of stamps from the stamper and "pops" the last stamp
|
/// Gets a range of stamps from the stamper and "pops" the last stamp
|
||||||
@@ -711,7 +727,11 @@ impl IndexWriter {
|
|||||||
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
|
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
|
||||||
let Range { start, end } = self.stamper.stamps(count + 1u64);
|
let Range { start, end } = self.stamper.stamps(count + 1u64);
|
||||||
let last_opstamp = end - 1;
|
let last_opstamp = end - 1;
|
||||||
(last_opstamp, start..last_opstamp)
|
let stamps = Range {
|
||||||
|
start,
|
||||||
|
end: last_opstamp,
|
||||||
|
};
|
||||||
|
(last_opstamp, stamps)
|
||||||
}
|
}
|
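A worked illustration of the stamp bookkeeping above: for `count` operations, `count + 1` stamps are reserved, the first `count` go to the individual operations, and the extra, largest one becomes the batch opstamp (hypothetical free function, not the writer's actual method):

```rust
use std::ops::Range;

fn batch_opstamps(next_stamp: u64, count: u64) -> (u64, Range<u64>) {
    let start = next_stamp;
    let end = next_stamp + count + 1; // reserve count + 1 stamps
    let last_opstamp = end - 1;       // "pop" the last stamp for the batch
    (last_opstamp, start..last_opstamp)
}

fn main() {
    let (batch_opstamp, stamps) = batch_opstamps(10, 3);
    assert_eq!(stamps.collect::<Vec<_>>(), vec![10, 11, 12]); // per-operation stamps
    assert_eq!(batch_opstamp, 13);
}
```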
||||||
|
|
||||||
/// Runs a group of document operations ensuring that the operations are
|
/// Runs a group of document operations ensuring that the operations are
|
||||||
@@ -730,20 +750,16 @@ impl IndexWriter {
|
|||||||
/// Like adds and deletes (see `IndexWriter.add_document` and
|
/// Like adds and deletes (see `IndexWriter.add_document` and
|
||||||
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
|
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
|
||||||
/// visible to readers only after calling `commit()`.
|
/// visible to readers only after calling `commit()`.
|
||||||
pub fn run<I>(&self, user_operations: I) -> crate::Result<Opstamp>
|
pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
|
||||||
where
|
let count = user_operations.len() as u64;
|
||||||
I: IntoIterator<Item = UserOperation>,
|
|
||||||
I::IntoIter: ExactSizeIterator,
|
|
||||||
{
|
|
||||||
let user_operations_it = user_operations.into_iter();
|
|
||||||
let count = user_operations_it.len() as u64;
|
|
||||||
if count == 0 {
|
if count == 0 {
|
||||||
return Ok(self.stamper.stamp());
|
return self.stamper.stamp();
|
||||||
}
|
}
|
||||||
let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
|
let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
|
||||||
|
|
||||||
let mut adds = AddBatch::default();
|
let mut adds = OperationGroup::default();
|
||||||
for (user_op, opstamp) in user_operations_it.zip(stamps) {
|
|
||||||
|
for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
|
||||||
match user_op {
|
match user_op {
|
||||||
UserOperation::Delete(term) => {
|
UserOperation::Delete(term) => {
|
||||||
let delete_operation = DeleteOperation { opstamp, term };
|
let delete_operation = DeleteOperation { opstamp, term };
|
||||||
@@ -755,16 +771,12 @@ impl IndexWriter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
self.send_add_documents_batch(adds)?;
|
let send_result = self.operation_sender.send(adds);
|
||||||
Ok(batch_opstamp)
|
if let Err(e) = send_result {
|
||||||
}
|
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
|
||||||
|
};
|
||||||
|
|
||||||
fn send_add_documents_batch(&self, add_ops: AddBatch) -> crate::Result<()> {
|
batch_opstamp
|
||||||
if self.index_writer_status.is_alive() && self.operation_sender.send(add_ops).is_ok() {
|
|
||||||
Ok(())
|
|
||||||
} else {
|
|
||||||
Err(error_in_index_worker_thread("An index writer was killed."))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -798,7 +810,6 @@ mod tests {
|
|||||||
use crate::query::TermQuery;
|
use crate::query::TermQuery;
|
||||||
use crate::schema::Cardinality;
|
use crate::schema::Cardinality;
|
||||||
use crate::schema::Facet;
|
use crate::schema::Facet;
|
||||||
use crate::schema::FacetOptions;
|
|
||||||
use crate::schema::IntOptions;
|
use crate::schema::IntOptions;
|
||||||
use crate::schema::TextFieldIndexing;
|
use crate::schema::TextFieldIndexing;
|
||||||
use crate::schema::TextOptions;
|
use crate::schema::TextOptions;
|
||||||
@@ -831,7 +842,7 @@ mod tests {
|
|||||||
UserOperation::Add(doc!(text_field=>"a")),
|
UserOperation::Add(doc!(text_field=>"a")),
|
||||||
UserOperation::Add(doc!(text_field=>"b")),
|
UserOperation::Add(doc!(text_field=>"b")),
|
||||||
];
|
];
|
||||||
let batch_opstamp1 = index_writer.run(operations).unwrap();
|
let batch_opstamp1 = index_writer.run(operations);
|
||||||
assert_eq!(batch_opstamp1, 2u64);
|
assert_eq!(batch_opstamp1, 2u64);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -842,12 +853,8 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
|
||||||
let mut index_writer = index.writer_for_tests().unwrap();
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
index_writer
|
index_writer.add_document(doc!(text_field => "hello1"));
|
||||||
.add_document(doc!(text_field => "hello1"))
|
index_writer.add_document(doc!(text_field => "hello2"));
|
||||||
.unwrap();
|
|
||||||
index_writer
|
|
||||||
.add_document(doc!(text_field => "hello2"))
|
|
||||||
.unwrap();
|
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
let reader = index.reader().unwrap();
|
let reader = index.reader().unwrap();
|
||||||
@@ -904,7 +911,7 @@ mod tests {
|
|||||||
UserOperation::Delete(b_term),
|
UserOperation::Delete(b_term),
|
||||||
];
|
];
|
||||||
|
|
||||||
index_writer.run(operations).unwrap();
|
index_writer.run(operations);
|
||||||
index_writer.commit().expect("failed to commit");
|
index_writer.commit().expect("failed to commit");
|
||||||
reader.reload().expect("failed to load searchers");
|
reader.reload().expect("failed to load searchers");
|
||||||
|
|
||||||
@@ -934,10 +941,10 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let index_writer = index.writer(3_000_000).unwrap();
|
let index_writer = index.writer(3_000_000).unwrap();
|
||||||
let operations1 = vec![];
|
let operations1 = vec![];
|
||||||
let batch_opstamp1 = index_writer.run(operations1).unwrap();
|
let batch_opstamp1 = index_writer.run(operations1);
|
||||||
assert_eq!(batch_opstamp1, 0u64);
|
assert_eq!(batch_opstamp1, 0u64);
|
||||||
let operations2 = vec![];
|
let operations2 = vec![];
|
||||||
let batch_opstamp2 = index_writer.run(operations2).unwrap();
|
let batch_opstamp2 = index_writer.run(operations2);
|
||||||
assert_eq!(batch_opstamp2, 1u64);
|
assert_eq!(batch_opstamp2, 1u64);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -974,7 +981,7 @@ mod tests {
|
|||||||
assert_eq!(
|
assert_eq!(
|
||||||
format!("{:?}", index_writer.get_merge_policy()),
|
format!("{:?}", index_writer.get_merge_policy()),
|
||||||
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
|
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
|
||||||
level_log_size: 0.75, del_docs_ratio_before_merge: 1.0 }"
|
level_log_size: 0.75 }"
|
||||||
);
|
);
|
||||||
let merge_policy = Box::new(NoMergePolicy::default());
|
let merge_policy = Box::new(NoMergePolicy::default());
|
||||||
index_writer.set_merge_policy(merge_policy);
|
index_writer.set_merge_policy(merge_policy);
|
||||||
@@ -997,14 +1004,15 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_commit_and_rollback() -> crate::Result<()> {
|
fn test_commit_and_rollback() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::Schema::builder();
|
||||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let reader = index
|
let reader = index
|
||||||
.reader_builder()
|
.reader_builder()
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
.reload_policy(ReloadPolicy::Manual)
|
||||||
.try_into()?;
|
.try_into()
|
||||||
|
.unwrap();
|
||||||
let num_docs_containing = |s: &str| {
|
let num_docs_containing = |s: &str| {
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
let term = Term::from_field_text(text_field, s);
|
let term = Term::from_field_text(text_field, s);
|
||||||
@@ -1013,127 +1021,136 @@ mod tests {
|
|||||||
|
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer(3_000_000)?;
|
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"a"))?;
|
index_writer.add_document(doc!(text_field=>"a"));
|
||||||
index_writer.rollback()?;
|
index_writer.rollback().unwrap();
|
||||||
assert_eq!(index_writer.commit_opstamp(), 0u64);
|
assert_eq!(index_writer.commit_opstamp(), 0u64);
|
||||||
assert_eq!(num_docs_containing("a"), 0);
|
assert_eq!(num_docs_containing("a"), 0);
|
||||||
index_writer.add_document(doc!(text_field=>"b"))?;
|
{
|
||||||
index_writer.add_document(doc!(text_field=>"c"))?;
|
index_writer.add_document(doc!(text_field=>"b"));
|
||||||
index_writer.commit()?;
|
index_writer.add_document(doc!(text_field=>"c"));
|
||||||
reader.reload()?;
|
}
|
||||||
|
assert!(index_writer.commit().is_ok());
|
||||||
|
reader.reload().unwrap();
|
||||||
assert_eq!(num_docs_containing("a"), 0);
|
assert_eq!(num_docs_containing("a"), 0);
|
||||||
assert_eq!(num_docs_containing("b"), 1);
|
assert_eq!(num_docs_containing("b"), 1);
|
||||||
assert_eq!(num_docs_containing("c"), 1);
|
assert_eq!(num_docs_containing("c"), 1);
|
||||||
}
|
}
|
||||||
reader.reload()?;
|
reader.reload().unwrap();
|
||||||
reader.searcher();
|
reader.searcher();
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_with_merges() -> crate::Result<()> {
|
fn test_with_merges() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::Schema::builder();
|
||||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let reader = index
|
let reader = index
|
||||||
.reader_builder()
|
.reader_builder()
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
.reload_policy(ReloadPolicy::Manual)
|
||||||
.try_into()?;
|
.try_into()
|
||||||
|
.unwrap();
|
||||||
let num_docs_containing = |s: &str| {
|
let num_docs_containing = |s: &str| {
|
||||||
let term_a = Term::from_field_text(text_field, s);
|
let term_a = Term::from_field_text(text_field, s);
|
||||||
reader.searcher().doc_freq(&term_a).unwrap()
|
reader.searcher().doc_freq(&term_a).unwrap()
|
||||||
};
|
};
|
||||||
// writing the segment
|
{
|
||||||
let mut index_writer = index.writer(12_000_000).unwrap();
|
// writing the segment
|
||||||
// create 8 segments with 100 tiny docs
|
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||||
for _doc in 0..100 {
|
// create 8 segments with 100 tiny docs
|
||||||
index_writer.add_document(doc!(text_field=>"a"))?;
|
for _doc in 0..100 {
|
||||||
|
index_writer.add_document(doc!(text_field=>"a"));
|
||||||
|
}
|
||||||
|
index_writer.commit().expect("commit failed");
|
||||||
|
for _doc in 0..100 {
|
||||||
|
index_writer.add_document(doc!(text_field=>"a"));
|
||||||
|
}
|
||||||
|
// this should create 8 segments and trigger a merge.
|
||||||
|
index_writer.commit().expect("commit failed");
|
||||||
|
index_writer
|
||||||
|
.wait_merging_threads()
|
||||||
|
.expect("waiting merging thread failed");
|
||||||
|
|
||||||
|
reader.reload().unwrap();
|
||||||
|
|
||||||
|
assert_eq!(num_docs_containing("a"), 200);
|
||||||
|
assert!(index.searchable_segments().unwrap().len() < 8);
|
||||||
}
|
}
|
||||||
index_writer.commit()?;
|
|
||||||
for _doc in 0..100 {
|
|
||||||
index_writer.add_document(doc!(text_field=>"a"))?;
|
|
||||||
}
|
|
||||||
// this should create 8 segments and trigger a merge.
|
|
||||||
index_writer.commit()?;
|
|
||||||
index_writer.wait_merging_threads()?;
|
|
||||||
reader.reload()?;
|
|
||||||
assert_eq!(num_docs_containing("a"), 200);
|
|
||||||
assert!(index.searchable_segments()?.len() < 8);
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_prepare_with_commit_message() -> crate::Result<()> {
|
fn test_prepare_with_commit_message() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
|
||||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
|
||||||
|
|
||||||
// writing the segment
|
|
||||||
let mut index_writer = index.writer(12_000_000)?;
|
|
||||||
// create 8 segments with 100 tiny docs
|
|
||||||
for _doc in 0..100 {
|
|
||||||
index_writer.add_document(doc!(text_field => "a"))?;
|
|
||||||
}
|
|
||||||
{
|
|
||||||
let mut prepared_commit = index_writer.prepare_commit()?;
|
|
||||||
prepared_commit.set_payload("first commit");
|
|
||||||
prepared_commit.commit()?;
|
|
||||||
}
|
|
||||||
{
|
|
||||||
let metas = index.load_metas()?;
|
|
||||||
assert_eq!(metas.payload.unwrap(), "first commit");
|
|
||||||
}
|
|
||||||
for _doc in 0..100 {
|
|
||||||
index_writer.add_document(doc!(text_field => "a"))?;
|
|
||||||
}
|
|
||||||
index_writer.commit()?;
|
|
||||||
{
|
|
||||||
let metas = index.load_metas()?;
|
|
||||||
assert!(metas.payload.is_none());
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_prepare_but_rollback() -> crate::Result<()> {
|
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::Schema::builder();
|
||||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000)?;
|
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||||
// create 8 segments with 100 tiny docs
|
// create 8 segments with 100 tiny docs
|
||||||
for _doc in 0..100 {
|
for _doc in 0..100 {
|
||||||
index_writer.add_document(doc!(text_field => "a"))?;
|
index_writer.add_document(doc!(text_field => "a"));
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let mut prepared_commit = index_writer.prepare_commit()?;
|
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
||||||
prepared_commit.set_payload("first commit");
|
prepared_commit.set_payload("first commit");
|
||||||
prepared_commit.abort()?;
|
prepared_commit.commit().expect("commit failed");
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let metas = index.load_metas()?;
|
let metas = index.load_metas().unwrap();
|
||||||
|
assert_eq!(metas.payload.unwrap(), "first commit");
|
||||||
|
}
|
||||||
|
for _doc in 0..100 {
|
||||||
|
index_writer.add_document(doc!(text_field => "a"));
|
||||||
|
}
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
{
|
||||||
|
let metas = index.load_metas().unwrap();
|
||||||
|
assert!(metas.payload.is_none());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_prepare_but_rollback() {
|
||||||
|
let mut schema_builder = schema::Schema::builder();
|
||||||
|
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||||
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
|
||||||
|
{
|
||||||
|
// writing the segment
|
||||||
|
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||||
|
// create 8 segments with 100 tiny docs
|
||||||
|
for _doc in 0..100 {
|
||||||
|
index_writer.add_document(doc!(text_field => "a"));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
|
||||||
|
prepared_commit.set_payload("first commit");
|
||||||
|
prepared_commit.abort().expect("commit failed");
|
||||||
|
}
|
||||||
|
{
|
||||||
|
let metas = index.load_metas().unwrap();
|
||||||
assert!(metas.payload.is_none());
|
assert!(metas.payload.is_none());
|
||||||
}
|
}
|
||||||
for _doc in 0..100 {
|
for _doc in 0..100 {
|
||||||
index_writer.add_document(doc!(text_field => "b"))?;
|
index_writer.add_document(doc!(text_field => "b"));
|
||||||
}
|
}
|
||||||
index_writer.commit()?;
|
index_writer.commit().unwrap();
|
||||||
}
|
}
|
||||||
let num_docs_containing = |s: &str| {
|
let num_docs_containing = |s: &str| {
|
||||||
let term_a = Term::from_field_text(text_field, s);
|
let term_a = Term::from_field_text(text_field, s);
|
||||||
index
|
index
|
||||||
.reader_builder()
|
.reader_builder()
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
.reload_policy(ReloadPolicy::Manual)
|
||||||
.try_into()?
|
.try_into()
|
||||||
|
.unwrap()
|
||||||
.searcher()
|
.searcher()
|
||||||
.doc_freq(&term_a)
|
.doc_freq(&term_a)
|
||||||
|
.unwrap()
|
||||||
};
|
};
|
||||||
assert_eq!(num_docs_containing("a")?, 0);
|
assert_eq!(num_docs_containing("a"), 0);
|
||||||
assert_eq!(num_docs_containing("b")?, 100);
|
assert_eq!(num_docs_containing("b"), 100);
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -1154,7 +1171,7 @@ mod tests {
|
|||||||
};
|
};
|
||||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||||
|
|
||||||
let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap();
|
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
|
||||||
let commit_tstamp = index_writer.commit().unwrap();
|
let commit_tstamp = index_writer.commit().unwrap();
|
||||||
assert!(commit_tstamp > add_tstamp);
|
assert!(commit_tstamp > add_tstamp);
|
||||||
index_writer.delete_all_documents().unwrap();
|
index_writer.delete_all_documents().unwrap();
|
||||||
@@ -1171,7 +1188,7 @@ mod tests {
|
|||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||||
|
|
||||||
let add_tstamp = index_writer.add_document(doc!(text_field => "a")).unwrap();
|
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
|
||||||
|
|
||||||
// commit documents - they are now available
|
// commit documents - they are now available
|
||||||
let first_commit = index_writer.commit();
|
let first_commit = index_writer.commit();
|
||||||
@@ -1190,7 +1207,7 @@ mod tests {
|
|||||||
|
|
||||||
// add new documents again
|
// add new documents again
|
||||||
for _ in 0..100 {
|
for _ in 0..100 {
|
||||||
index_writer.add_document(doc!(text_field => "b")).unwrap();
|
index_writer.add_document(doc!(text_field => "b"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// rollback to last commit, when index was empty
|
// rollback to last commit, when index was empty
|
||||||
@@ -1224,7 +1241,7 @@ mod tests {
|
|||||||
|
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
// add one simple doc
|
// add one simple doc
|
||||||
index_writer.add_document(doc!(text_field => "a")).unwrap();
|
index_writer.add_document(doc!(text_field => "a"));
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
let term_a = Term::from_field_text(text_field, "a");
|
let term_a = Term::from_field_text(text_field, "a");
|
||||||
@@ -1248,7 +1265,7 @@ mod tests {
|
|||||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||||
|
|
||||||
// add one simple doc
|
// add one simple doc
|
||||||
assert!(index_writer.add_document(doc!(text_field => "a")).is_ok());
|
index_writer.add_document(doc!(text_field => "a"));
|
||||||
let comm = index_writer.commit();
|
let comm = index_writer.commit();
|
||||||
assert!(comm.is_ok());
|
assert!(comm.is_ok());
|
||||||
let commit_tstamp = comm.unwrap();
|
let commit_tstamp = comm.unwrap();
|
||||||
@@ -1324,13 +1341,13 @@ mod tests {
|
|||||||
|
|
||||||
// create and delete docs in same commit
|
// create and delete docs in same commit
|
||||||
for id in 0u64..5u64 {
|
for id in 0u64..5u64 {
|
||||||
index_writer.add_document(doc!(id_field => id))?;
|
index_writer.add_document(doc!(id_field => id));
|
||||||
}
|
}
|
||||||
for id in 2u64..4u64 {
|
for id in 2u64..4u64 {
|
||||||
index_writer.delete_term(Term::from_field_u64(id_field, id));
|
index_writer.delete_term(Term::from_field_u64(id_field, id));
|
||||||
}
|
}
|
||||||
for id in 5u64..10u64 {
|
for id in 5u64..10u64 {
|
||||||
index_writer.add_document(doc!(id_field => id))?;
|
index_writer.add_document(doc!(id_field => id));
|
||||||
}
|
}
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
index_reader.reload()?;
|
index_reader.reload()?;
|
||||||
@@ -1358,24 +1375,15 @@ mod tests {
|
|||||||
Merge,
|
Merge,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
|
fn operation_strategy() -> impl Strategy<Value = IndexingOp> {
|
||||||
prop_oneof![
|
prop_oneof![
|
||||||
(0u64..20u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
|
(0u64..10u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
|
||||||
(0u64..20u64).prop_map(|id| IndexingOp::AddDoc { id }),
|
(0u64..10u64).prop_map(|id| IndexingOp::AddDoc { id }),
|
||||||
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
|
(0u64..2u64).prop_map(|_| IndexingOp::Commit),
|
||||||
(0u64..1u64).prop_map(|_| IndexingOp::Merge),
|
(0u64..1u64).prop_map(|_| IndexingOp::Merge),
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn adding_operation_strategy() -> impl Strategy<Value = IndexingOp> {
|
|
||||||
prop_oneof![
|
|
||||||
10 => (0u64..100u64).prop_map(|id| IndexingOp::DeleteDoc { id }),
|
|
||||||
50 => (0u64..100u64).prop_map(|id| IndexingOp::AddDoc { id }),
|
|
||||||
2 => (0u64..1u64).prop_map(|_| IndexingOp::Commit),
|
|
||||||
1 => (0u64..1u64).prop_map(|_| IndexingOp::Merge),
|
|
||||||
]
|
|
||||||
}
|
|
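The new `adding_operation_strategy` uses weighted `prop_oneof!` arms so that additions dominate the generated workloads. A hedged, self-contained sketch of that pattern with the `proptest` crate (simplified `Op` enum, not the test's `IndexingOp`):

```rust
use proptest::prelude::*;

#[derive(Debug, Clone, Copy)]
enum Op {
    Add(u64),
    Delete(u64),
}

// Additions are drawn five times more often than deletions.
fn op_strategy() -> impl Strategy<Value = Op> {
    prop_oneof![
        5 => (0u64..100).prop_map(Op::Add),
        1 => (0u64..100).prop_map(Op::Delete),
    ]
}

proptest! {
    #[test]
    fn generated_ids_stay_in_range(ops in proptest::collection::vec(op_strategy(), 1..50)) {
        for op in ops {
            match op {
                Op::Add(id) | Op::Delete(id) => prop_assert!(id < 100),
            }
        }
    }
}
```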
||||||
|
|
||||||
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
|
fn expected_ids(ops: &[IndexingOp]) -> (HashMap<u64, u64>, HashSet<u64>) {
|
||||||
let mut existing_ids = HashMap::new();
|
let mut existing_ids = HashMap::new();
|
||||||
let mut deleted_ids = HashSet::new();
|
let mut deleted_ids = HashSet::new();
|
||||||
@@ -1420,7 +1428,7 @@ mod tests {
|
|||||||
.set_fast(Cardinality::MultiValues)
|
.set_fast(Cardinality::MultiValues)
|
||||||
.set_stored(),
|
.set_stored(),
|
||||||
);
|
);
|
||||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let settings = if sort_index {
|
let settings = if sort_index {
|
||||||
IndexSettings {
|
IndexSettings {
|
||||||
@@ -1442,14 +1450,12 @@ mod tests {
|
|||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests()?;
|
||||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||||
|
|
||||||
let old_reader = index.reader()?;
|
|
||||||
|
|
||||||
for &op in ops {
|
for &op in ops {
|
||||||
match op {
|
match op {
|
||||||
IndexingOp::AddDoc { id } => {
|
IndexingOp::AddDoc { id } => {
|
||||||
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
|
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
|
||||||
index_writer
|
index_writer
|
||||||
.add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM))?;
|
.add_document(doc!(id_field=>id, multi_numbers=> id, multi_numbers => id, text_field => id.to_string(), facet_field => facet, large_text_field=> LOREM));
|
||||||
}
|
}
|
||||||
IndexingOp::DeleteDoc { id } => {
|
IndexingOp::DeleteDoc { id } => {
|
||||||
index_writer.delete_term(Term::from_field_u64(id_field, id));
|
index_writer.delete_term(Term::from_field_u64(id_field, id));
|
||||||
@@ -1482,21 +1488,6 @@ mod tests {
|
|||||||
assert!(index_writer.wait_merging_threads().is_ok());
|
assert!(index_writer.wait_merging_threads().is_ok());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
old_reader.reload()?;
|
|
||||||
let old_searcher = old_reader.searcher();
|
|
||||||
|
|
||||||
let ids_old_searcher: HashSet<u64> = old_searcher
|
|
||||||
.segment_readers()
|
|
||||||
.iter()
|
|
||||||
.flat_map(|segment_reader| {
|
|
||||||
let ff_reader = segment_reader.fast_fields().u64(id_field).unwrap();
|
|
||||||
segment_reader
|
|
||||||
.doc_ids_alive()
|
|
||||||
.map(move |doc| ff_reader.get(doc))
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let ids: HashSet<u64> = searcher
|
let ids: HashSet<u64> = searcher
|
||||||
.segment_readers()
|
.segment_readers()
|
||||||
.iter()
|
.iter()
|
||||||
@@ -1509,19 +1500,6 @@ mod tests {
|
|||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops);
|
let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops);
|
||||||
let num_docs_expected = expected_ids_and_num_occurences
|
|
||||||
.iter()
|
|
||||||
.map(|(_, id_occurences)| *id_occurences as usize)
|
|
||||||
.sum::<usize>();
|
|
||||||
assert_eq!(searcher.num_docs() as usize, num_docs_expected);
|
|
||||||
assert_eq!(old_searcher.num_docs() as usize, num_docs_expected);
|
|
||||||
assert_eq!(
|
|
||||||
ids_old_searcher,
|
|
||||||
expected_ids_and_num_occurences
|
|
||||||
.keys()
|
|
||||||
.cloned()
|
|
||||||
.collect::<HashSet<_>>()
|
|
||||||
);
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
ids,
|
ids,
|
||||||
expected_ids_and_num_occurences
|
expected_ids_and_num_occurences
|
||||||
@@ -1616,42 +1594,22 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
proptest! {
|
proptest! {
|
||||||
#![proptest_config(ProptestConfig::with_cases(20))]
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_with_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
|
fn test_delete_with_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
|
||||||
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
|
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
|
||||||
}
|
}
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_without_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
|
fn test_delete_without_sort_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
|
||||||
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
|
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
|
||||||
}
|
}
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_with_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
|
fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
|
||||||
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
|
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
|
||||||
}
|
}
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_without_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
|
fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(operation_strategy(), 1..10)) {
|
||||||
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
|
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_delete_with_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
|
|
||||||
assert!(test_operation_strategy(&ops[..], true, false).is_ok());
|
|
||||||
}
|
|
||||||
#[test]
|
|
||||||
fn test_delete_without_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
|
|
||||||
assert!(test_operation_strategy(&ops[..], false, false).is_ok());
|
|
||||||
}
|
|
||||||
#[test]
|
|
||||||
fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
|
|
||||||
assert!(test_operation_strategy(&ops[..], true, true).is_ok());
|
|
||||||
}
|
|
||||||
#[test]
|
|
||||||
fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..100)) {
|
|
||||||
assert!(test_operation_strategy(&ops[..], false, true).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -1676,11 +1634,11 @@ mod tests {
|
|||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests()?;
|
||||||
|
|
||||||
// We add a doc...
|
// We add a doc...
|
||||||
index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64))?;
|
index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64));
|
||||||
// And remove it.
|
// And remove it.
|
||||||
index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
|
index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
|
||||||
// We add another doc.
|
// We add another doc.
|
||||||
index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64))?;
|
index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64));
|
||||||
|
|
||||||
// The expected result is a segment with
|
// The expected result is a segment with
|
||||||
// maxdoc = 2
|
// maxdoc = 2
|
||||||
@@ -1697,14 +1655,14 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_index_doc_missing_field() -> crate::Result<()> {
|
fn test_index_doc_missing_field() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::Schema::builder();
|
||||||
let idfield = schema_builder.add_text_field("id", STRING);
|
let idfield = schema_builder.add_text_field("id", STRING);
|
||||||
schema_builder.add_text_field("optfield", STRING);
|
schema_builder.add_text_field("optfield", STRING);
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
index_writer.add_document(doc!(idfield=>"myid"))?;
|
index_writer.add_document(doc!(idfield=>"myid"));
|
||||||
index_writer.commit()?;
|
let commit = index_writer.commit();
|
||||||
Ok(())
|
assert!(commit.is_ok());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,118 +0,0 @@
|
|||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
|
||||||
use std::sync::{Arc, RwLock};
|
|
||||||
|
|
||||||
use super::AddBatchReceiver;
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub(crate) struct IndexWriterStatus {
|
|
||||||
inner: Arc<Inner>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl IndexWriterStatus {
|
|
||||||
/// Returns true iff the index writer is alive.
|
|
||||||
pub fn is_alive(&self) -> bool {
|
|
||||||
self.inner.as_ref().is_alive()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns a copy of the operation receiver.
|
|
||||||
/// If the index writer was killed, returns None.
|
|
||||||
pub fn operation_receiver(&self) -> Option<AddBatchReceiver> {
|
|
||||||
let rlock = self
|
|
||||||
.inner
|
|
||||||
.receive_channel
|
|
||||||
.read()
|
|
||||||
.expect("This lock should never be poisoned");
|
|
||||||
rlock.as_ref().cloned()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create an index writer bomb.
|
|
||||||
/// If dropped, the index writer status will be killed.
|
|
||||||
pub(crate) fn create_bomb(&self) -> IndexWriterBomb {
|
|
||||||
IndexWriterBomb {
|
|
||||||
inner: Some(self.inner.clone()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Inner {
|
|
||||||
is_alive: AtomicBool,
|
|
||||||
receive_channel: RwLock<Option<AddBatchReceiver>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Inner {
|
|
||||||
fn is_alive(&self) -> bool {
|
|
||||||
self.is_alive.load(Ordering::Relaxed)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn kill(&self) {
|
|
||||||
self.is_alive.store(false, Ordering::Relaxed);
|
|
||||||
self.receive_channel
|
|
||||||
.write()
|
|
||||||
.expect("This lock should never be poisoned")
|
|
||||||
.take();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<AddBatchReceiver> for IndexWriterStatus {
|
|
||||||
fn from(receiver: AddBatchReceiver) -> Self {
|
|
||||||
IndexWriterStatus {
|
|
||||||
inner: Arc::new(Inner {
|
|
||||||
is_alive: AtomicBool::new(true),
|
|
||||||
receive_channel: RwLock::new(Some(receiver)),
|
|
||||||
}),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// If dropped, the index writer will be killed.
|
|
||||||
/// To prevent this, clients can call `.defuse()`.
|
|
||||||
pub(crate) struct IndexWriterBomb {
|
|
||||||
inner: Option<Arc<Inner>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl IndexWriterBomb {
|
|
||||||
/// Defuses the bomb.
|
|
||||||
///
|
|
||||||
/// This is the only way to drop the bomb without killing
|
|
||||||
/// the index writer.
|
|
||||||
pub fn defuse(mut self) {
|
|
||||||
self.inner = None;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Drop for IndexWriterBomb {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
if let Some(inner) = self.inner.take() {
|
|
||||||
inner.kill();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::IndexWriterStatus;
|
|
||||||
use crossbeam::channel;
|
|
||||||
use std::mem;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_bomb_goes_boom() {
|
|
||||||
let (_tx, rx) = channel::bounded(10);
|
|
||||||
let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx);
|
|
||||||
assert!(index_writer_status.operation_receiver().is_some());
|
|
||||||
let bomb = index_writer_status.create_bomb();
|
|
||||||
assert!(index_writer_status.operation_receiver().is_some());
|
|
||||||
mem::drop(bomb);
|
|
||||||
// boom!
|
|
||||||
assert!(index_writer_status.operation_receiver().is_none());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_bomb_defused() {
|
|
||||||
let (_tx, rx) = channel::bounded(10);
|
|
||||||
let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx);
|
|
||||||
assert!(index_writer_status.operation_receiver().is_some());
|
|
||||||
let bomb = index_writer_status.create_bomb();
|
|
||||||
bomb.defuse();
|
|
||||||
assert!(index_writer_status.operation_receiver().is_some());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
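Note: the file removed above implemented a small drop-guard. IndexWriterStatus tracks whether the writer is alive, and IndexWriterBomb flips it to dead on drop unless defuse() is called, so a panicking worker thread automatically kills the writer. Below is a minimal sketch of the same pattern, using only std types instead of tantivy's AddBatchReceiver plumbing (names are illustrative, not tantivy's API):

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

#[derive(Clone)]
struct Status {
    alive: Arc<AtomicBool>,
}

// Drop-guard: if dropped without being defused, the shared status dies.
struct Bomb {
    status: Option<Status>,
}

impl Bomb {
    // After defuse(), dropping the guard is a no-op.
    fn defuse(mut self) {
        self.status = None;
    }
}

impl Drop for Bomb {
    fn drop(&mut self) {
        if let Some(status) = self.status.take() {
            status.alive.store(false, Ordering::Relaxed);
        }
    }
}

fn main() {
    let status = Status { alive: Arc::new(AtomicBool::new(true)) };
    let bomb = Bomb { status: Some(status.clone()) };
    // A worker that returns early or panics never reaches defuse(),
    // so other threads observe alive == false.
    bomb.defuse();
    assert!(status.alive.load(Ordering::Relaxed));
}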
@@ -2,15 +2,12 @@ use super::merge_policy::{MergeCandidate, MergePolicy};
|
|||||||
use crate::core::SegmentMeta;
|
use crate::core::SegmentMeta;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use std::cmp;
|
use std::cmp;
|
||||||
|
use std::f64;
|
||||||
|
|
||||||
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
|
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
|
||||||
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
|
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
|
||||||
const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
|
const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
|
||||||
const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;
|
const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;
|
||||||
// The default value of 1 means that deletes are not taken into account when
|
|
||||||
// identifying merge candidates. This is not a very sensible default: it was
|
|
||||||
// set like that for backward compatibility and might change in the near future.
|
|
||||||
const DEFAULT_DEL_DOCS_RATIO_BEFORE_MERGE: f32 = 1.0f32;
|
|
||||||
|
|
||||||
/// `LogMergePolicy` tries to merge segments that have a similar number of
|
/// `LogMergePolicy` tries to merge segments that have a similar number of
|
||||||
/// documents.
|
/// documents.
|
||||||
@@ -20,7 +17,6 @@ pub struct LogMergePolicy {
|
|||||||
max_docs_before_merge: usize,
|
max_docs_before_merge: usize,
|
||||||
min_layer_size: u32,
|
min_layer_size: u32,
|
||||||
level_log_size: f64,
|
level_log_size: f64,
|
||||||
del_docs_ratio_before_merge: f32,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LogMergePolicy {
|
impl LogMergePolicy {
|
||||||
@@ -56,49 +52,19 @@ impl LogMergePolicy {
|
|||||||
pub fn set_level_log_size(&mut self, level_log_size: f64) {
|
pub fn set_level_log_size(&mut self, level_log_size: f64) {
|
||||||
self.level_log_size = level_log_size;
|
self.level_log_size = level_log_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Set the ratio of deleted documents in a segment to tolerate.
|
|
||||||
///
|
|
||||||
/// If it is exceeded by any segment at a log level, a merge
|
|
||||||
/// will be triggered for that level.
|
|
||||||
///
|
|
||||||
/// If there is a single segment at a level, we effectively end up expunging
|
|
||||||
/// deleted documents from it.
|
|
||||||
///
|
|
||||||
/// # Panics
|
|
||||||
///
|
|
||||||
/// Panics if del_docs_ratio_before_merge is not within (0..1].
|
|
||||||
pub fn set_del_docs_ratio_before_merge(&mut self, del_docs_ratio_before_merge: f32) {
|
|
||||||
assert!(del_docs_ratio_before_merge <= 1.0f32);
|
|
||||||
assert!(del_docs_ratio_before_merge > 0f32);
|
|
||||||
self.del_docs_ratio_before_merge = del_docs_ratio_before_merge;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn has_segment_above_deletes_threshold(&self, level: &[&SegmentMeta]) -> bool {
|
|
||||||
level
|
|
||||||
.iter()
|
|
||||||
.any(|segment| deletes_ratio(segment) > self.del_docs_ratio_before_merge)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn deletes_ratio(segment: &SegmentMeta) -> f32 {
|
|
||||||
if segment.max_doc() == 0 {
|
|
||||||
return 0f32;
|
|
||||||
}
|
|
||||||
segment.num_deleted_docs() as f32 / segment.max_doc() as f32
|
|
||||||
}
|
}
|
||||||
|
|
||||||
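With the new knob, a level is considered for merging as soon as any of its segments exceeds the configured deleted-docs ratio; for a single-segment level this effectively expunges its deletes. A hedged usage sketch follows (import paths and the surrounding writer are assumptions, not part of this diff):

use tantivy::merge_policy::LogMergePolicy;
use tantivy::IndexWriter;

// Merge a level once one of its segments is more than 25% deleted.
// With max_doc = 40_000 that means strictly more than 10_000 deleted docs
// (10_001 / 40_000 > 0.25), which is exactly the boundary exercised in the tests below.
fn enable_delete_aware_merging(index_writer: &mut IndexWriter) {
    let mut merge_policy = LogMergePolicy::default();
    merge_policy.set_del_docs_ratio_before_merge(0.25f32);
    index_writer.set_merge_policy(Box::new(merge_policy));
}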
impl MergePolicy for LogMergePolicy {
|
impl MergePolicy for LogMergePolicy {
|
||||||
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
|
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
|
||||||
let size_sorted_segments = segments
|
let mut size_sorted_segments = segments
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|seg| seg.num_docs() <= (self.max_docs_before_merge as u32))
|
.filter(|segment_meta| segment_meta.num_docs() <= (self.max_docs_before_merge as u32))
|
||||||
.sorted_by_key(|seg| std::cmp::Reverse(seg.max_doc()))
|
|
||||||
.collect::<Vec<&SegmentMeta>>();
|
.collect::<Vec<&SegmentMeta>>();
|
||||||
|
|
||||||
if size_sorted_segments.is_empty() {
|
if size_sorted_segments.len() <= 1 {
|
||||||
return vec![];
|
return vec![];
|
||||||
}
|
}
|
||||||
|
size_sorted_segments.sort_by_key(|seg| std::cmp::Reverse(seg.num_docs()));
|
||||||
|
|
||||||
let mut current_max_log_size = f64::MAX;
|
let mut current_max_log_size = f64::MAX;
|
||||||
let mut levels = vec![];
|
let mut levels = vec![];
|
||||||
@@ -116,10 +82,7 @@ impl MergePolicy for LogMergePolicy {
|
|||||||
|
|
||||||
levels
|
levels
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|level| {
|
.filter(|level| level.len() >= self.min_num_segments)
|
||||||
level.len() >= self.min_num_segments
|
|
||||||
|| self.has_segment_above_deletes_threshold(level)
|
|
||||||
})
|
|
||||||
.map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
|
.map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
@@ -132,7 +95,6 @@ impl Default for LogMergePolicy {
|
|||||||
max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
|
max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
|
||||||
min_layer_size: DEFAULT_MIN_LAYER_SIZE,
|
min_layer_size: DEFAULT_MIN_LAYER_SIZE,
|
||||||
level_log_size: DEFAULT_LEVEL_LOG_SIZE,
|
level_log_size: DEFAULT_LEVEL_LOG_SIZE,
|
||||||
del_docs_ratio_before_merge: DEFAULT_DEL_DOCS_RATIO_BEFORE_MERGE,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -152,7 +114,7 @@ mod tests {
|
|||||||
use crate::Index;
|
use crate::Index;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn create_index_test_max_merge_issue_1035() -> crate::Result<()> {
|
fn create_index_test_max_merge_issue_1035() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::Schema::builder();
|
||||||
let int_field = schema_builder.add_u64_field("intval", INDEXED);
|
let int_field = schema_builder.add_u64_field("intval", INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
@@ -165,34 +127,34 @@ mod tests {
|
|||||||
log_merge_policy.set_max_docs_before_merge(1);
|
log_merge_policy.set_max_docs_before_merge(1);
|
||||||
log_merge_policy.set_min_layer_size(0);
|
log_merge_policy.set_min_layer_size(0);
|
||||||
|
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
index_writer.set_merge_policy(Box::new(log_merge_policy));
|
index_writer.set_merge_policy(Box::new(log_merge_policy));
|
||||||
|
|
||||||
// after every commit the merge checker is started; it will merge only segments with 1
|
// after every commit the merge checker is started; it will merge only segments with 1
|
||||||
// element in them because of the max_merge_size.
|
// element in them because of the max_merge_size.
|
||||||
index_writer.add_document(doc!(int_field=>1_u64))?;
|
index_writer.add_document(doc!(int_field=>1_u64));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
index_writer.add_document(doc!(int_field=>2_u64))?;
|
index_writer.add_document(doc!(int_field=>2_u64));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
index_writer.add_document(doc!(int_field=>3_u64))?;
|
index_writer.add_document(doc!(int_field=>3_u64));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
index_writer.add_document(doc!(int_field=>4_u64))?;
|
index_writer.add_document(doc!(int_field=>4_u64));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
index_writer.add_document(doc!(int_field=>5_u64))?;
|
index_writer.add_document(doc!(int_field=>5_u64));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
index_writer.add_document(doc!(int_field=>6_u64))?;
|
index_writer.add_document(doc!(int_field=>6_u64));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
index_writer.add_document(doc!(int_field=>7_u64))?;
|
index_writer.add_document(doc!(int_field=>7_u64));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
index_writer.add_document(doc!(int_field=>8_u64))?;
|
index_writer.add_document(doc!(int_field=>8_u64));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
}
|
}
|
||||||
|
|
||||||
let _segment_ids = index
|
let _segment_ids = index
|
||||||
@@ -207,7 +169,6 @@ mod tests {
|
|||||||
panic!("segment can't have more than two segments");
|
panic!("segment can't have more than two segments");
|
||||||
} // we don't know how to wait for the merge to finish; otherwise this could be a simple equality check
|
} // we don't know how to wait for the merge to finish; otherwise this could be a simple equality check
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_merge_policy() -> LogMergePolicy {
|
fn test_merge_policy() -> LogMergePolicy {
|
||||||
@@ -326,49 +287,4 @@ mod tests {
|
|||||||
assert_eq!(result_list[0].0[1], test_input[4].id());
|
assert_eq!(result_list[0].0[1], test_input[4].id());
|
||||||
assert_eq!(result_list[0].0[2], test_input[5].id());
|
assert_eq!(result_list[0].0[2], test_input[5].id());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_merge_single_segment_with_deletes_below_threshold() {
|
|
||||||
let mut test_merge_policy = test_merge_policy();
|
|
||||||
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
|
|
||||||
let test_input = vec![create_random_segment_meta(40_000).with_delete_meta(10_000, 1)];
|
|
||||||
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
|
|
||||||
assert!(merge_candidates.is_empty());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_merge_single_segment_with_deletes_above_threshold() {
|
|
||||||
let mut test_merge_policy = test_merge_policy();
|
|
||||||
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
|
|
||||||
let test_input = vec![create_random_segment_meta(40_000).with_delete_meta(10_001, 1)];
|
|
||||||
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
|
|
||||||
assert_eq!(merge_candidates.len(), 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_merge_segments_with_deletes_above_threshold_all_in_level() {
|
|
||||||
let mut test_merge_policy = test_merge_policy();
|
|
||||||
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
|
|
||||||
let test_input = vec![
|
|
||||||
create_random_segment_meta(40_000).with_delete_meta(10_001, 1),
|
|
||||||
create_random_segment_meta(40_000),
|
|
||||||
];
|
|
||||||
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
|
|
||||||
assert_eq!(merge_candidates.len(), 1);
|
|
||||||
assert_eq!(merge_candidates[0].0.len(), 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_merge_segments_with_deletes_above_threshold_different_level_not_involved() {
|
|
||||||
let mut test_merge_policy = test_merge_policy();
|
|
||||||
test_merge_policy.set_del_docs_ratio_before_merge(0.25f32);
|
|
||||||
let test_input = vec![
|
|
||||||
create_random_segment_meta(100),
|
|
||||||
create_random_segment_meta(40_000).with_delete_meta(10_001, 1),
|
|
||||||
];
|
|
||||||
let merge_candidates = test_merge_policy.compute_merge_candidates(&test_input);
|
|
||||||
assert_eq!(merge_candidates.len(), 1);
|
|
||||||
assert_eq!(merge_candidates[0].0.len(), 1);
|
|
||||||
assert_eq!(merge_candidates[0].0[0], test_input[1].id());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use crate::Opstamp;
|
use crate::Opstamp;
|
||||||
use crate::SegmentId;
|
use crate::SegmentId;
|
||||||
use crate::{Inventory, TrackedObject};
|
use census::{Inventory, TrackedObject};
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
use crate::error::DataCorruption;
|
use crate::error::DataCorruption;
|
||||||
use crate::fastfield::AliveBitSet;
|
|
||||||
use crate::fastfield::CompositeFastFieldSerializer;
|
use crate::fastfield::CompositeFastFieldSerializer;
|
||||||
use crate::fastfield::DynamicFastFieldReader;
|
use crate::fastfield::DynamicFastFieldReader;
|
||||||
use crate::fastfield::FastFieldDataAccess;
|
use crate::fastfield::FastFieldDataAccess;
|
||||||
@@ -10,7 +9,7 @@ use crate::fastfield::MultiValuedFastFieldReader;
|
|||||||
use crate::fieldnorm::FieldNormsSerializer;
|
use crate::fieldnorm::FieldNormsSerializer;
|
||||||
use crate::fieldnorm::FieldNormsWriter;
|
use crate::fieldnorm::FieldNormsWriter;
|
||||||
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
|
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
|
||||||
use crate::indexer::doc_id_mapping::SegmentDocIdMapping;
|
use crate::indexer::doc_id_mapping::SegmentDocidMapping;
|
||||||
use crate::indexer::SegmentSerializer;
|
use crate::indexer::SegmentSerializer;
|
||||||
use crate::postings::Postings;
|
use crate::postings::Postings;
|
||||||
use crate::postings::{InvertedIndexSerializer, SegmentPostings};
|
use crate::postings::{InvertedIndexSerializer, SegmentPostings};
|
||||||
@@ -41,54 +40,31 @@ use tantivy_bitpacker::minmax;
|
|||||||
/// We do not allow segments with more than
|
/// We do not allow segments with more than
|
||||||
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
|
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
|
||||||
|
|
||||||
fn estimate_total_num_tokens_in_single_segment(
|
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
|
||||||
reader: &SegmentReader,
|
let mut total_tokens = 0u64;
|
||||||
field: Field,
|
let mut count: [usize; 256] = [0; 256];
|
||||||
) -> crate::Result<u64> {
|
for reader in readers {
|
||||||
// There are no deletes. We can simply use the exact value saved into the posting list.
|
if reader.has_deletes() {
|
||||||
// Note that this value is not necessarily exact as it could have been the result of a merge between
|
// if there are deletes, then we use an approximation
|
||||||
// segments themselves containing deletes.
|
// using the fieldnorm
|
||||||
if !reader.has_deletes() {
|
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
|
||||||
return Ok(reader.inverted_index(field)?.total_num_tokens());
|
for doc in reader.doc_ids_alive() {
|
||||||
}
|
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
|
||||||
|
count[fieldnorm_id as usize] += 1;
|
||||||
// When there are deletes, we use an approximation either
|
}
|
||||||
// by using the fieldnorm.
|
} else {
|
||||||
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
|
total_tokens += reader.inverted_index(field)?.total_num_tokens();
|
||||||
let mut count: [usize; 256] = [0; 256];
|
|
||||||
for doc in reader.doc_ids_alive() {
|
|
||||||
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
|
|
||||||
count[fieldnorm_id as usize] += 1;
|
|
||||||
}
|
}
|
||||||
let total_num_tokens = count
|
}
|
||||||
|
Ok(total_tokens
|
||||||
|
+ count
|
||||||
.iter()
|
.iter()
|
||||||
.cloned()
|
.cloned()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.map(|(fieldnorm_ord, count)| {
|
.map(|(fieldnorm_ord, count)| {
|
||||||
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
|
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
|
||||||
})
|
})
|
||||||
.sum::<u64>();
|
.sum::<u64>())
|
||||||
return Ok(total_num_tokens);
|
|
||||||
}
|
|
||||||
|
|
||||||
// There are no fieldnorms available.
|
|
||||||
// Here we just do a pro-rata with the overall number of tokens and the ratio of
|
|
||||||
// documents alive.
|
|
||||||
let segment_num_tokens = reader.inverted_index(field)?.total_num_tokens();
|
|
||||||
if reader.max_doc() == 0 {
|
|
||||||
// That supposedly never happens, but let's be a bit defensive here.
|
|
||||||
return Ok(0u64);
|
|
||||||
}
|
|
||||||
let ratio = reader.num_docs() as f64 / reader.max_doc() as f64;
|
|
||||||
Ok((segment_num_tokens as f64 * ratio) as u64)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
|
|
||||||
let mut total_num_tokens: u64 = 0;
|
|
||||||
for reader in readers {
|
|
||||||
total_num_tokens += estimate_total_num_tokens_in_single_segment(reader, field)?;
|
|
||||||
}
|
|
||||||
Ok(total_num_tokens)
|
|
||||||
}
|
}
|
||||||
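The fallback branch above is a plain proportional estimate: scale the token count recorded at indexing time by the share of documents that are still alive. A standalone sketch of that arithmetic (not tantivy's actual function):

fn prorata_token_estimate(segment_num_tokens: u64, num_docs: u32, max_doc: u32) -> u64 {
    if max_doc == 0 {
        return 0; // defensive: an empty segment contributes no tokens
    }
    let alive_ratio = num_docs as f64 / max_doc as f64;
    (segment_num_tokens as f64 * alive_ratio) as u64
}

fn main() {
    // 1_000 tokens recorded, 80 of 100 documents still alive -> roughly 800 tokens.
    assert_eq!(prorata_token_estimate(1_000, 80, 100), 800);
}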
|
|
||||||
pub struct IndexMerger {
|
pub struct IndexMerger {
|
||||||
@@ -180,38 +156,16 @@ impl IndexMerger {
|
|||||||
schema: Schema,
|
schema: Schema,
|
||||||
index_settings: IndexSettings,
|
index_settings: IndexSettings,
|
||||||
segments: &[Segment],
|
segments: &[Segment],
|
||||||
) -> crate::Result<IndexMerger> {
|
|
||||||
let delete_bitsets = segments.iter().map(|_| None).collect_vec();
|
|
||||||
Self::open_with_custom_alive_set(schema, index_settings, segments, delete_bitsets)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create merge with a custom delete set.
|
|
||||||
// For every Segment, a delete bitset can be provided, which
|
|
||||||
// will be merged with the existing bit set. Make sure the index
|
|
||||||
// corresponds to the segment index.
|
|
||||||
//
|
|
||||||
// If `None` is provided for custom alive set, the regular alive set will be used.
|
|
||||||
// If a delete_bitsets is provided, the union between the provided and regular
|
|
||||||
// alive set will be used.
|
|
||||||
//
|
|
||||||
// This can be used to merge but also apply an additional filter.
|
|
||||||
// One use case is demux, which is basically taking a list of
|
|
||||||
// segments and partitions them e.g. by a value in a field.
|
|
||||||
pub fn open_with_custom_alive_set(
|
|
||||||
schema: Schema,
|
|
||||||
index_settings: IndexSettings,
|
|
||||||
segments: &[Segment],
|
|
||||||
alive_bitset_opt: Vec<Option<AliveBitSet>>,
|
|
||||||
) -> crate::Result<IndexMerger> {
|
) -> crate::Result<IndexMerger> {
|
||||||
let mut readers = vec![];
|
let mut readers = vec![];
|
||||||
for (segment, new_alive_bitset_opt) in segments.iter().zip(alive_bitset_opt.into_iter()) {
|
let mut max_doc: u32 = 0u32;
|
||||||
|
for segment in segments {
|
||||||
if segment.meta().num_docs() > 0 {
|
if segment.meta().num_docs() > 0 {
|
||||||
let reader =
|
let reader = SegmentReader::open(segment)?;
|
||||||
SegmentReader::open_with_custom_alive_set(segment, new_alive_bitset_opt)?;
|
max_doc += reader.num_docs();
|
||||||
readers.push(reader);
|
readers.push(reader);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let max_doc = readers.iter().map(|reader| reader.num_docs()).sum();
|
|
||||||
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
|
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
|
||||||
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
|
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
|
||||||
}
|
}
|
||||||
@@ -259,7 +213,7 @@ impl IndexMerger {
|
|||||||
fn write_fieldnorms(
|
fn write_fieldnorms(
|
||||||
&self,
|
&self,
|
||||||
mut fieldnorms_serializer: FieldNormsSerializer,
|
mut fieldnorms_serializer: FieldNormsSerializer,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
|
let fields = FieldNormsWriter::fields_with_fieldnorm(&self.schema);
|
||||||
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
|
let mut fieldnorms_data = Vec::with_capacity(self.max_doc as usize);
|
||||||
@@ -287,17 +241,17 @@ impl IndexMerger {
|
|||||||
&self,
|
&self,
|
||||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||||
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
|
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
debug_time!("write_fast_fields");
|
debug_time!("write_fast_fields");
|
||||||
|
|
||||||
for (field, field_entry) in self.schema.fields() {
|
for (field, field_entry) in self.schema.fields() {
|
||||||
let field_type = field_entry.field_type();
|
let field_type = field_entry.field_type();
|
||||||
match field_type {
|
match field_type {
|
||||||
FieldType::Facet(_) => {
|
FieldType::HierarchicalFacet(_) => {
|
||||||
let term_ordinal_mapping = term_ord_mappings
|
let term_ordinal_mapping = term_ord_mappings
|
||||||
.remove(&field)
|
.remove(&field)
|
||||||
.expect("Logic Error in Tantivy (Please report). Facet field should have required a\
|
.expect("Logic Error in Tantivy (Please report). HierarchicalFact field should have required a\
|
||||||
`term_ordinal_mapping`.");
|
`term_ordinal_mapping`.");
|
||||||
self.write_hierarchical_facet_field(
|
self.write_hierarchical_facet_field(
|
||||||
field,
|
field,
|
||||||
@@ -338,15 +292,16 @@ impl IndexMerger {
|
|||||||
&self,
|
&self,
|
||||||
field: Field,
|
field: Field,
|
||||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let (min_value, max_value) = self.readers.iter().filter_map(|reader|{
|
let (min_value, max_value) = self.readers.iter().map(|reader|{
|
||||||
let u64_reader: DynamicFastFieldReader<u64> = reader
|
let u64_reader: DynamicFastFieldReader<u64> = reader
|
||||||
.fast_fields()
|
.fast_fields()
|
||||||
.typed_fast_field_reader(field)
|
.typed_fast_field_reader(field)
|
||||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||||
compute_min_max_val(&u64_reader, reader)
|
compute_min_max_val(&u64_reader, reader)
|
||||||
})
|
})
|
||||||
|
.flatten()
|
||||||
.reduce(|a, b| {
|
.reduce(|a, b| {
|
||||||
(a.0.min(b.0), a.1.max(b.1))
|
(a.0.min(b.0), a.1.max(b.1))
|
||||||
}).expect("Unexpected error, empty readers in IndexMerger");
|
}).expect("Unexpected error, empty readers in IndexMerger");
|
||||||
@@ -369,17 +324,17 @@ impl IndexMerger {
|
|||||||
num_vals: doc_id_mapping.len() as u64,
|
num_vals: doc_id_mapping.len() as u64,
|
||||||
};
|
};
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
struct SortedDocIdFieldAccessProvider<'a> {
|
struct SortedDocidFieldAccessProvider<'a> {
|
||||||
doc_id_mapping: &'a SegmentDocIdMapping,
|
doc_id_mapping: &'a SegmentDocidMapping,
|
||||||
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
|
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
|
||||||
}
|
}
|
||||||
impl<'a> FastFieldDataAccess for SortedDocIdFieldAccessProvider<'a> {
|
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
|
||||||
fn get_val(&self, doc: u64) -> u64 {
|
fn get_val(&self, doc: u64) -> u64 {
|
||||||
let (doc_id, reader_ordinal) = self.doc_id_mapping[doc as usize];
|
let (doc_id, reader_ordinal) = self.doc_id_mapping[doc as usize];
|
||||||
self.fast_field_readers[reader_ordinal as usize].get(doc_id)
|
self.fast_field_readers[reader_ordinal as usize].get(doc_id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let fastfield_accessor = SortedDocIdFieldAccessProvider {
|
let fastfield_accessor = SortedDocidFieldAccessProvider {
|
||||||
doc_id_mapping,
|
doc_id_mapping,
|
||||||
fast_field_readers: &fast_field_readers,
|
fast_field_readers: &fast_field_readers,
|
||||||
};
|
};
|
||||||
@@ -434,9 +389,9 @@ impl IndexMerger {
|
|||||||
Ok(value_accessor)
|
Ok(value_accessor)
|
||||||
}
|
}
|
||||||
/// Collecting value_accessors into a vec to bind the lifetime.
|
/// Collecting value_accessors into a vec to bind the lifetime.
|
||||||
pub(crate) fn get_reader_with_sort_field_accessor(
|
pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>(
|
||||||
&self,
|
&'a self,
|
||||||
sort_by_field: &IndexSortByField,
|
sort_by_field: &'b IndexSortByField,
|
||||||
) -> crate::Result<Vec<(SegmentOrdinal, impl FastFieldReader<u64> + Clone)>> {
|
) -> crate::Result<Vec<(SegmentOrdinal, impl FastFieldReader<u64> + Clone)>> {
|
||||||
let reader_ordinal_and_field_accessors = self
|
let reader_ordinal_and_field_accessors = self
|
||||||
.readers
|
.readers
|
||||||
@@ -461,7 +416,7 @@ impl IndexMerger {
|
|||||||
pub(crate) fn generate_doc_id_mapping(
|
pub(crate) fn generate_doc_id_mapping(
|
||||||
&self,
|
&self,
|
||||||
sort_by_field: &IndexSortByField,
|
sort_by_field: &IndexSortByField,
|
||||||
) -> crate::Result<SegmentDocIdMapping> {
|
) -> crate::Result<SegmentDocidMapping> {
|
||||||
let reader_ordinal_and_field_accessors =
|
let reader_ordinal_and_field_accessors =
|
||||||
self.get_reader_with_sort_field_accessor(sort_by_field)?;
|
self.get_reader_with_sort_field_accessor(sort_by_field)?;
|
||||||
// Loading the field accessor on demand causes a 15x regression
|
// Loading the field accessor on demand causes a 15x regression
|
||||||
@@ -504,7 +459,7 @@ impl IndexMerger {
|
|||||||
})
|
})
|
||||||
.map(|(doc_id, reader_with_id, _)| (doc_id, reader_with_id)),
|
.map(|(doc_id, reader_with_id, _)| (doc_id, reader_with_id)),
|
||||||
);
|
);
|
||||||
Ok(SegmentDocIdMapping::new(sorted_doc_ids, false))
|
Ok(SegmentDocidMapping::new(sorted_doc_ids, false))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Creating the index file to point into the data, generic over `BytesFastFieldReader` and
|
// Creating the index file to point into the data, generic over `BytesFastFieldReader` and
|
||||||
@@ -513,7 +468,7 @@ impl IndexMerger {
|
|||||||
fn write_1_n_fast_field_idx_generic<T: MultiValueLength>(
|
fn write_1_n_fast_field_idx_generic<T: MultiValueLength>(
|
||||||
field: Field,
|
field: Field,
|
||||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
reader_and_field_accessors: &[(&SegmentReader, T)],
|
reader_and_field_accessors: &[(&SegmentReader, T)],
|
||||||
) -> crate::Result<Vec<u64>> {
|
) -> crate::Result<Vec<u64>> {
|
||||||
let mut total_num_vals = 0u64;
|
let mut total_num_vals = 0u64;
|
||||||
@@ -572,7 +527,7 @@ impl IndexMerger {
|
|||||||
&self,
|
&self,
|
||||||
field: Field,
|
field: Field,
|
||||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
) -> crate::Result<Vec<u64>> {
|
) -> crate::Result<Vec<u64>> {
|
||||||
let reader_ordinal_and_field_accessors = self.readers.iter().map(|reader|{
|
let reader_ordinal_and_field_accessors = self.readers.iter().map(|reader|{
|
||||||
let u64s_reader: MultiValuedFastFieldReader<u64> = reader.fast_fields()
|
let u64s_reader: MultiValuedFastFieldReader<u64> = reader.fast_fields()
|
||||||
@@ -594,7 +549,7 @@ impl IndexMerger {
|
|||||||
field: Field,
|
field: Field,
|
||||||
term_ordinal_mappings: &TermOrdinalMapping,
|
term_ordinal_mappings: &TermOrdinalMapping,
|
||||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
debug_time!("write_hierarchical_facet_field");
|
debug_time!("write_hierarchical_facet_field");
|
||||||
|
|
||||||
@@ -643,7 +598,7 @@ impl IndexMerger {
|
|||||||
|
|
||||||
/// Creates a mapping if the segments are stacked. This is helpful to share code paths between index
|
/// Creates a mapping if the segments are stacked. This is helpful to share code paths between index
|
||||||
/// sorting and the others
|
/// sorting and the others
|
||||||
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocIdMapping> {
|
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocidMapping> {
|
||||||
let total_num_new_docs = self
|
let total_num_new_docs = self
|
||||||
.readers
|
.readers
|
||||||
.iter()
|
.iter()
|
||||||
@@ -656,19 +611,20 @@ impl IndexMerger {
|
|||||||
self.readers
|
self.readers
|
||||||
.iter()
|
.iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.flat_map(|(reader_ordinal, reader)| {
|
.map(|(reader_ordinal, reader)| {
|
||||||
reader
|
reader
|
||||||
.doc_ids_alive()
|
.doc_ids_alive()
|
||||||
.map(move |doc_id| (doc_id, reader_ordinal as SegmentOrdinal))
|
.map(move |doc_id| (doc_id, reader_ordinal as SegmentOrdinal))
|
||||||
}),
|
})
|
||||||
|
.flatten(),
|
||||||
);
|
);
|
||||||
Ok(SegmentDocIdMapping::new(mapping, true))
|
Ok(SegmentDocidMapping::new(mapping, true))
|
||||||
}
|
}
|
||||||
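In the stacked case the mapping is just the concatenation of each segment's alive doc ids, tagged with the segment ordinal. A tiny illustration of that shape (plain tuples rather than tantivy's SegmentDocIdMapping):

fn main() {
    // Two segments whose alive doc ids are [0, 1, 2] and [0, 1].
    let alive_per_segment: Vec<Vec<u32>> = vec![vec![0, 1, 2], vec![0, 1]];
    let mapping: Vec<(u32, u32)> = alive_per_segment
        .iter()
        .enumerate()
        .flat_map(|(segment_ord, doc_ids)| {
            doc_ids.iter().map(move |&doc_id| (doc_id, segment_ord as u32))
        })
        .collect();
    // (old_doc_id, segment_ordinal) pairs, in stacking order.
    assert_eq!(mapping, vec![(0, 0), (1, 0), (2, 0), (0, 1), (1, 1)]);
}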
fn write_multi_fast_field(
|
fn write_multi_fast_field(
|
||||||
&self,
|
&self,
|
||||||
field: Field,
|
field: Field,
|
||||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
// Multifastfield consists of 2 fastfields.
|
// Multifastfield consists of 2 fastfields.
|
||||||
// The first serves as an index into the second one and is strictly increasing.
|
// The first serves as an index into the second one and is strictly increasing.
|
||||||
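Concretely, the pair of fast fields looks like this for three documents with values [7, 8], [9] and [10, 11, 12] (illustration only, plain slices instead of fast field readers):

fn main() {
    let idx = [0usize, 2, 3, 6];          // start offsets, strictly increasing (last entry = total length)
    let vals = [7u64, 8, 9, 10, 11, 12];  // all values, stacked in doc order
    let doc = 2;
    // Doc d's values live in vals[idx[d]..idx[d + 1]].
    assert_eq!(&vals[idx[doc]..idx[doc + 1]], &[10u64, 11, 12]);
}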
@@ -724,16 +680,16 @@ impl IndexMerger {
|
|||||||
min_value,
|
min_value,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct SortedDocIdMultiValueAccessProvider<'a> {
|
struct SortedDocidMultiValueAccessProvider<'a> {
|
||||||
doc_id_mapping: &'a SegmentDocIdMapping,
|
doc_id_mapping: &'a SegmentDocidMapping,
|
||||||
fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>,
|
fast_field_readers: &'a Vec<MultiValuedFastFieldReader<u64>>,
|
||||||
offsets: Vec<u64>,
|
offsets: Vec<u64>,
|
||||||
}
|
}
|
||||||
impl<'a> FastFieldDataAccess for SortedDocIdMultiValueAccessProvider<'a> {
|
impl<'a> FastFieldDataAccess for SortedDocidMultiValueAccessProvider<'a> {
|
||||||
fn get_val(&self, pos: u64) -> u64 {
|
fn get_val(&self, pos: u64) -> u64 {
|
||||||
// use the offsets index to find the doc_id which will contain the position.
|
// use the offsets index to find the doc_id which will contain the position.
|
||||||
// the offsets are strictly increasing so we can do a simple search on them.
|
// the offsets are strictly increasing so we can do a simple search on them.
|
||||||
let new_doc_id = self
|
let new_docid = self
|
||||||
.offsets
|
.offsets
|
||||||
.iter()
|
.iter()
|
||||||
.position(|&offset| offset > pos)
|
.position(|&offset| offset > pos)
|
||||||
@@ -741,10 +697,10 @@ impl IndexMerger {
|
|||||||
- 1;
|
- 1;
|
||||||
|
|
||||||
// now we need to find the position of `pos` in the multivalued bucket
|
// now we need to find the position of `pos` in the multivalued bucket
|
||||||
let num_pos_covered_until_now = self.offsets[new_doc_id];
|
let num_pos_covered_until_now = self.offsets[new_docid];
|
||||||
let pos_in_values = pos - num_pos_covered_until_now;
|
let pos_in_values = pos - num_pos_covered_until_now;
|
||||||
|
|
||||||
let (old_doc_id, reader_ordinal) = self.doc_id_mapping[new_doc_id as usize];
|
let (old_doc_id, reader_ordinal) = self.doc_id_mapping[new_docid as usize];
|
||||||
let num_vals = self.fast_field_readers[reader_ordinal as usize].get_len(old_doc_id);
|
let num_vals = self.fast_field_readers[reader_ordinal as usize].get_len(old_doc_id);
|
||||||
assert!(num_vals >= pos_in_values);
|
assert!(num_vals >= pos_in_values);
|
||||||
let mut vals = vec![];
|
let mut vals = vec![];
|
||||||
@@ -753,23 +709,29 @@ impl IndexMerger {
|
|||||||
vals[pos_in_values as usize]
|
vals[pos_in_values as usize]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
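The position lookup above can be checked on a small example: offsets[d] is the number of values that precede new doc d, so the first offset strictly greater than pos identifies the owning doc. A standalone sketch of that search, mirroring the position(..) - 1 logic outside tantivy's types:

fn locate(offsets: &[u64], pos: u64) -> (usize, u64) {
    // offsets is strictly increasing, starts at 0 and ends with the total number of values.
    let new_doc_id = offsets
        .iter()
        .position(|&offset| offset > pos)
        .expect("pos out of bounds")
        - 1;
    let pos_in_values = pos - offsets[new_doc_id];
    (new_doc_id, pos_in_values)
}

fn main() {
    // Doc 0 has 2 values, doc 1 has 3, doc 2 has 1 -> offsets [0, 2, 5, 6].
    let offsets = [0u64, 2, 5, 6];
    assert_eq!(locate(&offsets, 0), (0, 0));
    assert_eq!(locate(&offsets, 3), (1, 1)); // second value of doc 1
    assert_eq!(locate(&offsets, 5), (2, 0)); // only value of doc 2
}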
let fastfield_accessor = SortedDocIdMultiValueAccessProvider {
|
let fastfield_accessor = SortedDocidMultiValueAccessProvider {
|
||||||
doc_id_mapping,
|
doc_id_mapping,
|
||||||
fast_field_readers: &ff_readers,
|
fast_field_readers: &ff_readers,
|
||||||
offsets,
|
offsets,
|
||||||
};
|
};
|
||||||
let iter1 = doc_id_mapping.iter().flat_map(|(doc_id, reader_ordinal)| {
|
let iter1 = doc_id_mapping
|
||||||
let ff_reader = &ff_readers[*reader_ordinal as usize];
|
.iter()
|
||||||
let mut vals = vec![];
|
.map(|(doc_id, reader_ordinal)| {
|
||||||
ff_reader.get_vals(*doc_id, &mut vals);
|
let ff_reader = &ff_readers[*reader_ordinal as usize];
|
||||||
vals.into_iter()
|
let mut vals = vec![];
|
||||||
});
|
ff_reader.get_vals(*doc_id, &mut vals);
|
||||||
let iter2 = doc_id_mapping.iter().flat_map(|(doc_id, reader_ordinal)| {
|
vals.into_iter()
|
||||||
let ff_reader = &ff_readers[*reader_ordinal as usize];
|
})
|
||||||
let mut vals = vec![];
|
.flatten();
|
||||||
ff_reader.get_vals(*doc_id, &mut vals);
|
let iter2 = doc_id_mapping
|
||||||
vals.into_iter()
|
.iter()
|
||||||
});
|
.map(|(doc_id, reader_ordinal)| {
|
||||||
|
let ff_reader = &ff_readers[*reader_ordinal as usize];
|
||||||
|
let mut vals = vec![];
|
||||||
|
ff_reader.get_vals(*doc_id, &mut vals);
|
||||||
|
vals.into_iter()
|
||||||
|
})
|
||||||
|
.flatten();
|
||||||
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
|
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
|
||||||
field,
|
field,
|
||||||
stats,
|
stats,
|
||||||
@@ -786,7 +748,7 @@ impl IndexMerger {
|
|||||||
&self,
|
&self,
|
||||||
field: Field,
|
field: Field,
|
||||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let reader_and_field_accessors = self
|
let reader_and_field_accessors = self
|
||||||
.readers
|
.readers
|
||||||
@@ -822,7 +784,7 @@ impl IndexMerger {
|
|||||||
field_type: &FieldType,
|
field_type: &FieldType,
|
||||||
serializer: &mut InvertedIndexSerializer,
|
serializer: &mut InvertedIndexSerializer,
|
||||||
fieldnorm_reader: Option<FieldNormReader>,
|
fieldnorm_reader: Option<FieldNormReader>,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
) -> crate::Result<Option<TermOrdinalMapping>> {
|
) -> crate::Result<Option<TermOrdinalMapping>> {
|
||||||
debug_time!("write_postings_for_field");
|
debug_time!("write_postings_for_field");
|
||||||
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
|
let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
|
||||||
@@ -844,7 +806,7 @@ impl IndexMerger {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let mut term_ord_mapping_opt = match field_type {
|
let mut term_ord_mapping_opt = match field_type {
|
||||||
FieldType::Facet(_) => Some(TermOrdinalMapping::new(max_term_ords)),
|
FieldType::HierarchicalFacet(_) => Some(TermOrdinalMapping::new(max_term_ords)),
|
||||||
_ => None,
|
_ => None,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -861,14 +823,15 @@ impl IndexMerger {
|
|||||||
segment_local_map
|
segment_local_map
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
for (new_doc_id, (old_doc_id, segment_ord)) in doc_id_mapping.iter().enumerate() {
|
for (new_doc_id, (old_doc_id, segment_ordinal)) in doc_id_mapping.iter().enumerate() {
|
||||||
let segment_map = &mut merged_doc_id_map[*segment_ord as usize];
|
let segment_map = &mut merged_doc_id_map[*segment_ordinal as usize];
|
||||||
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
|
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Note that the total number of tokens is not exact.
|
// The total number of tokens will only be exact when there has been no deletes.
|
||||||
// It is only used as a parameter in the BM25 formula.
|
//
|
||||||
let total_num_tokens: u64 = estimate_total_num_tokens(&self.readers, indexed_field)?;
|
// Otherwise, we approximate by removing deleted documents proportionally.
|
||||||
|
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?;
|
||||||
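For context (an aside, not part of this diff): the merged total_num_tokens only feeds BM25's length normalization, where it is divided by the document count to obtain the average field length, so a small estimation error merely nudges scores. Roughly, as a sketch rather than tantivy's exact code:

fn average_field_len(total_num_tokens: u64, num_docs: u64) -> f32 {
    // BM25 length normalization uses: k1 * (1 - b + b * field_len / avg_field_len)
    if num_docs == 0 {
        return 0.0;
    }
    total_num_tokens as f32 / num_docs as f32
}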
|
|
||||||
// Create the total list of doc ids
|
// Create the total list of doc ids
|
||||||
// by stacking the doc ids from the different segment.
|
// by stacking the doc ids from the different segment.
|
||||||
@@ -903,7 +866,7 @@ impl IndexMerger {
|
|||||||
let mut total_doc_freq = 0;
|
let mut total_doc_freq = 0;
|
||||||
|
|
||||||
// Let's compute the list of non-empty posting lists
|
// Let's compute the list of non-empty posting lists
|
||||||
for (segment_ord, term_info) in merged_terms.current_segment_ords_and_term_infos() {
|
for (segment_ord, term_info) in merged_terms.current_segment_ordinals_and_term_infos() {
|
||||||
let segment_reader = &self.readers[segment_ord];
|
let segment_reader = &self.readers[segment_ord];
|
||||||
let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord];
|
let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord];
|
||||||
let segment_postings = inverted_index
|
let segment_postings = inverted_index
|
||||||
@@ -953,9 +916,9 @@ impl IndexMerger {
|
|||||||
// there is at least one document.
|
// there is at least one document.
|
||||||
let term_freq = segment_postings.term_freq();
|
let term_freq = segment_postings.term_freq();
|
||||||
segment_postings.positions(&mut positions_buffer);
|
segment_postings.positions(&mut positions_buffer);
|
||||||
// if doc_id_mapping exists, the doc_ids are reordered, they are
|
// if doc_id_mapping exists, the docids are reordered, they are
|
||||||
// not just stacked. The field serializer expects monotonically increasing
|
// not just stacked. The field serializer expects monotonically increasing
|
||||||
// doc_ids, so we collect and sort them first, before writing.
|
// docids, so we collect and sort them first, before writing.
|
||||||
//
|
//
|
||||||
// I think this is not strictly necessary, it would be possible to
|
// I think this is not strictly necessary, it would be possible to
|
||||||
// avoid the loading into a vec via some form of kmerge, but then the merge
|
// avoid the loading into a vec via some form of kmerge, but then the merge
|
||||||
@@ -995,7 +958,7 @@ impl IndexMerger {
|
|||||||
&self,
|
&self,
|
||||||
serializer: &mut InvertedIndexSerializer,
|
serializer: &mut InvertedIndexSerializer,
|
||||||
fieldnorm_readers: FieldNormReaders,
|
fieldnorm_readers: FieldNormReaders,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
) -> crate::Result<HashMap<Field, TermOrdinalMapping>> {
|
) -> crate::Result<HashMap<Field, TermOrdinalMapping>> {
|
||||||
let mut term_ordinal_mappings = HashMap::new();
|
let mut term_ordinal_mappings = HashMap::new();
|
||||||
for (field, field_entry) in self.schema.fields() {
|
for (field, field_entry) in self.schema.fields() {
|
||||||
@@ -1018,7 +981,7 @@ impl IndexMerger {
|
|||||||
fn write_storable_fields(
|
fn write_storable_fields(
|
||||||
&self,
|
&self,
|
||||||
store_writer: &mut StoreWriter,
|
store_writer: &mut StoreWriter,
|
||||||
doc_id_mapping: &SegmentDocIdMapping,
|
doc_id_mapping: &SegmentDocidMapping,
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
debug_time!("write_storable_fields");
|
debug_time!("write_storable_fields");
|
||||||
|
|
||||||
@@ -1060,7 +1023,7 @@ impl IndexMerger {
|
|||||||
// the doc stores would be on average half full, given total randomness (which
|
// the doc stores would be on average half full, given total randomness (which
|
||||||
// is not the case here, but not sure how it behaves exactly).
|
// is not the case here, but not sure how it behaves exactly).
|
||||||
//
|
//
|
||||||
// https://github.com/quickwit-inc/tantivy/issues/1053
|
// https://github.com/tantivy-search/tantivy/issues/1053
|
||||||
//
|
//
|
||||||
// take 7 in order to not walk over all checkpoints.
|
// take 7 in order to not walk over all checkpoints.
|
||||||
|| store_reader.block_checkpoints().take(7).count() < 6
|
|| store_reader.block_checkpoints().take(7).count() < 6
|
||||||
@@ -1132,13 +1095,13 @@ mod tests {
|
|||||||
use crate::query::BooleanQuery;
|
use crate::query::BooleanQuery;
|
||||||
use crate::query::Scorer;
|
use crate::query::Scorer;
|
||||||
use crate::query::TermQuery;
|
use crate::query::TermQuery;
|
||||||
|
use crate::schema::Document;
|
||||||
use crate::schema::Facet;
|
use crate::schema::Facet;
|
||||||
use crate::schema::IndexRecordOption;
|
use crate::schema::IndexRecordOption;
|
||||||
use crate::schema::IntOptions;
|
use crate::schema::IntOptions;
|
||||||
use crate::schema::Term;
|
use crate::schema::Term;
|
||||||
use crate::schema::TextFieldIndexing;
|
use crate::schema::TextFieldIndexing;
|
||||||
use crate::schema::{Cardinality, TEXT};
|
use crate::schema::{Cardinality, TEXT};
|
||||||
use crate::schema::{Document, FacetOptions};
|
|
||||||
use crate::DocAddress;
|
use crate::DocAddress;
|
||||||
use crate::IndexSettings;
|
use crate::IndexSettings;
|
||||||
use crate::IndexSortByField;
|
use crate::IndexSortByField;
|
||||||
@@ -1176,17 +1139,18 @@ mod tests {
|
|||||||
score_field => 3u64,
|
score_field => 3u64,
|
||||||
date_field => curr_time,
|
date_field => curr_time,
|
||||||
bytes_score_field => 3u32.to_be_bytes().as_ref()
|
bytes_score_field => 3u32.to_be_bytes().as_ref()
|
||||||
))?;
|
));
|
||||||
|
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
text_field => "a b c",
|
text_field => "a b c",
|
||||||
score_field => 5u64,
|
score_field => 5u64,
|
||||||
bytes_score_field => 5u32.to_be_bytes().as_ref()
|
bytes_score_field => 5u32.to_be_bytes().as_ref()
|
||||||
))?;
|
));
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
text_field => "a b c d",
|
text_field => "a b c d",
|
||||||
score_field => 7u64,
|
score_field => 7u64,
|
||||||
bytes_score_field => 7u32.to_be_bytes().as_ref()
|
bytes_score_field => 7u32.to_be_bytes().as_ref()
|
||||||
))?;
|
));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
// writing the segment
|
// writing the segment
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
@@ -1194,12 +1158,12 @@ mod tests {
|
|||||||
date_field => curr_time,
|
date_field => curr_time,
|
||||||
score_field => 11u64,
|
score_field => 11u64,
|
||||||
bytes_score_field => 11u32.to_be_bytes().as_ref()
|
bytes_score_field => 11u32.to_be_bytes().as_ref()
|
||||||
))?;
|
));
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
text_field => "a b c g",
|
text_field => "a b c g",
|
||||||
score_field => 13u64,
|
score_field => 13u64,
|
||||||
bytes_score_field => 13u32.to_be_bytes().as_ref()
|
bytes_score_field => 13u32.to_be_bytes().as_ref()
|
||||||
))?;
|
));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
@@ -1333,18 +1297,18 @@ mod tests {
|
|||||||
text_field => "a b d",
|
text_field => "a b d",
|
||||||
score_field => 1u64,
|
score_field => 1u64,
|
||||||
bytes_score_field => vec![0u8, 0, 0, 1],
|
bytes_score_field => vec![0u8, 0, 0, 1],
|
||||||
))?;
|
));
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
text_field => "b c",
|
text_field => "b c",
|
||||||
score_field => 2u64,
|
score_field => 2u64,
|
||||||
bytes_score_field => vec![0u8, 0, 0, 2],
|
bytes_score_field => vec![0u8, 0, 0, 2],
|
||||||
))?;
|
));
|
||||||
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
text_field => "c d",
|
text_field => "c d",
|
||||||
score_field => 3u64,
|
score_field => 3u64,
|
||||||
bytes_score_field => vec![0u8, 0, 0, 3],
|
bytes_score_field => vec![0u8, 0, 0, 3],
|
||||||
))?;
|
));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
reader.reload()?;
|
reader.reload()?;
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
@@ -1374,24 +1338,24 @@ mod tests {
|
|||||||
text_field => "a d e",
|
text_field => "a d e",
|
||||||
score_field => 4_000u64,
|
score_field => 4_000u64,
|
||||||
bytes_score_field => vec![0u8, 0, 0, 4],
|
bytes_score_field => vec![0u8, 0, 0, 4],
|
||||||
))?;
|
));
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
text_field => "e f",
|
text_field => "e f",
|
||||||
score_field => 5_000u64,
|
score_field => 5_000u64,
|
||||||
bytes_score_field => vec![0u8, 0, 0, 5],
|
bytes_score_field => vec![0u8, 0, 0, 5],
|
||||||
))?;
|
));
|
||||||
index_writer.delete_term(Term::from_field_text(text_field, "a"));
|
index_writer.delete_term(Term::from_field_text(text_field, "a"));
|
||||||
index_writer.delete_term(Term::from_field_text(text_field, "f"));
|
index_writer.delete_term(Term::from_field_text(text_field, "f"));
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
text_field => "f g",
|
text_field => "f g",
|
||||||
score_field => 6_000u64,
|
score_field => 6_000u64,
|
||||||
bytes_score_field => vec![0u8, 0, 23, 112],
|
bytes_score_field => vec![0u8, 0, 23, 112],
|
||||||
))?;
|
));
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
text_field => "g h",
|
text_field => "g h",
|
||||||
score_field => 7_000u64,
|
score_field => 7_000u64,
|
||||||
bytes_score_field => vec![0u8, 0, 27, 88],
|
bytes_score_field => vec![0u8, 0, 27, 88],
|
||||||
))?;
|
));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
reader.reload()?;
|
reader.reload()?;
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
@@ -1609,7 +1573,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_facets_sort_asc() {
|
fn test_merge_facets_sort_asc() {
|
||||||
// In the merge case this will go through the doc_id mapping code
|
// In the merge case this will go through the docid mapping code
|
||||||
test_merge_facets(
|
test_merge_facets(
|
||||||
Some(IndexSettings {
|
Some(IndexSettings {
|
||||||
sort_by_field: Some(IndexSortByField {
|
sort_by_field: Some(IndexSortByField {
|
||||||
@@ -1620,7 +1584,7 @@ mod tests {
|
|||||||
}),
|
}),
|
||||||
true,
|
true,
|
||||||
);
|
);
|
||||||
// In the merge case this will not go through the doc_id mapping code, because the data is
|
// In the merge case this will not go through the docid mapping code, because the data is
|
||||||
// sorted and disjunct
|
// sorted and disjunct
|
||||||
test_merge_facets(
|
test_merge_facets(
|
||||||
Some(IndexSettings {
|
Some(IndexSettings {
|
||||||
@@ -1636,7 +1600,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_facets_sort_desc() {
|
fn test_merge_facets_sort_desc() {
|
||||||
// In the merge case this will go through the doc_id mapping code
|
// In the merge case this will go through the docid mapping code
|
||||||
test_merge_facets(
|
test_merge_facets(
|
||||||
Some(IndexSettings {
|
Some(IndexSettings {
|
||||||
sort_by_field: Some(IndexSortByField {
|
sort_by_field: Some(IndexSortByField {
|
||||||
@@ -1647,7 +1611,7 @@ mod tests {
|
|||||||
}),
|
}),
|
||||||
true,
|
true,
|
||||||
);
|
);
|
||||||
// In the merge case this will not go through the doc_id mapping code, because the data is
|
// In the merge case this will not go through the docid mapping code, because the data is
|
||||||
// sorted and disjunct
|
// sorted and disjunct
|
||||||
test_merge_facets(
|
test_merge_facets(
|
||||||
Some(IndexSettings {
|
Some(IndexSettings {
|
||||||
@@ -1664,7 +1628,7 @@ mod tests {
|
|||||||
// ranges between segments so that merge algorithm can't apply certain optimizations
|
// ranges between segments so that merge algorithm can't apply certain optimizations
|
||||||
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
|
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::Schema::builder();
|
||||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||||
let int_options = IntOptions::default()
|
let int_options = IntOptions::default()
|
||||||
.set_fast(Cardinality::SingleValue)
|
.set_fast(Cardinality::SingleValue)
|
||||||
.set_indexed();
|
.set_indexed();
|
||||||
@@ -1687,7 +1651,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
doc.add_u64(int_field, *int_val);
|
doc.add_u64(int_field, *int_val);
|
||||||
*int_val += 1;
|
*int_val += 1;
|
||||||
index_writer.add_document(doc).unwrap();
|
index_writer.add_document(doc);
|
||||||
};
|
};
|
||||||
|
|
||||||
index_doc(
|
index_doc(
|
||||||
@@ -1800,69 +1764,70 @@ mod tests {
 }
 
 #[test]
-fn test_bug_merge() -> crate::Result<()> {
+fn test_bug_merge() {
 let mut schema_builder = schema::Schema::builder();
 let int_field = schema_builder.add_u64_field("intvals", INDEXED);
 let index = Index::create_in_ram(schema_builder.build());
 let mut index_writer = index.writer_for_tests().unwrap();
-index_writer.add_document(doc!(int_field => 1u64))?;
+index_writer.add_document(doc!(int_field => 1u64));
 index_writer.commit().expect("commit failed");
-index_writer.add_document(doc!(int_field => 1u64))?;
+index_writer.add_document(doc!(int_field => 1u64));
 index_writer.commit().expect("commit failed");
-let reader = index.reader()?;
+let reader = index.reader().unwrap();
 let searcher = reader.searcher();
 assert_eq!(searcher.num_docs(), 2);
 index_writer.delete_term(Term::from_field_u64(int_field, 1));
 let segment_ids = index
 .searchable_segment_ids()
 .expect("Searchable segments failed.");
-block_on(index_writer.merge(&segment_ids))?;
-reader.reload()?;
+block_on(index_writer.merge(&segment_ids)).expect("Merging failed");
+reader.reload().unwrap();
 // commit has not been called yet. The document should still be
 // there.
 assert_eq!(reader.searcher().num_docs(), 2);
-Ok(())
 }
 
 #[test]
-fn test_merge_multivalued_int_fields_all_deleted() -> crate::Result<()> {
+fn test_merge_multivalued_int_fields_all_deleted() {
 let mut schema_builder = schema::Schema::builder();
 let int_options = IntOptions::default()
 .set_fast(Cardinality::MultiValues)
 .set_indexed();
 let int_field = schema_builder.add_u64_field("intvals", int_options);
 let index = Index::create_in_ram(schema_builder.build());
-let reader = index.reader()?;
+let reader = index.reader().unwrap();
 {
-let mut index_writer = index.writer_for_tests()?;
+let mut index_writer = index.writer_for_tests().unwrap();
 let mut doc = Document::default();
 doc.add_u64(int_field, 1);
-index_writer.add_document(doc.clone())?;
-index_writer.commit()?;
-index_writer.add_document(doc)?;
-index_writer.commit()?;
+index_writer.add_document(doc.clone());
+assert!(index_writer.commit().is_ok());
+index_writer.add_document(doc);
+assert!(index_writer.commit().is_ok());
 index_writer.delete_term(Term::from_field_u64(int_field, 1));
-let segment_ids = index.searchable_segment_ids()?;
-block_on(index_writer.merge(&segment_ids))?;
+let segment_ids = index
+.searchable_segment_ids()
+.expect("Searchable segments failed.");
+assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
 
 // assert delete has not been committed
-reader.reload()?;
+assert!(reader.reload().is_ok());
 let searcher = reader.searcher();
 assert_eq!(searcher.num_docs(), 2);
 
-index_writer.commit()?;
+index_writer.commit().unwrap();
 
-index_writer.wait_merging_threads()?;
+index_writer.wait_merging_threads().unwrap();
 }
 
-reader.reload()?;
+reader.reload().unwrap();
 let searcher = reader.searcher();
 assert_eq!(searcher.num_docs(), 0);
-Ok(())
 }
 
 #[test]
-fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
+fn test_merge_multivalued_int_fields_simple() {
 let mut schema_builder = schema::Schema::builder();
 let int_options = IntOptions::default()
 .set_fast(Cardinality::MultiValues)
@@ -1871,13 +1836,13 @@ mod tests {
 let index = Index::create_in_ram(schema_builder.build());
 
 {
-let mut index_writer = index.writer_for_tests()?;
+let mut index_writer = index.writer_for_tests().unwrap();
 let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
 let mut doc = Document::default();
 for &val in int_vals {
 doc.add_u64(int_field, val);
 }
-index_writer.add_document(doc).unwrap();
+index_writer.add_document(doc);
 };
 index_doc(&mut index_writer, &[1, 2]);
 index_doc(&mut index_writer, &[1, 2, 3]);
@@ -1893,7 +1858,7 @@ mod tests {
 index_doc(&mut index_writer, &[1_000]);
 assert!(index_writer.commit().is_ok());
 }
-let reader = index.reader()?;
+let reader = index.reader().unwrap();
 let searcher = reader.searcher();
 let mut vals: Vec<u64> = Vec::new();
 
@@ -1942,12 +1907,14 @@ mod tests {
 
 // Merging the segments
 {
-let segment_ids = index.searchable_segment_ids()?;
-let mut index_writer = index.writer_for_tests()?;
-block_on(index_writer.merge(&segment_ids))?;
-index_writer.wait_merging_threads()?;
+let segment_ids = index
+.searchable_segment_ids()
+.expect("Searchable segments failed.");
+let mut index_writer = index.writer_for_tests().unwrap();
+assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
+assert!(index_writer.wait_merging_threads().is_ok());
 }
-reader.reload()?;
+assert!(reader.reload().is_ok());
 
 {
 let searcher = reader.searcher();
@@ -1984,7 +1951,6 @@ mod tests {
 ff_reader.get_vals(9, &mut vals);
 assert_eq!(&vals, &[20]);
 }
-Ok(())
 }
 
 #[test]
@@ -2010,7 +1976,7 @@ mod tests {
 doc.add_f64(field, 42.0);
 doc.add_f64(multi_field, 0.24);
 doc.add_f64(multi_field, 0.27);
-writer.add_document(doc)?;
+writer.add_document(doc);
 if i % 5 == 0 {
 writer.commit()?;
 }
@@ -2034,7 +2000,7 @@ mod tests {
 let happy_term = Term::from_field_text(text, "happy");
 let term_query = TermQuery::new(happy_term, IndexRecordOption::WithFreqs);
 for _ in 0..62 {
-writer.add_document(doc!(text=>"hello happy tax payer"))?;
+writer.add_document(doc!(text=>"hello happy tax payer"));
 }
 writer.commit()?;
 let reader = index.reader()?;
@@ -2046,7 +2012,7 @@ mod tests {
 assert_nearly_equals!(term_scorer.block_max_score(), 0.0079681855);
 assert_nearly_equals!(term_scorer.score(), 0.0079681855);
 for _ in 0..81 {
-writer.add_document(doc!(text=>"hello happy tax payer"))?;
+writer.add_document(doc!(text=>"hello happy tax payer"));
 }
 writer.commit()?;
 reader.reload()?;
@@ -1,17 +1,22 @@
 #[cfg(test)]
 mod tests {
-use crate::collector::TopDocs;
-use crate::core::Index;
-use crate::fastfield::MultiValuedFastFieldReader;
 use crate::fastfield::{AliveBitSet, FastFieldReader};
-use crate::query::QueryParser;
-use crate::schema::{
-self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, TextFieldIndexing,
+use crate::schema::IndexRecordOption;
+use crate::{
+collector::TopDocs,
+schema::{Cardinality, TextFieldIndexing},
+};
+use crate::{core::Index, fastfield::MultiValuedFastFieldReader};
+use crate::{
+query::QueryParser,
+schema::{IntOptions, TextOptions},
+};
+use crate::{schema::Facet, IndexSortByField};
+use crate::{schema::INDEXED, Order};
+use crate::{
+schema::{self, BytesOptions},
+DocAddress,
 };
-use crate::schema::{IntOptions, TextOptions};
-use crate::DocAddress;
-use crate::IndexSortByField;
-use crate::Order;
 use crate::{DocSet, IndexSettings, Postings, Term};
 use futures::executor::block_on;
 
@@ -22,7 +27,7 @@ mod tests {
 .set_indexed();
 let int_field = schema_builder.add_u64_field("intval", int_options);
 
-let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+let facet_field = schema_builder.add_facet_field("facet", INDEXED);
 
 let schema = schema_builder.build();
 
@@ -34,17 +39,14 @@ mod tests {
 
 {
 let mut index_writer = index.writer_for_tests().unwrap();
-index_writer
-.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
-.unwrap();
-index_writer
-.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")))
-.unwrap();
-index_writer.commit().unwrap();
-index_writer
-.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")))
-.unwrap();
-index_writer.commit().unwrap();
+index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")));
+index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")));
+
+assert!(index_writer.commit().is_ok());
+index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")));
+
+assert!(index_writer.commit().is_ok());
+
 }
 
 // Merging the segments
@@ -64,7 +66,7 @@ mod tests {
 fn create_test_index(
 index_settings: Option<IndexSettings>,
 force_disjunct_segment_sort_values: bool,
-) -> crate::Result<Index> {
+) -> Index {
 let mut schema_builder = schema::Schema::builder();
 let int_options = IntOptions::default()
 .set_fast(Cardinality::SingleValue)
@@ -74,7 +76,7 @@ mod tests {
 
 let bytes_options = BytesOptions::default().set_fast().set_indexed();
 let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
-let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+let facet_field = schema_builder.add_facet_field("facet", INDEXED);
 
 let multi_numbers = schema_builder.add_u64_field(
 "multi_numbers",
@@ -93,34 +95,34 @@ mod tests {
 if let Some(settings) = index_settings {
 index_builder = index_builder.settings(settings);
 }
-let index = index_builder.create_in_ram()?;
+let index = index_builder.create_in_ram().unwrap();
 
 {
-let mut index_writer = index.writer_for_tests()?;
+let mut index_writer = index.writer_for_tests().unwrap();
 
 // segment 1 - range 1-3
-index_writer.add_document(doc!(int_field=>1_u64))?;
+index_writer.add_document(doc!(int_field=>1_u64));
 index_writer.add_document(
 doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
-)?;
+);
 index_writer.add_document(
 doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
-)?;
+);
 index_writer.add_document(
 doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
-)?;
+);
 
-index_writer.commit()?;
+assert!(index_writer.commit().is_ok());
 // segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
-index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
+index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64));
 
 let in_val = if force_disjunct_segment_sort_values {
 10_u64
 } else {
 1
 };
-index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
-index_writer.commit()?;
+index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")));
+assert!(index_writer.commit().is_ok());
 // segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
 let int_vals = if force_disjunct_segment_sort_values {
 [100_u64, 50]
@@ -129,24 +131,26 @@ mod tests {
 };
 index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
 doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
-)?;
-index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
+);
+index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"));
 index_writer.add_document(
 doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
-)?;
+);
 
 index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
-index_writer.commit()?;
+assert!(index_writer.commit().is_ok());
 }
 
 // Merging the segments
 {
-let segment_ids = index.searchable_segment_ids()?;
-let mut index_writer = index.writer_for_tests()?;
-block_on(index_writer.merge(&segment_ids))?;
-index_writer.wait_merging_threads()?;
+let segment_ids = index
+.searchable_segment_ids()
+.expect("Searchable segments failed.");
+let mut index_writer = index.writer_for_tests().unwrap();
+assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
+assert!(index_writer.wait_merging_threads().is_ok());
 }
-Ok(index)
+index
 }
 
 #[test]
@@ -179,8 +183,7 @@ mod tests {
 ..Default::default()
 }),
 force_disjunct_segment_sort_values,
-)
-.unwrap();
+);
 
 let int_field = index.schema().get_field("intval").unwrap();
 let reader = index.reader().unwrap();
@@ -297,8 +300,7 @@ mod tests {
 ..Default::default()
 }),
 false,
-)
-.unwrap();
+);
 
 let reader = index.reader().unwrap();
 let searcher = reader.searcher();
@@ -365,8 +367,7 @@ mod tests {
 ..Default::default()
 }),
 false,
-)
-.unwrap();
+);
 
 let int_field = index.schema().get_field("intval").unwrap();
 let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
@@ -516,7 +517,7 @@ mod bench_sorted_index_merge {
 let index_doc = |index_writer: &mut IndexWriter, val: u64| {
 let mut doc = Document::default();
 doc.add_u64(int_field, val);
-index_writer.add_document(doc).unwrap();
+index_writer.add_document(doc);
 };
 // 3 segments with 10_000 values in the fast fields
 for _ in 0..3 {
@@ -553,7 +554,7 @@ mod bench_sorted_index_merge {
 .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
 (doc_id, reader, u64_reader)
 });
-// add values in order of the new doc_ids
+// add values in order of the new docids
 let mut val = 0;
 for (doc_id, _reader, field_reader) in sorted_doc_ids {
 val = field_reader.get(*doc_id);
@@ -566,7 +567,7 @@ mod bench_sorted_index_merge {
 Ok(())
 }
 #[bench]
-fn create_sorted_index_create_doc_id_mapping(b: &mut Bencher) -> crate::Result<()> {
+fn create_sorted_index_create_docid_mapping(b: &mut Bencher) -> crate::Result<()> {
 let sort_by_field = IndexSortByField {
 field: "intval".to_string(),
 order: Order::Desc,
@@ -1,17 +1,15 @@
 pub mod delete_queue;
 
-pub mod demuxer;
 pub mod doc_id_mapping;
 mod doc_opstamp_mapping;
 pub mod index_writer;
-mod index_writer_status;
 mod log_merge_policy;
 mod merge_operation;
 pub mod merge_policy;
 pub mod merger;
 mod merger_sorted_index_test;
 pub mod operation;
-pub mod prepared_commit;
+mod prepared_commit;
 mod segment_entry;
 mod segment_manager;
 mod segment_register;
@@ -20,11 +18,6 @@ pub mod segment_updater;
 mod segment_writer;
 mod stamper;
 
-use crossbeam::channel;
-use smallvec::SmallVec;
-
-use crate::indexer::operation::AddOperation;
-
 pub use self::index_writer::IndexWriter;
 pub use self::log_merge_policy::LogMergePolicy;
 pub use self::merge_operation::MergeOperation;
@@ -33,23 +26,12 @@ pub use self::prepared_commit::PreparedCommit;
 pub use self::segment_entry::SegmentEntry;
 pub use self::segment_manager::SegmentManager;
 pub use self::segment_serializer::SegmentSerializer;
-pub use self::segment_updater::merge_filtered_segments;
-pub use self::segment_updater::merge_indices;
+pub use self::segment_updater::merge_segments;
 pub use self::segment_writer::SegmentWriter;
 
 /// Alias for the default merge policy, which is the `LogMergePolicy`.
 pub type DefaultMergePolicy = LogMergePolicy;
 
-// Batch of documents.
-// Most of the time, users will send operation one-by-one, but it can be useful to
-// send them as a small block to ensure that
-// - all docs in the operation will happen on the same segment and continuous doc_ids.
-// - all operations in the group are committed at the same time, making the group
-// atomic.
-type AddBatch = SmallVec<[AddOperation; 4]>;
-type AddBatchSender = channel::Sender<AddBatch>;
-type AddBatchReceiver = channel::Receiver<AddBatch>;
-
 #[cfg(feature = "mmap")]
 #[cfg(test)]
 mod tests_mmap {
@@ -57,20 +39,19 @@ mod tests_mmap {
 use crate::{Index, Term};
 
 #[test]
-fn test_advance_delete_bug() -> crate::Result<()> {
+fn test_advance_delete_bug() {
 let mut schema_builder = Schema::builder();
 let text_field = schema_builder.add_text_field("text", schema::TEXT);
-let index = Index::create_from_tempdir(schema_builder.build())?;
-let mut index_writer = index.writer_for_tests()?;
+let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
+let mut index_writer = index.writer_for_tests().unwrap();
 // there must be one deleted document in the segment
-index_writer.add_document(doc!(text_field=>"b"))?;
+index_writer.add_document(doc!(text_field=>"b"));
 index_writer.delete_term(Term::from_field_text(text_field, "b"));
 // we need enough data to trigger the bug (at least 32 documents)
 for _ in 0..32 {
-index_writer.add_document(doc!(text_field=>"c"))?;
+index_writer.add_document(doc!(text_field=>"c"));
 }
-index_writer.commit()?;
-index_writer.commit()?;
-Ok(())
+index_writer.commit().unwrap();
+index_writer.commit().unwrap();
 }
 }
@@ -18,38 +18,25 @@ impl<'a> PreparedCommit<'a> {
 }
 }
 
-/// Returns the opstamp associated to the prepared commit.
 pub fn opstamp(&self) -> Opstamp {
 self.opstamp
 }
 
-/// Adds an arbitrary payload to the commit.
 pub fn set_payload(&mut self, payload: &str) {
 self.payload = Some(payload.to_string())
 }
 
-/// Rollbacks any change.
 pub fn abort(self) -> crate::Result<Opstamp> {
 self.index_writer.rollback()
 }
 
-/// Proceeds to commit.
-/// See `.commit_async()`.
 pub fn commit(self) -> crate::Result<Opstamp> {
-block_on(self.commit_async())
-}
-
-/// Proceeds to commit.
-///
-/// Unfortunately, contrary to what `PrepareCommit` may suggests,
-/// this operation is not at all really light.
-/// At this point deletes have not been flushed yet.
-pub async fn commit_async(self) -> crate::Result<Opstamp> {
 info!("committing {}", self.opstamp);
-self.index_writer
-.segment_updater()
-.schedule_commit(self.opstamp, self.payload)
-.await?;
+let _ = block_on(
+self.index_writer
+.segment_updater()
+.schedule_commit(self.opstamp, self.payload),
+);
 Ok(self.opstamp)
 }
 }
@@ -66,10 +66,13 @@ impl SegmentRegister {
 }
 
 pub fn segment_metas(&self) -> Vec<SegmentMeta> {
-self.segment_states
+let mut segment_ids: Vec<SegmentMeta> = self
+.segment_states
 .values()
 .map(|segment_entry| segment_entry.meta().clone())
-.collect()
+.collect();
+segment_ids.sort_by_key(SegmentMeta::id);
+segment_ids
 }
 
 pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {
@@ -7,7 +7,6 @@ use crate::core::SegmentId;
 use crate::core::SegmentMeta;
 use crate::core::META_FILEPATH;
 use crate::directory::{Directory, DirectoryClone, GarbageCollectionResult};
-use crate::fastfield::AliveBitSet;
 use crate::indexer::delete_queue::DeleteCursor;
 use crate::indexer::index_writer::advance_deletes;
 use crate::indexer::merge_operation::MergeOperationInventory;
@@ -20,15 +19,12 @@ use crate::indexer::{DefaultMergePolicy, MergePolicy};
 use crate::indexer::{MergeCandidate, MergeOperation};
 use crate::schema::Schema;
 use crate::Opstamp;
-use crate::TantivyError;
-use fail::fail_point;
 use futures::channel::oneshot;
 use futures::executor::{ThreadPool, ThreadPoolBuilder};
 use futures::future::Future;
 use futures::future::TryFutureExt;
 use std::borrow::BorrowMut;
 use std::collections::HashSet;
-use std::io;
 use std::io::Write;
 use std::ops::Deref;
 use std::path::PathBuf;
@@ -61,9 +57,7 @@ pub fn save_new_metas(
 payload: None,
 },
 directory,
-)?;
-directory.sync_directory()?;
-Ok(())
+)
 }
 
 /// Save the index meta file.
@@ -80,11 +74,6 @@ fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()>
 let mut buffer = serde_json::to_vec_pretty(metas)?;
 // Just adding a new line at the end of the buffer.
 writeln!(&mut buffer)?;
-fail_point!("save_metas", |msg| Err(TantivyError::from(io::Error::new(
-io::ErrorKind::Other,
-msg.unwrap_or_else(|| "Undefined".to_string())
-))));
-directory.sync_directory()?;
 directory.atomic_write(&META_FILEPATH, &buffer[..])?;
 debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
 Ok(())
@@ -170,9 +159,9 @@ fn merge(
 /// meant to work if you have an IndexWriter running for the origin indices, or
 /// the destination Index.
 #[doc(hidden)]
-pub fn merge_indices<T: Into<Box<dyn Directory>>>(
+pub fn merge_segments<Dir: Directory>(
 indices: &[Index],
-output_directory: T,
+output_directory: Dir,
 ) -> crate::Result<Index> {
 if indices.is_empty() {
 // If there are no indices to merge, there is no need to do anything.
@@ -181,8 +170,19 @@ pub fn merge_indices<T: Into<Box<dyn Directory>>>(
 ));
 }
 
+let target_schema = indices[0].schema();
 let target_settings = indices[0].settings().clone();
 
+// let's check that all of the indices have the same schema
+if indices
+.iter()
+.skip(1)
+.any(|index| index.schema() != target_schema)
+{
+return Err(crate::TantivyError::InvalidArgument(
+"Attempt to merge different schema indices".to_string(),
+));
+}
 // let's check that all of the indices have the same index settings
 if indices
 .iter()
@@ -199,61 +199,13 @@ pub fn merge_indices<T: Into<Box<dyn Directory>>>(
 segments.extend(index.searchable_segments()?);
 }
 
-let non_filter = segments.iter().map(|_| None).collect::<Vec<_>>();
-merge_filtered_segments(&segments, target_settings, non_filter, output_directory)
-}
-
-/// Advanced: Merges a list of segments from different indices in a new index.
-/// Additional you can provide a delete bitset for each segment to ignore doc_ids.
-///
-/// Returns `TantivyError` if the the indices list is empty or their
-/// schemas don't match.
-///
-/// `output_directory`: is assumed to be empty.
-///
-/// # Warning
-/// This function does NOT check or take the `IndexWriter` is running. It is not
-/// meant to work if you have an IndexWriter running for the origin indices, or
-/// the destination Index.
-#[doc(hidden)]
-pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
-segments: &[Segment],
-target_settings: IndexSettings,
-filter_doc_ids: Vec<Option<AliveBitSet>>,
-output_directory: T,
-) -> crate::Result<Index> {
-if segments.is_empty() {
-// If there are no indices to merge, there is no need to do anything.
-return Err(crate::TantivyError::InvalidArgument(
-"No segments given to marge".to_string(),
-));
-}
-
-let target_schema = segments[0].schema();
-
-// let's check that all of the indices have the same schema
-if segments
-.iter()
-.skip(1)
-.any(|index| index.schema() != target_schema)
-{
-return Err(crate::TantivyError::InvalidArgument(
-"Attempt to merge different schema indices".to_string(),
-));
-}
-
-let mut merged_index = Index::create(
-output_directory,
-target_schema.clone(),
-target_settings.clone(),
-)?;
+let mut merged_index = Index::create(output_directory, target_schema.clone(), target_settings)?;
 let merged_segment = merged_index.new_segment();
 let merged_segment_id = merged_segment.id();
-let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
+let merger: IndexMerger = IndexMerger::open(
 merged_index.schema(),
 merged_index.settings().clone(),
-segments,
-filter_doc_ids,
+&segments[..],
 )?;
 let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
 let num_docs = merger.write(segment_serializer)?;
@@ -273,7 +225,7 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
 );
 
 let index_meta = IndexMeta {
-index_settings: target_settings, // index_settings of all segments should be the same
+index_settings: indices[0].load_metas()?.index_settings, // index_settings of all segments should be the same
 segments: vec![segment_meta],
 schema: target_schema,
 opstamp: 0u64,
@@ -354,39 +306,37 @@ impl SegmentUpdater {
 *self.merge_policy.write().unwrap() = arc_merge_policy;
 }
 
-async fn schedule_task<
-T: 'static + Send,
-F: Future<Output = crate::Result<T>> + 'static + Send,
->(
+fn schedule_future<T: 'static + Send, F: Future<Output = crate::Result<T>> + 'static + Send>(
 &self,
-task: F,
-) -> crate::Result<T> {
-if !self.is_alive() {
-return Err(crate::TantivyError::SystemError(
-"Segment updater killed".to_string(),
-));
-}
+f: F,
+) -> impl Future<Output = crate::Result<T>> {
 let (sender, receiver) = oneshot::channel();
-self.pool.spawn_ok(async move {
-let task_result = task.await;
-let _ = sender.send(task_result);
+if self.is_alive() {
+self.pool.spawn_ok(async move {
+let _ = sender.send(f.await);
 });
-let task_result = receiver.await;
-task_result.unwrap_or_else(|_| {
+} else {
+let _ = sender.send(Err(crate::TantivyError::SystemError(
+"Segment updater killed".to_string(),
+)));
+}
+receiver.unwrap_or_else(|_| {
 let err_msg =
 "A segment_updater future did not success. This should never happen.".to_string();
 Err(crate::TantivyError::SystemError(err_msg))
 })
 }
 
-pub async fn schedule_add_segment(&self, segment_entry: SegmentEntry) -> crate::Result<()> {
+pub fn schedule_add_segment(
+&self,
+segment_entry: SegmentEntry,
+) -> impl Future<Output = crate::Result<()>> {
 let segment_updater = self.clone();
-self.schedule_task(async move {
+self.schedule_future(async move {
 segment_updater.segment_manager.add_segment(segment_entry);
 segment_updater.consider_merge_options().await;
 Ok(())
 })
-.await
 }
 
 /// Orders `SegmentManager` to remove all segments
@@ -453,9 +403,11 @@ impl SegmentUpdater {
 Ok(())
 }
 
-pub async fn schedule_garbage_collect(&self) -> crate::Result<GarbageCollectionResult> {
+pub fn schedule_garbage_collect(
+&self,
+) -> impl Future<Output = crate::Result<GarbageCollectionResult>> {
 let garbage_collect_future = garbage_collect_files(self.clone());
-self.schedule_task(garbage_collect_future).await
+self.schedule_future(garbage_collect_future)
 }
 
 /// List the files that are useful to the index.
@@ -473,13 +425,13 @@ impl SegmentUpdater {
 files
 }
 
-pub(crate) async fn schedule_commit(
+pub fn schedule_commit(
 &self,
 opstamp: Opstamp,
 payload: Option<String>,
-) -> crate::Result<()> {
+) -> impl Future<Output = crate::Result<()>> {
 let segment_updater: SegmentUpdater = self.clone();
-self.schedule_task(async move {
+self.schedule_future(async move {
 let segment_entries = segment_updater.purge_deletes(opstamp)?;
 segment_updater.segment_manager.commit(segment_entries);
 segment_updater.save_metas(opstamp, payload)?;
@@ -487,7 +439,6 @@
 segment_updater.consider_merge_options().await;
 Ok(())
 })
-.await
 }
 
 fn store_meta(&self, index_meta: &IndexMeta) {
@@ -562,7 +513,9 @@
 e
 );
 // ... cancel merge
-assert!(!cfg!(test), "Merge failed.");
+if cfg!(test) {
+panic!("Merge failed.");
+}
 }
 }
 });
@@ -615,14 +568,14 @@ impl SegmentUpdater {
 }
 }
 
-async fn end_merge(
+fn end_merge(
 &self,
 merge_operation: MergeOperation,
 mut after_merge_segment_entry: SegmentEntry,
-) -> crate::Result<SegmentMeta> {
+) -> impl Future<Output = crate::Result<SegmentMeta>> {
 let segment_updater = self.clone();
 let after_merge_segment_meta = after_merge_segment_entry.meta().clone();
-self.schedule_task(async move {
+let end_merge_future = self.schedule_future(async move {
 info!("End merge {:?}", after_merge_segment_entry.meta());
 {
 let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
@@ -641,8 +594,9 @@ impl SegmentUpdater {
 merge_operation.segment_ids(),
 advance_deletes_err
 );
-assert!(!cfg!(test), "Merge failed.");
+if cfg!(test) {
+panic!("Merge failed.");
+}
 // ... cancel merge
 // `merge_operations` are tracked. As it is dropped, the
 // the segment_ids will be available again for merge.
@@ -665,9 +619,8 @@ impl SegmentUpdater {
 
 let _ = garbage_collect_files(segment_updater).await;
 Ok(())
-})
-.await?;
-Ok(after_merge_segment_meta)
+});
+end_merge_future.map_ok(|_| after_merge_segment_meta)
 }
 
 /// Wait for current merging threads.
@@ -693,19 +646,11 @@ impl SegmentUpdater {
 
 #[cfg(test)]
 mod tests {
-use super::merge_indices;
-use crate::collector::TopDocs;
+use super::merge_segments;
 use crate::directory::RamDirectory;
-use crate::fastfield::AliveBitSet;
 use crate::indexer::merge_policy::tests::MergeWheneverPossible;
-use crate::indexer::merger::IndexMerger;
-use crate::indexer::segment_updater::merge_filtered_segments;
-use crate::query::QueryParser;
 use crate::schema::*;
-use crate::Directory;
-use crate::DocAddress;
 use crate::Index;
-use crate::Segment;
 
 #[test]
 fn test_delete_during_merge() -> crate::Result<()> {
@@ -718,19 +663,19 @@ mod tests {
 index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
 
 for _ in 0..100 {
-index_writer.add_document(doc!(text_field=>"a"))?;
-index_writer.add_document(doc!(text_field=>"b"))?;
+index_writer.add_document(doc!(text_field=>"a"));
+index_writer.add_document(doc!(text_field=>"b"));
 }
 index_writer.commit()?;
 
 for _ in 0..100 {
-index_writer.add_document(doc!(text_field=>"c"))?;
-index_writer.add_document(doc!(text_field=>"d"))?;
+index_writer.add_document(doc!(text_field=>"c"));
+index_writer.add_document(doc!(text_field=>"d"));
 }
 index_writer.commit()?;
 
-index_writer.add_document(doc!(text_field=>"e"))?;
-index_writer.add_document(doc!(text_field=>"f"))?;
+index_writer.add_document(doc!(text_field=>"e"));
+index_writer.add_document(doc!(text_field=>"f"));
 index_writer.commit()?;
 
 let term = Term::from_field_text(text_field, "a");
@@ -748,50 +693,6 @@ mod tests {
 Ok(())
 }
 
-#[test]
-fn delete_all_docs_min() -> crate::Result<()> {
-let mut schema_builder = Schema::builder();
-let text_field = schema_builder.add_text_field("text", TEXT);
-let index = Index::create_in_ram(schema_builder.build());
-
-// writing the segment
-let mut index_writer = index.writer_for_tests()?;
-
-for _ in 0..10 {
-index_writer.add_document(doc!(text_field=>"a"))?;
-index_writer.add_document(doc!(text_field=>"b"))?;
-}
-index_writer.commit()?;
-
-let seg_ids = index.searchable_segment_ids()?;
-// docs exist, should have at least 1 segment
-assert!(!seg_ids.is_empty());
-
-let term = Term::from_field_text(text_field, "a");
-index_writer.delete_term(term);
-index_writer.commit()?;
-
-let term = Term::from_field_text(text_field, "b");
-index_writer.delete_term(term);
-index_writer.commit()?;
-
-index_writer.wait_merging_threads()?;
-
-let reader = index.reader()?;
-assert_eq!(reader.searcher().num_docs(), 0);
-
-let seg_ids = index.searchable_segment_ids()?;
-assert!(seg_ids.is_empty());
-
-reader.reload()?;
-assert_eq!(reader.searcher().num_docs(), 0);
-// empty segments should be erased
-assert!(index.searchable_segment_metas()?.is_empty());
-assert!(reader.searcher().segment_readers().is_empty());
-
-Ok(())
-}
-
 #[test]
 fn delete_all_docs() -> crate::Result<()> {
 let mut schema_builder = Schema::builder();
@@ -802,19 +703,19 @@ mod tests {
 let mut index_writer = index.writer_for_tests()?;
 
 for _ in 0..100 {
-index_writer.add_document(doc!(text_field=>"a"))?;
-index_writer.add_document(doc!(text_field=>"b"))?;
+index_writer.add_document(doc!(text_field=>"a"));
+index_writer.add_document(doc!(text_field=>"b"));
 }
 index_writer.commit()?;
 
 for _ in 0..100 {
-index_writer.add_document(doc!(text_field=>"c"))?;
-index_writer.add_document(doc!(text_field=>"d"))?;
+index_writer.add_document(doc!(text_field=>"c"));
+index_writer.add_document(doc!(text_field=>"d"));
 }
 index_writer.commit()?;
 
-index_writer.add_document(doc!(text_field=>"e"))?;
-index_writer.add_document(doc!(text_field=>"f"))?;
+index_writer.add_document(doc!(text_field=>"e"));
+index_writer.add_document(doc!(text_field=>"f"));
 index_writer.commit()?;
 
 let seg_ids = index.searchable_segment_ids()?;
@@ -854,8 +755,8 @@ mod tests {
 // writing the segment
 let mut index_writer = index.writer_for_tests()?;
 for _ in 0..100 {
-index_writer.add_document(doc!(text_field=>"a"))?;
-index_writer.add_document(doc!(text_field=>"b"))?;
+index_writer.add_document(doc!(text_field=>"a"));
+index_writer.add_document(doc!(text_field=>"b"));
 }
 index_writer.commit()?;
 
|
|||||||
// writing two segments
|
// writing two segments
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests()?;
|
||||||
for _ in 0..100 {
|
for _ in 0..100 {
|
||||||
index_writer.add_document(doc!(text_field=>"fizz"))?;
|
index_writer.add_document(doc!(text_field=>"fizz"));
|
||||||
index_writer.add_document(doc!(text_field=>"buzz"))?;
|
index_writer.add_document(doc!(text_field=>"buzz"));
|
||||||
}
|
}
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
|
|
||||||
for _ in 0..1000 {
|
for _ in 0..1000 {
|
||||||
index_writer.add_document(doc!(text_field=>"foo"))?;
|
index_writer.add_document(doc!(text_field=>"foo"));
|
||||||
index_writer.add_document(doc!(text_field=>"bar"))?;
|
index_writer.add_document(doc!(text_field=>"bar"));
|
||||||
}
|
}
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
indices.push(index);
|
indices.push(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
assert_eq!(indices.len(), 3);
|
assert_eq!(indices.len(), 3);
|
||||||
let output_directory: Box<dyn Directory> = Box::new(RamDirectory::default());
|
let output_directory = RamDirectory::default();
|
||||||
let index = merge_indices(&indices, output_directory)?;
|
let index = merge_segments(&indices, output_directory)?;
|
||||||
assert_eq!(index.schema(), schema);
|
assert_eq!(index.schema(), schema);
|
||||||
|
|
||||||
let segments = index.searchable_segments()?;
|
let segments = index.searchable_segments()?;
|
||||||
@@ -910,7 +811,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_empty_indices_array() {
|
fn test_merge_empty_indices_array() {
|
||||||
let merge_result = merge_indices(&[], RamDirectory::default());
|
let merge_result = merge_segments(&[], RamDirectory::default());
|
||||||
assert!(merge_result.is_err());
|
assert!(merge_result.is_err());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -921,7 +822,7 @@ mod tests {
|
|||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests()?;
|
||||||
index_writer.add_document(doc!(text_field=>"some text"))?;
|
index_writer.add_document(doc!(text_field=>"some text"));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
index
|
index
|
||||||
};
|
};
|
||||||
@@ -931,197 +832,15 @@ mod tests {
 let body_field = schema_builder.add_text_field("body", TEXT);
 let index = Index::create_in_ram(schema_builder.build());
 let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(body_field=>"some body"))?;
+index_writer.add_document(doc!(body_field=>"some body"));
 index_writer.commit()?;
 index
 };
 
 // mismatched schema index list
-let result = merge_indices(&[first_index, second_index], RamDirectory::default());
+let result = merge_segments(&[first_index, second_index], RamDirectory::default());
 assert!(result.is_err());
 
 Ok(())
 }
-
-#[test]
-fn test_merge_filtered_segments() -> crate::Result<()> {
-let first_index = {
-let mut schema_builder = Schema::builder();
-let text_field = schema_builder.add_text_field("text", TEXT);
-let index = Index::create_in_ram(schema_builder.build());
-let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(text_field=>"some text 1"))?;
-index_writer.add_document(doc!(text_field=>"some text 2"))?;
-index_writer.commit()?;
-index
-};
-
-let second_index = {
-let mut schema_builder = Schema::builder();
-let text_field = schema_builder.add_text_field("text", TEXT);
-let index = Index::create_in_ram(schema_builder.build());
-let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(text_field=>"some text 3"))?;
-index_writer.add_document(doc!(text_field=>"some text 4"))?;
-index_writer.delete_term(Term::from_field_text(text_field, "4"));
-
-index_writer.commit()?;
-index
-};
-
-let mut segments: Vec<Segment> = Vec::new();
-segments.extend(first_index.searchable_segments()?);
-segments.extend(second_index.searchable_segments()?);
-
-let target_settings = first_index.settings().clone();
-
-let filter_segment_1 = AliveBitSet::for_test_from_deleted_docs(&[1], 2);
-let filter_segment_2 = AliveBitSet::for_test_from_deleted_docs(&[0], 2);
-
-let filter_segments = vec![Some(filter_segment_1), Some(filter_segment_2)];
-
-let merged_index = merge_filtered_segments(
-&segments,
-target_settings,
-filter_segments,
-RamDirectory::default(),
-)?;
-
-let segments = merged_index.searchable_segments()?;
-assert_eq!(segments.len(), 1);
-
-let segment_metas = segments[0].meta();
-assert_eq!(segment_metas.num_deleted_docs(), 0);
-assert_eq!(segment_metas.num_docs(), 1);
-
-Ok(())
-}
-
-#[test]
-fn test_merge_single_filtered_segments() -> crate::Result<()> {
-let first_index = {
-let mut schema_builder = Schema::builder();
-let text_field = schema_builder.add_text_field("text", TEXT);
-let index = Index::create_in_ram(schema_builder.build());
-let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(text_field=>"test text"))?;
-index_writer.add_document(doc!(text_field=>"some text 2"))?;
-
-index_writer.add_document(doc!(text_field=>"some text 3"))?;
-index_writer.add_document(doc!(text_field=>"some text 4"))?;
-
-index_writer.delete_term(Term::from_field_text(text_field, "4"));
-
-index_writer.commit()?;
-index
-};
-
-let mut segments: Vec<Segment> = Vec::new();
-segments.extend(first_index.searchable_segments()?);
-
-let target_settings = first_index.settings().clone();
-
-let filter_segment = AliveBitSet::for_test_from_deleted_docs(&[0], 4);
-
-let filter_segments = vec![Some(filter_segment)];
-
-let index = merge_filtered_segments(
-&segments,
-target_settings,
-filter_segments,
-RamDirectory::default(),
-)?;
-
-let segments = index.searchable_segments()?;
-assert_eq!(segments.len(), 1);
-
-let segment_metas = segments[0].meta();
-assert_eq!(segment_metas.num_deleted_docs(), 0);
-assert_eq!(segment_metas.num_docs(), 2);
-
-let searcher = index.reader()?.searcher();
-{
-let text_field = index.schema().get_field("text").unwrap();
-
-let do_search = |term: &str| {
-let query = QueryParser::for_index(&index, vec![text_field])
-.parse_query(term)
-.unwrap();
-let top_docs: Vec<(f32, DocAddress)> =
-searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
-
-top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
-};
-
-assert_eq!(do_search("test"), vec![] as Vec<u32>);
-assert_eq!(do_search("text"), vec![0, 1]);
-}
-
-Ok(())
-}
-
-#[test]
-fn test_apply_doc_id_filter_in_merger() -> crate::Result<()> {
-let first_index = {
-let mut schema_builder = Schema::builder();
-let text_field = schema_builder.add_text_field("text", TEXT);
-let index = Index::create_in_ram(schema_builder.build());
-let mut index_writer = index.writer_for_tests()?;
-index_writer.add_document(doc!(text_field=>"some text 1"))?;
-index_writer.add_document(doc!(text_field=>"some text 2"))?;
-
-index_writer.add_document(doc!(text_field=>"some text 3"))?;
-index_writer.add_document(doc!(text_field=>"some text 4"))?;
-
-index_writer.delete_term(Term::from_field_text(text_field, "4"));
-
-index_writer.commit()?;
-index
-};
-
-let mut segments: Vec<Segment> = Vec::new();
-segments.extend(first_index.searchable_segments()?);
-
-let target_settings = first_index.settings().clone();
-{
-let filter_segment = AliveBitSet::for_test_from_deleted_docs(&[1], 4);
-let filter_segments = vec![Some(filter_segment)];
-let target_schema = segments[0].schema();
-let merged_index = Index::create(
-RamDirectory::default(),
-target_schema.clone(),
-target_settings.clone(),
-)?;
-let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
-merged_index.schema(),
-merged_index.settings().clone(),
-&segments[..],
-filter_segments,
-)?;
-
-let doc_ids_alive: Vec<_> = merger.readers[0].doc_ids_alive().collect();
-assert_eq!(doc_ids_alive, vec![0, 2]);
-}
-
-{
-let filter_segments = vec![None];
-let target_schema = segments[0].schema();
-let merged_index = Index::create(
-RamDirectory::default(),
-target_schema.clone(),
-target_settings.clone(),
-)?;
-let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
-merged_index.schema(),
-merged_index.settings().clone(),
-&segments[..],
-filter_segments,
-)?;
-
-let doc_ids_alive: Vec<_> = merger.readers[0].doc_ids_alive().collect();
-assert_eq!(doc_ids_alive, vec![0, 1, 2]);
-}
-
-Ok(())
-}
 }
@@ -2,6 +2,7 @@ use super::{
doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping},
operation::AddOperation,
};
+ use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::compute_table_size;
@@ -17,7 +18,6 @@ use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp;
use crate::{core::Segment, store::StoreWriter};
- use crate::{fastfield::FastFieldsWriter, schema::Type};
use crate::{DocId, SegmentComponent};
/// Computes the initial size of the hash table.
@@ -173,11 +173,18 @@ impl SegmentWriter {
let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings);
match *field_entry.field_type() {
- FieldType::Facet(_) => {
+ FieldType::HierarchicalFacet(_) => {
- term_buffer.set_field(Type::Facet, field);
+ term_buffer.set_field(field);
- for field_value in field_values {
+ let facets =
- let facet = field_value.value().facet().ok_or_else(make_schema_error)?;
+ field_values
- let facet_str = facet.encoded_str();
+ .iter()
+ .flat_map(|field_value| match *field_value.value() {
+ Value::Facet(ref facet) => Some(facet.encoded_str()),
+ _ => {
+ panic!("Expected hierarchical facet");
+ }
+ });
+ for facet_str in facets {
let mut unordered_term_id_opt = None;
FacetTokenizer
.token_stream(facet_str)
@@ -234,11 +241,12 @@ impl SegmentWriter {
term_buffer,
)
};
self.fieldnorms_writer.record(doc_id, field, num_tokens);
}
FieldType::U64(_) => {
for field_value in field_values {
- term_buffer.set_field(Type::U64, field_value.field());
+ term_buffer.set_field(field_value.field());
let u64_val = field_value
.value()
.u64_value()
@@ -249,7 +257,7 @@ impl SegmentWriter {
}
FieldType::Date(_) => {
for field_value in field_values {
- term_buffer.set_field(Type::Date, field_value.field());
+ term_buffer.set_field(field_value.field());
let date_val = field_value
.value()
.date_value()
@@ -260,7 +268,7 @@ impl SegmentWriter {
}
FieldType::I64(_) => {
for field_value in field_values {
- term_buffer.set_field(Type::I64, field_value.field());
+ term_buffer.set_field(field_value.field());
let i64_val = field_value
.value()
.i64_value()
@@ -271,7 +279,7 @@ impl SegmentWriter {
}
FieldType::F64(_) => {
for field_value in field_values {
- term_buffer.set_field(Type::F64, field_value.field());
+ term_buffer.set_field(field_value.field());
let f64_val = field_value
.value()
.f64_value()
@@ -282,7 +290,7 @@ impl SegmentWriter {
}
FieldType::Bytes(_) => {
for field_value in field_values {
- term_buffer.set_field(Type::Bytes, field_value.field());
+ term_buffer.set_field(field_value.field());
let bytes = field_value
.value()
.bytes_value()
134 src/lib.rs
@@ -10,8 +10,8 @@
)]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
- #![allow(clippy::len_without_is_empty)]
- #![allow(clippy::return_self_not_must_use)]
+ #![feature(async_closure)]
//! # `tantivy`
//!
@@ -64,7 +64,7 @@
//! body => "He was an old man who fished alone in a skiff in \
//! the Gulf Stream and he had gone eighty-four days \
//! now without taking a fish."
- //! ))?;
+ //! ));
//!
//! // We need to call .commit() explicitly to force the
//! // index_writer to finish processing the documents in the queue,
@@ -105,7 +105,7 @@
//! A good place for you to get started is to check out
//! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
- //! [source code](https://github.com/quickwit-inc/tantivy/blob/main/examples/basic_search.rs))
+ //! [source code](https://github.com/tantivy-search/tantivy/blob/main/examples/basic_search.rs))
#[cfg_attr(test, macro_use)]
extern crate serde_json;
@@ -128,6 +128,8 @@ mod macros;
pub use crate::error::TantivyError;
pub use chrono;
+ pub const PKG_JS: &'static str = "./pkg/pool_exec.js"; // path to `wasm-bindgen`'s JS binding
/// Tantivy result.
///
/// Within tantivy, please avoid importing `Result` using `use crate::Result`
@@ -158,7 +160,7 @@ pub mod termdict;
mod reader;
- pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
+ pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy};
mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
@@ -166,20 +168,17 @@ mod docset;
pub use self::docset::{DocSet, TERMINATED};
pub use crate::core::{Executor, SegmentComponent};
pub use crate::core::{
- Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher,
+ Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, Order, Searcher, Segment,
- SearcherGeneration, Segment, SegmentId, SegmentMeta,
+ SegmentId, SegmentMeta,
};
pub use crate::core::{InvertedIndexReader, SegmentReader};
pub use crate::directory::Directory;
- pub use crate::indexer::demuxer::*;
+ pub use crate::indexer::merge_segments;
- pub use crate::indexer::merge_filtered_segments;
- pub use crate::indexer::merge_indices;
pub use crate::indexer::operation::UserOperation;
- pub use crate::indexer::{IndexWriter, PreparedCommit};
+ pub use crate::indexer::IndexWriter;
pub use crate::postings::Postings;
pub use crate::reader::LeasedItem;
pub use crate::schema::{Document, Term};
- pub use census::{Inventory, TrackedObject};
pub use common::HasLen;
pub use common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use std::fmt;
@@ -239,7 +238,6 @@ pub fn version_string() -> &'static str {
pub mod merge_policy {
pub use crate::indexer::DefaultMergePolicy;
pub use crate::indexer::LogMergePolicy;
- pub use crate::indexer::MergeCandidate;
pub use crate::indexer::MergePolicy;
pub use crate::indexer::NoMergePolicy;
}
@@ -382,22 +380,24 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
- let index = Index::create_from_tempdir(schema)?;
+ let index = Index::create_from_tempdir(schema).unwrap();
- // writing the segment
- let mut index_writer = index.writer_for_tests()?;
{
- let doc = doc!(text_field=>"af b");
+ // writing the segment
- index_writer.add_document(doc)?;
+ let mut index_writer = index.writer_for_tests()?;
+ {
+ let doc = doc!(text_field=>"af b");
+ index_writer.add_document(doc);
+ }
+ {
+ let doc = doc!(text_field=>"a b c");
+ index_writer.add_document(doc);
+ }
+ {
+ let doc = doc!(text_field=>"a b c d");
+ index_writer.add_document(doc);
+ }
+ assert!(index_writer.commit().is_ok());
}
- {
- let doc = doc!(text_field=>"a b c");
- index_writer.add_document(doc)?;
- }
- {
- let doc = doc!(text_field=>"a b c d");
- index_writer.add_document(doc)?;
- }
- index_writer.commit()?;
Ok(())
}
@@ -407,12 +407,12 @@ pub mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(text_field=>"a b c"))?;
+ index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?;
- index_writer.add_document(doc!(text_field=>"a"))?;
+ index_writer.add_document(doc!(text_field=>"a"));
- index_writer.add_document(doc!(text_field=>"a a"))?;
+ index_writer.add_document(doc!(text_field=>"a a"));
index_writer.commit()?;
- index_writer.add_document(doc!(text_field=>"c"))?;
+ index_writer.add_document(doc!(text_field=>"c"));
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -434,7 +434,7 @@ pub mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(text_field=>"a b c"))?;
+ index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit()?;
let index_reader = index.reader()?;
let searcher = index_reader.searcher();
@@ -456,9 +456,9 @@ pub mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(text_field=>"a b c"))?;
+ index_writer.add_document(doc!(text_field=>"a b c"));
- index_writer.add_document(doc!())?;
+ index_writer.add_document(doc!());
- index_writer.add_document(doc!(text_field=>"a b"))?;
+ index_writer.add_document(doc!(text_field=>"a b"));
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -500,20 +500,20 @@ pub mod tests {
// writing the segment
let mut index_writer = index.writer_for_tests()?;
// 0
- index_writer.add_document(doc!(text_field=>"a b"))?;
+ index_writer.add_document(doc!(text_field=>"a b"));
// 1
- index_writer.add_document(doc!(text_field=>" a c"))?;
+ index_writer.add_document(doc!(text_field=>" a c"));
// 2
- index_writer.add_document(doc!(text_field=>" b c"))?;
+ index_writer.add_document(doc!(text_field=>" b c"));
// 3
- index_writer.add_document(doc!(text_field=>" b d"))?;
+ index_writer.add_document(doc!(text_field=>" b d"));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
// 4
- index_writer.add_document(doc!(text_field=>" b c"))?;
+ index_writer.add_document(doc!(text_field=>" b c"));
// 5
- index_writer.add_document(doc!(text_field=>" a"))?;
+ index_writer.add_document(doc!(text_field=>" a"));
index_writer.commit()?;
}
{
@@ -547,7 +547,7 @@ pub mod tests {
// writing the segment
let mut index_writer = index.writer_for_tests()?;
// 0
- index_writer.add_document(doc!(text_field=>"a b"))?;
+ index_writer.add_document(doc!(text_field=>"a b"));
// 1
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?;
@@ -583,7 +583,7 @@ pub mod tests {
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(text_field=>"a b"))?;
+ index_writer.add_document(doc!(text_field=>"a b"));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback()?;
index_writer.delete_term(Term::from_field_text(text_field, "a"));
@@ -633,7 +633,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(field=>1u64))?;
+ index_writer.add_document(doc!(field=>1u64));
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -657,7 +657,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let negative_val = -1i64;
- index_writer.add_document(doc!(value_field => negative_val))?;
+ index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -681,7 +681,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let val = std::f64::consts::PI;
- index_writer.add_document(doc!(value_field => val))?;
+ index_writer.add_document(doc!(value_field => val));
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -704,7 +704,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(text_field=>"a"))?;
+ index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok());
let reader = index.reader()?;
let searcher = reader.searcher();
@@ -727,14 +727,14 @@ pub mod tests {
// writing the segment
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(text_field=>"63"))?;
+ index_writer.add_document(doc!(text_field=>"63"));
- index_writer.add_document(doc!(text_field=>"70"))?;
+ index_writer.add_document(doc!(text_field=>"70"));
- index_writer.add_document(doc!(text_field=>"34"))?;
+ index_writer.add_document(doc!(text_field=>"34"));
- index_writer.add_document(doc!(text_field=>"1"))?;
+ index_writer.add_document(doc!(text_field=>"1"));
- index_writer.add_document(doc!(text_field=>"38"))?;
+ index_writer.add_document(doc!(text_field=>"38"));
- index_writer.add_document(doc!(text_field=>"33"))?;
+ index_writer.add_document(doc!(text_field=>"33"));
- index_writer.add_document(doc!(text_field=>"40"))?;
+ index_writer.add_document(doc!(text_field=>"40"));
- index_writer.add_document(doc!(text_field=>"17"))?;
+ index_writer.add_document(doc!(text_field=>"17"));
index_writer.delete_term(Term::from_field_text(text_field, "38"));
index_writer.delete_term(Term::from_field_text(text_field, "34"));
index_writer.commit()?;
@@ -752,7 +752,7 @@ pub mod tests {
{
// writing the segment
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(text_field=>"af af af bc bc"))?;
+ index_writer.add_document(doc!(text_field=>"af af af bc bc"));
index_writer.commit()?;
}
{
@@ -784,9 +784,9 @@ pub mod tests {
let reader = index.reader()?;
// writing the segment
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(text_field=>"af af af b"))?;
+ index_writer.add_document(doc!(text_field=>"af af af b"));
- index_writer.add_document(doc!(text_field=>"a b c"))?;
+ index_writer.add_document(doc!(text_field=>"a b c"));
- index_writer.add_document(doc!(text_field=>"a b c d"))?;
+ index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit()?;
reader.reload()?;
@@ -848,9 +848,9 @@ pub mod tests {
assert_eq!(reader.searcher().num_docs(), 0u64);
// writing the segment
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(text_field=>"af b"))?;
+ index_writer.add_document(doc!(text_field=>"af b"));
- index_writer.add_document(doc!(text_field=>"a b c"))?;
+ index_writer.add_document(doc!(text_field=>"a b c"));
- index_writer.add_document(doc!(text_field=>"a b c d"))?;
+ index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit()?;
reader.reload()?;
assert_eq!(reader.searcher().num_docs(), 3u64);
@@ -890,7 +890,7 @@ pub mod tests {
{
let document =
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
- index_writer.add_document(document)?;
+ index_writer.add_document(document);
index_writer.commit()?;
}
let reader = index.reader()?;
@@ -957,7 +957,7 @@ pub mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for doc_id in 0u64..DOC_COUNT {
- index_writer.add_document(doc!(id => doc_id))?;
+ index_writer.add_document(doc!(id => doc_id));
}
index_writer.commit()?;
@@ -974,7 +974,7 @@ pub mod tests {
index_writer.delete_term(Term::from_field_u64(id, doc_id));
index_writer.commit()?;
index_reader.reload()?;
- index_writer.add_document(doc!(id => doc_id))?;
+ index_writer.add_document(doc!(id => doc_id));
index_writer.commit()?;
index_reader.reload()?;
let searcher = index_reader.searcher();
@@ -1009,8 +1009,8 @@ pub mod tests {
let index = Index::create_in_dir(&index_path, schema)?;
let mut writer = index.writer(50_000_000)?;
for _ in 0..5000 {
- writer.add_document(doc!(body => "foo"))?;
+ writer.add_document(doc!(body => "foo"));
- writer.add_document(doc!(body => "boo"))?;
+ writer.add_document(doc!(body => "boo"));
}
writer.commit()?;
assert!(index.validate_checksum()?.is_empty());
@@ -1,5 +1,14 @@
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
+ unsafe fn binary_search_step(ptr: *const u32, target: u32, half_size: isize) -> *const u32 {
+ let mid = ptr.offset(half_size);
+ if *mid < target {
+ mid.offset(1)
+ } else {
+ ptr
+ }
+ }
/// Search the first index containing an element greater or equal to
/// the target.
///
@@ -21,16 +30,18 @@ use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
/// end of the last block for instance.
/// - The target is assumed smaller or equal to the last element of the block.
pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
- let mut start = 0;
+ let start_ptr: *const u32 = &arr[0] as *const u32;
- let mut len = arr.len();
+ unsafe {
- for _ in 0..7 {
+ let mut ptr = start_ptr;
- len /= 2;
+ ptr = binary_search_step(ptr, target, 63);
- let pivot = unsafe { *arr.get_unchecked(start + len - 1) };
+ ptr = binary_search_step(ptr, target, 31);
- if pivot < target {
+ ptr = binary_search_step(ptr, target, 15);
- start += len;
+ ptr = binary_search_step(ptr, target, 7);
- }
+ ptr = binary_search_step(ptr, target, 3);
+ ptr = binary_search_step(ptr, target, 1);
+ let extra = if *ptr < target { 1 } else { 0 };
+ (ptr.offset_from(start_ptr) as usize) + extra
}
- start
}
#[cfg(test)]
@@ -393,8 +393,8 @@ mod tests {
}
#[test]
- fn test_block_segment_postings() -> crate::Result<()> {
+ fn test_block_segment_postings() {
- let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>())?;
+ let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the `doc_freq` is correct
assert_eq!(block_segments.doc_freq(), 100_000);
@@ -409,17 +409,16 @@ mod tests {
offset += block.len() as u32;
block_segments.advance();
}
- Ok(())
}
#[test]
- fn test_skip_right_at_new_block() -> crate::Result<()> {
+ fn test_skip_right_at_new_block() {
let mut doc_ids = (0..128).collect::<Vec<u32>>();
// 128 is missing
doc_ids.push(129);
doc_ids.push(130);
{
- let block_segments = build_block_postings(&doc_ids)?;
+ let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(128), 129);
assert_eq!(docset.doc(), 129);
@@ -428,7 +427,7 @@ mod tests {
assert_eq!(docset.advance(), TERMINATED);
}
{
- let block_segments = build_block_postings(&doc_ids).unwrap();
+ let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.seek(129), 129);
assert_eq!(docset.doc(), 129);
@@ -437,47 +436,46 @@ mod tests {
assert_eq!(docset.advance(), TERMINATED);
}
{
- let block_segments = build_block_postings(&doc_ids)?;
+ let block_segments = build_block_postings(&doc_ids);
let mut docset = SegmentPostings::from_block_postings(block_segments, None);
assert_eq!(docset.doc(), 0);
assert_eq!(docset.seek(131), TERMINATED);
assert_eq!(docset.doc(), TERMINATED);
}
- Ok(())
}
- fn build_block_postings(docs: &[DocId]) -> crate::Result<BlockSegmentPostings> {
+ fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
- let mut index_writer = index.writer_for_tests()?;
+ let mut index_writer = index.writer_for_tests().unwrap();
let mut last_doc = 0u32;
for &doc in docs {
for _ in last_doc..doc {
- index_writer.add_document(doc!(int_field=>1u64))?;
+ index_writer.add_document(doc!(int_field=>1u64));
}
- index_writer.add_document(doc!(int_field=>0u64))?;
+ index_writer.add_document(doc!(int_field=>0u64));
last_doc = doc + 1;
}
- index_writer.commit()?;
+ index_writer.commit().unwrap();
- let searcher = index.reader()?.searcher();
+ let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field).unwrap();
let term = Term::from_field_u64(int_field, 0u64);
- let term_info = inverted_index.get_term_info(&term)?.unwrap();
+ let term_info = inverted_index.get_term_info(&term).unwrap().unwrap();
- let block_postings = inverted_index
+ inverted_index
- .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)?;
+ .read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
- Ok(block_postings)
+ .unwrap()
}
#[test]
- fn test_block_segment_postings_seek() -> crate::Result<()> {
+ fn test_block_segment_postings_seek() {
let mut docs = vec![0];
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
- let mut block_postings = build_block_postings(&docs[..])?;
+ let mut block_postings = build_block_postings(&docs[..]);
for i in &[0, 424, 10000] {
block_postings.seek(*i);
let docs = block_postings.docs();
@@ -486,7 +484,6 @@ mod tests {
}
block_postings.seek(100_000);
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
- Ok(())
}
#[test]
@@ -500,7 +497,7 @@ mod tests {
// the other containing odd numbers.
for i in 0..6 {
let doc = doc!(int_field=> (i % 2) as u64);
- index_writer.add_document(doc)?;
+ index_writer.add_document(doc);
}
index_writer.commit()?;
let searcher = index.reader()?.searcher();
@@ -47,6 +47,7 @@ pub mod tests {
use crate::fieldnorm::FieldNormReader;
use crate::indexer::operation::AddOperation;
use crate::indexer::SegmentWriter;
+ use crate::merge_policy::NoMergePolicy;
use crate::query::Scorer;
use crate::schema::{Field, TextOptions};
use crate::schema::{IndexRecordOption, TextFieldIndexing};
@@ -86,12 +87,12 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
- index_writer.add_document(doc!(title => r#"abc abc abc"#))?;
+ index_writer.add_document(doc!(title => r#"abc abc abc"#));
- index_writer.add_document(doc!(title => r#"abc be be be be abc"#))?;
+ index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
for _ in 0..1_000 {
- index_writer.add_document(doc!(title => r#"abc abc abc"#))?;
+ index_writer.add_document(doc!(title => r#"abc abc abc"#));
}
- index_writer.add_document(doc!(title => r#"abc be be be be abc"#))?;
+ index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
@@ -152,68 +153,50 @@ pub mod tests {
Ok(())
}
- #[test]
- pub fn test_index_max_length_token() -> crate::Result<()> {
- let mut schema_builder = Schema::builder();
- let text_options = TextOptions::default().set_indexing_options(
- TextFieldIndexing::default()
- .set_index_option(IndexRecordOption::WithFreqsAndPositions)
- .set_tokenizer("simple_no_truncation"),
- );
- let text_field = schema_builder.add_text_field("text", text_options);
- let schema = schema_builder.build();
- let index = Index::create_in_ram(schema);
- index
- .tokenizers()
- .register("simple_no_truncation", SimpleTokenizer);
- let reader = index.reader()?;
- let mut index_writer = index.writer_for_tests()?;
- let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
- index_writer.add_document(doc!(text_field=>ok_token_text.clone()))?;
- index_writer.commit()?;
- reader.reload()?;
- let searcher = reader.searcher();
- let segment_reader = searcher.segment_reader(0u32);
- let inverted_index = segment_reader.inverted_index(text_field)?;
- assert_eq!(inverted_index.terms().num_terms(), 1);
- let mut bytes = vec![];
- assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
- assert_eq!(&bytes[..], ok_token_text.as_bytes());
- Ok(())
- }
#[test]
pub fn test_drop_token_that_are_too_long() -> crate::Result<()> {
- let mut schema_builder = Schema::builder();
+ let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN);
- let text_options = TextOptions::default().set_indexing_options(
- TextFieldIndexing::default()
- .set_index_option(IndexRecordOption::WithFreqsAndPositions)
- .set_tokenizer("simple_no_truncation"),
- );
- let text_field = schema_builder.add_text_field("text", text_options);
- let schema = schema_builder.build();
- let index = Index::create_in_ram(schema);
- index
- .tokenizers()
- .register("simple_no_truncation", SimpleTokenizer);
- let reader = index.reader()?;
- let mut index_writer = index.writer_for_tests()?;
let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1);
exceeding_token_text.push_str(" hello");
- index_writer.add_document(doc!(text_field=>exceeding_token_text))?;
+ let mut schema_builder = Schema::builder();
- index_writer.commit()?;
+ let text_options = TextOptions::default().set_indexing_options(
- reader.reload()?;
+ TextFieldIndexing::default()
- let searcher = reader.searcher();
+ .set_index_option(IndexRecordOption::WithFreqsAndPositions)
- let segment_reader = searcher.segment_reader(0u32);
+ .set_tokenizer("simple_no_truncation"),
- let inverted_index = segment_reader.inverted_index(text_field)?;
+ );
- assert_eq!(inverted_index.terms().num_terms(), 1);
+ let text_field = schema_builder.add_text_field("text", text_options);
- let mut bytes = vec![];
+ let schema = schema_builder.build();
- assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
+ let index = Index::create_in_ram(schema);
- assert_eq!(&bytes, b"hello");
+ index
+ .tokenizers()
+ .register("simple_no_truncation", SimpleTokenizer);
+ let reader = index.reader().unwrap();
+ let mut index_writer = index.writer_for_tests().unwrap();
+ index_writer.set_merge_policy(Box::new(NoMergePolicy));
+ {
+ index_writer.add_document(doc!(text_field=>exceeding_token_text));
+ index_writer.commit().unwrap();
+ reader.reload().unwrap();
+ let searcher = reader.searcher();
+ let segment_reader = searcher.segment_reader(0u32);
+ let inverted_index = segment_reader.inverted_index(text_field)?;
+ assert_eq!(inverted_index.terms().num_terms(), 1);
+ let mut bytes = vec![];
+ assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
+ assert_eq!(&bytes, b"hello");
+ }
+ {
+ index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
+ index_writer.commit().unwrap();
+ reader.reload().unwrap();
+ let searcher = reader.searcher();
+ let segment_reader = searcher.segment_reader(1u32);
+ let inverted_index = segment_reader.inverted_index(text_field)?;
+ assert_eq!(inverted_index.terms().num_terms(), 1);
+ let mut bytes = vec![];
+ assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
+ assert_eq!(&bytes[..], ok_token_text.as_bytes());
+ }
Ok(())
}
@@ -332,13 +315,13 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
- let mut index_writer = index.writer_for_tests()?;
+ let mut index_writer = index.writer_for_tests().unwrap();
- index_writer.add_document(doc!(text_field => "g b b d c g c"))?;
+ index_writer.add_document(doc!(text_field => "g b b d c g c"));
- index_writer.add_document(doc!(text_field => "g a b b a d c g c"))?;
+ index_writer.add_document(doc!(text_field => "g a b b a d c g c"));
- index_writer.commit()?;
+ assert!(index_writer.commit().is_ok());
}
let term_a = Term::from_field_text(text_field, "a");
- let searcher = index.reader()?.searcher();
+ let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let mut postings = segment_reader
.inverted_index(text_field)?
@@ -367,7 +350,7 @@ pub mod tests {
let mut index_writer = index.writer_for_tests()?;
for i in 0u64..num_docs as u64 {
let doc = doc!(value_field => 2u64, value_field => i % 2u64);
- index_writer.add_document(doc)?;
+ index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
}
@@ -617,7 +600,7 @@ mod bench {
doc.add_text(text_field, "c");
}
doc.add_text(text_field, "d");
- index_writer.add_document(doc).unwrap();
+ index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
}
@@ -5,8 +5,8 @@ use crate::postings::recorder::{
};
use crate::postings::UnorderedTermId;
use crate::postings::{FieldSerializer, InvertedIndexSerializer};
+ use crate::schema::IndexRecordOption;
use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
- use crate::schema::{IndexRecordOption, Type};
use crate::termdict::TermOrdinal;
use crate::tokenizer::TokenStream;
use crate::tokenizer::{Token, MAX_TOKEN_LEN};
@@ -33,13 +33,15 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
SpecializedPostingsWriter::<TfAndPositionRecorder>::new_boxed()
}
})
- .unwrap_or_else(SpecializedPostingsWriter::<NothingRecorder>::new_boxed),
+ .unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_)
| FieldType::I64(_)
| FieldType::F64(_)
| FieldType::Date(_)
| FieldType::Bytes(_)
- | FieldType::Facet(_) => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
+ | FieldType::HierarchicalFacet(_) => {
+ SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
+ }
}
}
@@ -51,11 +53,11 @@ pub struct MultiFieldPostingsWriter {
}
fn make_field_partition(
- term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
+ term_offsets: &[(&[u8], Addr, UnorderedTermId)],
) -> Vec<(Field, Range<usize>)> {
let term_offsets_it = term_offsets
.iter()
- .map(|(term, _, _)| term.field())
+ .map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut prev_field_opt = None;
let mut fields = vec![];
@@ -130,10 +132,10 @@ impl MultiFieldPostingsWriter {
fieldnorm_readers: FieldNormReaders,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
- let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
+ let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
Vec::with_capacity(self.term_index.len());
term_offsets.extend(self.term_index.iter());
- term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
+ term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
@@ -144,7 +146,7 @@ impl MultiFieldPostingsWriter {
let field_entry = self.schema.get_field_entry(field);
match *field_entry.field_type() {
- FieldType::Str(_) | FieldType::Facet(_) => {
+ FieldType::Str(_) | FieldType::HierarchicalFacet(_) => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let unordered_term_ids = term_offsets[byte_offsets.clone()]
@@ -208,7 +210,7 @@ pub trait PostingsWriter {
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
- term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
+ term_addrs: &[(&[u8], Addr, UnorderedTermId)],
serializer: &mut FieldSerializer<'_>,
term_heap: &MemoryArena,
heap: &MemoryArena,
@@ -225,7 +227,7 @@ pub trait PostingsWriter {
heap: &mut MemoryArena,
term_buffer: &mut Term,
) -> u32 {
- term_buffer.set_field(Type::Str, field);
+ term_buffer.set_field(field);
let mut sink = |token: &Token| {
// We skip all tokens with a len greater than u16.
if token.text.len() <= MAX_TOKEN_LEN {
@@ -279,7 +281,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
) -> UnorderedTermId {
debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1;
- term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option<Rec>| {
+ term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
if let Some(mut recorder) = opt_recorder {
let current_doc = recorder.current_doc();
if current_doc != doc {
@@ -299,17 +301,17 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
fn serialize(
&self,
- term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
+ term_addrs: &[(&[u8], Addr, UnorderedTermId)],
serializer: &mut FieldSerializer<'_>,
termdict_heap: &MemoryArena,
heap: &MemoryArena,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
- for (term, addr, _) in term_addrs {
+ for &(term_bytes, addr, _) in term_addrs {
- let recorder: Rec = termdict_heap.read(*addr);
+ let recorder: Rec = termdict_heap.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
- serializer.new_term(term.value_bytes(), term_doc_freq)?;
+ serializer.new_term(&term_bytes[4..], term_doc_freq)?;
recorder.serialize(&mut buffer_lender, serializer, heap, doc_id_map);
serializer.close_term()?;
}
@@ -13,7 +13,6 @@ use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::{DocId, Score};
use common::CountingWriter;
use common::{BinarySerializable, VInt};
- use fail::fail_point;
use std::cmp::Ordering;
use std::io::{self, Write};
@@ -213,9 +212,6 @@ impl<'a> FieldSerializer<'a> {
/// If the current block is incomplete, it need to be encoded
/// using `VInt` encoding.
pub fn close_term(&mut self) -> io::Result<()> {
- fail_point!("FieldSerializer::close_term", |msg: Option<String>| {
- Err(io::Error::new(io::ErrorKind::Other, format!("{:?}", msg)))
- });
if self.term_open {
self.postings_serializer
.close_term(self.current_term_info.doc_freq)?;
@@ -308,8 +304,10 @@ pub struct PostingsSerializer<W: Write> {
fieldnorm_reader: Option<FieldNormReader>,
bm25_weight: Option<Bm25Weight>,
+ num_docs: u32, // Number of docs in the segment
avg_fieldnorm: Score, // Average number of term in the field for that segment.
// this value is used to compute the block wand information.
}
impl<W: Write> PostingsSerializer<W> {
@@ -319,6 +317,10 @@ impl<W: Write> PostingsSerializer<W> {
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> PostingsSerializer<W> {
+ let num_docs = fieldnorm_reader
+ .as_ref()
+ .map(|fieldnorm_reader| fieldnorm_reader.num_docs())
+ .unwrap_or(0u32);
PostingsSerializer {
output_write: CountingWriter::wrap(write),
@@ -333,33 +335,21 @@ impl<W: Write> PostingsSerializer<W> {
fieldnorm_reader,
bm25_weight: None,
+ num_docs,
avg_fieldnorm,
}
}
pub fn new_term(&mut self, term_doc_freq: u32) {
- self.bm25_weight = None;
+ if self.mode.has_freq() && self.num_docs > 0 {
+ let bm25_weight = Bm25Weight::for_one_term(
- if !self.mode.has_freq() {
+ term_doc_freq as u64,
- return;
+ self.num_docs as u64,
+ self.avg_fieldnorm,
+ );
+ self.bm25_weight = Some(bm25_weight);
}
- let num_docs_in_segment: u64 =
- if let Some(fieldnorm_reader) = self.fieldnorm_reader.as_ref() {
- fieldnorm_reader.num_docs() as u64
- } else {
- return;
- };
- if num_docs_in_segment == 0 {
- return;
- }
- self.bm25_weight = Some(Bm25Weight::for_one_term(
- term_doc_freq as u64,
- num_docs_in_segment,
- self.avg_fieldnorm,
- ));
}
fn write_block(&mut self) {
@@ -186,6 +186,7 @@ mod tests {
|
|||||||
use super::*;
|
use super::*;
|
||||||
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
||||||
|
|
||||||
|
#[test]
|
||||||
#[test]
|
#[test]
|
||||||
fn test_stack() {
|
fn test_stack() {
|
||||||
let mut heap = MemoryArena::new();
|
let mut heap = MemoryArena::new();
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use murmurhash32::murmurhash2;
|
|||||||
use super::{Addr, MemoryArena};
|
use super::{Addr, MemoryArena};
|
||||||
use crate::postings::stacker::memory_arena::store;
|
use crate::postings::stacker::memory_arena::store;
|
||||||
use crate::postings::UnorderedTermId;
|
use crate::postings::UnorderedTermId;
|
||||||
use crate::Term;
|
|
||||||
use byteorder::{ByteOrder, NativeEndian};
|
use byteorder::{ByteOrder, NativeEndian};
|
||||||
use std::iter;
|
use std::iter;
|
||||||
use std::mem;
|
use std::mem;
|
||||||
@@ -82,13 +81,13 @@ pub struct Iter<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterator for Iter<'a> {
|
impl<'a> Iterator for Iter<'a> {
|
||||||
type Item = (Term<&'a [u8]>, Addr, UnorderedTermId);
|
type Item = (&'a [u8], Addr, UnorderedTermId);
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
self.inner.next().cloned().map(move |bucket: usize| {
|
self.inner.next().cloned().map(move |bucket: usize| {
|
||||||
let kv = self.hashmap.table[bucket];
|
let kv = self.hashmap.table[bucket];
|
||||||
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
|
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||||
(Term::wrap(key), offset, kv.unordered_term_id)
|
(key, offset, kv.unordered_term_id)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -190,19 +189,21 @@ impl TermHashMap {
|
|||||||
/// will be in charge of returning a default value.
|
/// will be in charge of returning a default value.
|
||||||
/// If the key already as an associated value, then it will be passed
|
/// If the key already as an associated value, then it will be passed
|
||||||
/// `Some(previous_value)`.
|
/// `Some(previous_value)`.
|
||||||
pub fn mutate_or_create<V, TMutator>(
|
pub fn mutate_or_create<S, V, TMutator>(
|
||||||
&mut self,
|
&mut self,
|
||||||
key: &[u8],
|
key: S,
|
||||||
mut updater: TMutator,
|
mut updater: TMutator,
|
||||||
) -> UnorderedTermId
|
) -> UnorderedTermId
|
||||||
where
|
where
|
||||||
|
S: AsRef<[u8]>,
|
||||||
V: Copy + 'static,
|
V: Copy + 'static,
|
||||||
TMutator: FnMut(Option<V>) -> V,
|
TMutator: FnMut(Option<V>) -> V,
|
||||||
{
|
{
|
||||||
if self.is_saturated() {
|
if self.is_saturated() {
|
||||||
self.resize();
|
self.resize();
|
||||||
}
|
}
|
||||||
let hash = murmurhash2(key);
|
let key_bytes: &[u8] = key.as_ref();
|
||||||
|
let hash = murmurhash2(key.as_ref());
|
||||||
let mut probe = self.probe(hash);
|
let mut probe = self.probe(hash);
|
||||||
loop {
|
loop {
|
||||||
let bucket = probe.next_probe();
|
let bucket = probe.next_probe();
|
||||||
@@ -210,18 +211,21 @@ impl TermHashMap {
|
|||||||
if kv.is_empty() {
|
if kv.is_empty() {
|
||||||
// The key does not exists yet.
|
// The key does not exists yet.
|
||||||
let val = updater(None);
|
let val = updater(None);
|
||||||
let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>();
|
let num_bytes =
|
||||||
|
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
|
||||||
let key_addr = self.heap.allocate_space(num_bytes);
|
let key_addr = self.heap.allocate_space(num_bytes);
|
||||||
{
|
{
|
||||||
let data = self.heap.slice_mut(key_addr, num_bytes);
|
let data = self.heap.slice_mut(key_addr, num_bytes);
|
||||||
NativeEndian::write_u16(data, key.len() as u16);
|
NativeEndian::write_u16(data, key_bytes.len() as u16);
|
||||||
let stop = 2 + key.len();
|
let stop = 2 + key_bytes.len();
|
||||||
data[2..stop].copy_from_slice(key);
|
data[2..stop].copy_from_slice(key_bytes);
|
||||||
store(&mut data[stop..], val);
|
store(&mut data[stop..], val);
|
||||||
}
|
}
|
||||||
return self.set_bucket(hash, key_addr, bucket);
|
return self.set_bucket(hash, key_addr, bucket);
|
||||||
} else if kv.hash == hash {
|
} else if kv.hash == hash {
|
||||||
if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) {
|
if let Some(val_addr) =
|
||||||
|
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
|
||||||
|
{
|
||||||
let v = self.heap.read(val_addr);
|
let v = self.heap.read(val_addr);
|
||||||
let new_v = updater(Some(v));
|
let new_v = updater(Some(v));
|
||||||
self.heap.write_at(val_addr, new_v);
|
self.heap.write_at(val_addr, new_v);
|
||||||
@@ -241,18 +245,25 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_hash_map() {
|
fn test_hash_map() {
|
||||||
let mut hash_map: TermHashMap = TermHashMap::new(18);
|
let mut hash_map: TermHashMap = TermHashMap::new(18);
|
||||||
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
|
{
|
||||||
assert_eq!(opt_val, None);
|
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
|
||||||
3u32
|
assert_eq!(opt_val, None);
|
||||||
});
|
3u32
|
||||||
hash_map.mutate_or_create(b"abcd", |opt_val: Option<u32>| {
|
});
|
||||||
assert_eq!(opt_val, None);
|
}
|
||||||
4u32
|
{
|
||||||
});
|
hash_map.mutate_or_create("abcd", |opt_val: Option<u32>| {
|
||||||
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
|
assert_eq!(opt_val, None);
|
||||||
assert_eq!(opt_val, Some(3u32));
|
4u32
|
||||||
5u32
|
});
|
||||||
});
|
}
|
||||||
|
{
|
||||||
|
hash_map.mutate_or_create("abc", |opt_val: Option<u32>| {
|
||||||
|
assert_eq!(opt_val, Some(3u32));
|
||||||
|
5u32
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
let mut vanilla_hash_map = HashMap::new();
|
let mut vanilla_hash_map = HashMap::new();
|
||||||
let iter_values = hash_map.iter();
|
let iter_values = hash_map.iter();
|
||||||
for (key, addr, _) in iter_values {
|
for (key, addr, _) in iter_values {
|
||||||
|
|||||||
@@ -78,29 +78,29 @@ mod tests {
|
|||||||
use crate::schema::{Schema, TEXT};
|
use crate::schema::{Schema, TEXT};
|
||||||
use crate::Index;
|
use crate::Index;
|
||||||
|
|
||||||
fn create_test_index() -> crate::Result<Index> {
|
fn create_test_index() -> Index {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let field = schema_builder.add_text_field("text", TEXT);
|
let field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
index_writer.add_document(doc!(field=>"aaa"))?;
|
index_writer.add_document(doc!(field=>"aaa"));
|
||||||
index_writer.add_document(doc!(field=>"bbb"))?;
|
index_writer.add_document(doc!(field=>"bbb"));
|
||||||
index_writer.commit()?;
|
index_writer.commit().unwrap();
|
||||||
index_writer.add_document(doc!(field=>"ccc"))?;
|
index_writer.add_document(doc!(field=>"ccc"));
|
||||||
index_writer.commit()?;
|
index_writer.commit().unwrap();
|
||||||
Ok(index)
|
index
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_all_query() -> crate::Result<()> {
|
fn test_all_query() {
|
||||||
let index = create_test_index()?;
|
let index = create_test_index();
|
||||||
let reader = index.reader()?;
|
let reader = index.reader().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
let weight = AllQuery.weight(&searcher, false)?;
|
let weight = AllQuery.weight(&searcher, false).unwrap();
|
||||||
{
|
{
|
||||||
let reader = searcher.segment_reader(0);
|
let reader = searcher.segment_reader(0);
|
||||||
let mut scorer = weight.scorer(reader, 1.0)?;
|
let mut scorer = weight.scorer(reader, 1.0).unwrap();
|
||||||
assert_eq!(scorer.doc(), 0u32);
|
assert_eq!(scorer.doc(), 0u32);
|
||||||
assert_eq!(scorer.advance(), 1u32);
|
assert_eq!(scorer.advance(), 1u32);
|
||||||
assert_eq!(scorer.doc(), 1u32);
|
assert_eq!(scorer.doc(), 1u32);
|
||||||
@@ -108,30 +108,28 @@ mod tests {
|
|||||||
}
|
}
|
||||||
{
|
{
|
||||||
let reader = searcher.segment_reader(1);
|
let reader = searcher.segment_reader(1);
|
||||||
let mut scorer = weight.scorer(reader, 1.0)?;
|
let mut scorer = weight.scorer(reader, 1.0).unwrap();
|
||||||
assert_eq!(scorer.doc(), 0u32);
|
assert_eq!(scorer.doc(), 0u32);
|
||||||
assert_eq!(scorer.advance(), TERMINATED);
|
assert_eq!(scorer.advance(), TERMINATED);
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_all_query_with_boost() -> crate::Result<()> {
|
fn test_all_query_with_boost() {
|
||||||
let index = create_test_index()?;
|
let index = create_test_index();
|
||||||
let reader = index.reader()?;
|
let reader = index.reader().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
let weight = AllQuery.weight(&searcher, false)?;
|
let weight = AllQuery.weight(&searcher, false).unwrap();
|
||||||
let reader = searcher.segment_reader(0);
|
let reader = searcher.segment_reader(0);
|
||||||
{
|
{
|
||||||
let mut scorer = weight.scorer(reader, 2.0)?;
|
let mut scorer = weight.scorer(reader, 2.0).unwrap();
|
||||||
assert_eq!(scorer.doc(), 0u32);
|
assert_eq!(scorer.doc(), 0u32);
|
||||||
assert_eq!(scorer.score(), 2.0);
|
assert_eq!(scorer.score(), 2.0);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let mut scorer = weight.scorer(reader, 1.5)?;
|
let mut scorer = weight.scorer(reader, 1.5).unwrap();
|
||||||
assert_eq!(scorer.doc(), 0u32);
|
assert_eq!(scorer.doc(), 0u32);
|
||||||
assert_eq!(scorer.score(), 1.5);
|
assert_eq!(scorer.score(), 1.5);
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -92,16 +92,16 @@ mod tests {
|
|||||||
use crate::Index;
|
use crate::Index;
|
||||||
use tantivy_fst::Automaton;
|
use tantivy_fst::Automaton;
|
||||||
|
|
||||||
fn create_index() -> crate::Result<Index> {
|
fn create_index() -> Index {
|
||||||
let mut schema = Schema::builder();
|
let mut schema = Schema::builder();
|
||||||
let title = schema.add_text_field("title", STRING);
|
let title = schema.add_text_field("title", STRING);
|
||||||
let index = Index::create_in_ram(schema.build());
|
let index = Index::create_in_ram(schema.build());
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
index_writer.add_document(doc!(title=>"abc"))?;
|
index_writer.add_document(doc!(title=>"abc"));
|
||||||
index_writer.add_document(doc!(title=>"bcd"))?;
|
index_writer.add_document(doc!(title=>"bcd"));
|
||||||
index_writer.add_document(doc!(title=>"abcd"))?;
|
index_writer.add_document(doc!(title=>"abcd"));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
Ok(index)
|
index
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
@@ -140,32 +140,34 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_automaton_weight() -> crate::Result<()> {
|
fn test_automaton_weight() {
|
||||||
let index = create_index()?;
|
let index = create_index();
|
||||||
let field = index.schema().get_field("title").unwrap();
|
let field = index.schema().get_field("title").unwrap();
|
||||||
let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
|
let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
|
||||||
let reader = index.reader()?;
|
let reader = index.reader().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
let mut scorer = automaton_weight.scorer(searcher.segment_reader(0u32), 1.0)?;
|
let mut scorer = automaton_weight
|
||||||
|
.scorer(searcher.segment_reader(0u32), 1.0)
|
||||||
|
.unwrap();
|
||||||
assert_eq!(scorer.doc(), 0u32);
|
assert_eq!(scorer.doc(), 0u32);
|
||||||
assert_eq!(scorer.score(), 1.0);
|
assert_eq!(scorer.score(), 1.0);
|
||||||
assert_eq!(scorer.advance(), 2u32);
|
assert_eq!(scorer.advance(), 2u32);
|
||||||
assert_eq!(scorer.doc(), 2u32);
|
assert_eq!(scorer.doc(), 2u32);
|
||||||
assert_eq!(scorer.score(), 1.0);
|
assert_eq!(scorer.score(), 1.0);
|
||||||
assert_eq!(scorer.advance(), TERMINATED);
|
assert_eq!(scorer.advance(), TERMINATED);
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_automaton_weight_boost() -> crate::Result<()> {
|
fn test_automaton_weight_boost() {
|
||||||
let index = create_index()?;
|
let index = create_index();
|
||||||
let field = index.schema().get_field("title").unwrap();
|
let field = index.schema().get_field("title").unwrap();
|
||||||
let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
|
let automaton_weight = AutomatonWeight::new(field, PrefixedByA);
|
||||||
let reader = index.reader()?;
|
let reader = index.reader().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
let mut scorer = automaton_weight.scorer(searcher.segment_reader(0u32), 1.32)?;
|
let mut scorer = automaton_weight
|
||||||
|
.scorer(searcher.segment_reader(0u32), 1.32)
|
||||||
|
.unwrap();
|
||||||
assert_eq!(scorer.doc(), 0u32);
|
assert_eq!(scorer.doc(), 0u32);
|
||||||
assert_eq!(scorer.score(), 1.32);
|
assert_eq!(scorer.score(), 1.32);
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -42,39 +42,27 @@ fn find_pivot_doc(
|
|||||||
Some((before_pivot_len, pivot_len, pivot_doc))
|
Some((before_pivot_len, pivot_len, pivot_doc))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Advance the scorer with best score among the scorers[..pivot_len] to
|
// Before and after calling this method, scorers need to be sorted by their `.doc()`.
|
||||||
/// the next doc candidate defined by the min of `last_doc_in_block + 1` for
|
|
||||||
/// scorer in scorers[..pivot_len] and `scorer.doc()` for scorer in scorers[pivot_len..].
|
|
||||||
/// Note: before and after calling this method, scorers need to be sorted by their `.doc()`.
|
|
||||||
fn block_max_was_too_low_advance_one_scorer(
|
fn block_max_was_too_low_advance_one_scorer(
|
||||||
scorers: &mut Vec<TermScorerWithMaxScore>,
|
scorers: &mut Vec<TermScorerWithMaxScore>,
|
||||||
pivot_len: usize,
|
pivot_len: usize,
|
||||||
) {
|
) {
|
||||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||||
let mut scorer_to_seek = pivot_len - 1;
|
let mut scorer_to_seek = pivot_len - 1;
|
||||||
let mut global_max_score = scorers[scorer_to_seek].max_score;
|
let mut doc_to_seek_after = scorers[scorer_to_seek].doc();
|
||||||
let mut doc_to_seek_after = scorers[scorer_to_seek].last_doc_in_block();
|
|
||||||
for scorer_ord in (0..pivot_len - 1).rev() {
|
for scorer_ord in (0..pivot_len - 1).rev() {
|
||||||
let scorer = &scorers[scorer_ord];
|
let scorer = &scorers[scorer_ord];
|
||||||
if scorer.last_doc_in_block() <= doc_to_seek_after {
|
if scorer.last_doc_in_block() <= doc_to_seek_after {
|
||||||
doc_to_seek_after = scorer.last_doc_in_block();
|
doc_to_seek_after = scorer.last_doc_in_block();
|
||||||
}
|
|
||||||
if scorers[scorer_ord].max_score > global_max_score {
|
|
||||||
global_max_score = scorers[scorer_ord].max_score;
|
|
||||||
scorer_to_seek = scorer_ord;
|
scorer_to_seek = scorer_ord;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Add +1 to go to the next block unless we are already at the end.
|
|
||||||
if doc_to_seek_after != TERMINATED {
|
|
||||||
doc_to_seek_after += 1;
|
|
||||||
}
|
|
||||||
for scorer in &scorers[pivot_len..] {
|
for scorer in &scorers[pivot_len..] {
|
||||||
if scorer.doc() <= doc_to_seek_after {
|
if scorer.doc() <= doc_to_seek_after {
|
||||||
doc_to_seek_after = scorer.doc();
|
doc_to_seek_after = scorer.doc();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
scorers[scorer_to_seek].seek(doc_to_seek_after);
|
scorers[scorer_to_seek].seek(doc_to_seek_after + 1);
|
||||||
|
|
||||||
restore_ordering(scorers, scorer_to_seek);
|
restore_ordering(scorers, scorer_to_seek);
|
||||||
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc())));
|
||||||
}
|
}
|
||||||
@@ -142,9 +130,6 @@ fn advance_all_scorers_on_pivot(term_scorers: &mut Vec<TermScorerWithMaxScore>,
|
|||||||
term_scorers.sort_by_key(|scorer| scorer.doc());
|
term_scorers.sort_by_key(|scorer| scorer.doc());
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Implements the WAND (Weak AND) algorithm for dynamic pruning
|
|
||||||
/// described in the paper "Faster Top-k Document Retrieval Using Block-Max Indexes".
|
|
||||||
/// Link: http://engineering.nyu.edu/~suel/papers/bmw.pdf
|
|
||||||
pub fn block_wand(
|
pub fn block_wand(
|
||||||
mut scorers: Vec<TermScorer>,
|
mut scorers: Vec<TermScorer>,
|
||||||
mut threshold: Score,
|
mut threshold: Score,
|
||||||
@@ -202,7 +187,6 @@ pub fn block_wand(
|
|||||||
.iter_mut()
|
.iter_mut()
|
||||||
.map(|scorer| scorer.score())
|
.map(|scorer| scorer.score())
|
||||||
.sum();
|
.sum();
|
||||||
|
|
||||||
if score > threshold {
|
if score > threshold {
|
||||||
threshold = callback(pivot_doc, score);
|
threshold = callback(pivot_doc, score);
|
||||||
}
|
}
|
||||||
@@ -211,56 +195,6 @@ pub fn block_wand(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Specialized version of [`block_wand`] for a single scorer.
|
|
||||||
/// In this case, the algorithm is simple and readable and faster (~ x3)
|
|
||||||
/// than the generic algorithm.
|
|
||||||
/// The algorithm behaves as follows:
|
|
||||||
/// - While we don't hit the end of the docset:
|
|
||||||
/// - While the block max score is under the `threshold`, go to the
|
|
||||||
/// next block.
|
|
||||||
/// - On a block, advance until the end and execute `callback``
|
|
||||||
/// when the doc score is greater or equal to the `threshold`.
|
|
||||||
pub fn block_wand_single_scorer(
|
|
||||||
mut scorer: TermScorer,
|
|
||||||
mut threshold: Score,
|
|
||||||
callback: &mut dyn FnMut(u32, Score) -> Score,
|
|
||||||
) {
|
|
||||||
let mut doc = scorer.doc();
|
|
||||||
loop {
|
|
||||||
// We position the scorer on a block that can reach
|
|
||||||
// the threshold.
|
|
||||||
while scorer.block_max_score() < threshold {
|
|
||||||
let last_doc_in_block = scorer.last_doc_in_block();
|
|
||||||
if last_doc_in_block == TERMINATED {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
doc = last_doc_in_block + 1;
|
|
||||||
scorer.shallow_seek(doc);
|
|
||||||
}
|
|
||||||
// Seek will effectively load that block.
|
|
||||||
doc = scorer.seek(doc);
|
|
||||||
if doc == TERMINATED {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
loop {
|
|
||||||
let score = scorer.score();
|
|
||||||
if score > threshold {
|
|
||||||
threshold = callback(doc, score);
|
|
||||||
}
|
|
||||||
debug_assert!(doc <= scorer.last_doc_in_block());
|
|
||||||
if doc == scorer.last_doc_in_block() {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
doc = scorer.advance();
|
|
||||||
if doc == TERMINATED {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
doc += 1;
|
|
||||||
scorer.shallow_seek(doc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct TermScorerWithMaxScore<'a> {
|
struct TermScorerWithMaxScore<'a> {
|
||||||
scorer: &'a mut TermScorer,
|
scorer: &'a mut TermScorer,
|
||||||
max_score: Score,
|
max_score: Score,
|
||||||
@@ -338,14 +272,13 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn compute_checkpoints_for_each_pruning(
|
fn compute_checkpoints_for_each_pruning(
|
||||||
mut term_scorers: Vec<TermScorer>,
|
term_scorers: Vec<TermScorer>,
|
||||||
n: usize,
|
n: usize,
|
||||||
) -> Vec<(DocId, Score)> {
|
) -> Vec<(DocId, Score)> {
|
||||||
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
|
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
|
||||||
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
|
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
|
||||||
let mut limit: Score = 0.0;
|
let mut limit: Score = 0.0;
|
||||||
|
super::block_wand(term_scorers, Score::MIN, &mut |doc, score| {
|
||||||
let callback = &mut |doc, score| {
|
|
||||||
heap.push(Float(score));
|
heap.push(Float(score));
|
||||||
if heap.len() > n {
|
if heap.len() > n {
|
||||||
heap.pop().unwrap();
|
heap.pop().unwrap();
|
||||||
@@ -357,14 +290,7 @@ mod tests {
|
|||||||
checkpoints.push((doc, score));
|
checkpoints.push((doc, score));
|
||||||
}
|
}
|
||||||
limit
|
limit
|
||||||
};
|
});
|
||||||
|
|
||||||
if term_scorers.len() == 1 {
|
|
||||||
let scorer = term_scorers.pop().unwrap();
|
|
||||||
super::block_wand_single_scorer(scorer, Score::MIN, callback);
|
|
||||||
} else {
|
|
||||||
super::block_wand(term_scorers, Score::MIN, callback);
|
|
||||||
}
|
|
||||||
checkpoints
|
checkpoints
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -498,14 +424,6 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
proptest! {
|
|
||||||
#![proptest_config(ProptestConfig::with_cases(500))]
|
|
||||||
#[test]
|
|
||||||
fn test_block_wand_single_term_scorer((posting_lists, fieldnorms) in gen_term_scorers(1)) {
|
|
||||||
test_block_wand_aux(&posting_lists[..], &fieldnorms[..]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_fn_reproduce_proptest() {
|
fn test_fn_reproduce_proptest() {
|
||||||
let postings_lists = &[
|
let postings_lists = &[
|
||||||
|
|||||||
@@ -41,22 +41,22 @@ use std::collections::BTreeMap;
|
|||||||
/// let mut index_writer = index.writer(3_000_000)?;
|
/// let mut index_writer = index.writer(3_000_000)?;
|
||||||
/// index_writer.add_document(doc!(
|
/// index_writer.add_document(doc!(
|
||||||
/// title => "The Name of the Wind",
|
/// title => "The Name of the Wind",
|
||||||
/// ))?;
|
/// ));
|
||||||
/// index_writer.add_document(doc!(
|
/// index_writer.add_document(doc!(
|
||||||
/// title => "The Diary of Muadib",
|
/// title => "The Diary of Muadib",
|
||||||
/// ))?;
|
/// ));
|
||||||
/// index_writer.add_document(doc!(
|
/// index_writer.add_document(doc!(
|
||||||
/// title => "A Dairy Cow",
|
/// title => "A Dairy Cow",
|
||||||
/// body => "hidden",
|
/// body => "hidden",
|
||||||
/// ))?;
|
/// ));
|
||||||
/// index_writer.add_document(doc!(
|
/// index_writer.add_document(doc!(
|
||||||
/// title => "A Dairy Cow",
|
/// title => "A Dairy Cow",
|
||||||
/// body => "found",
|
/// body => "found",
|
||||||
/// ))?;
|
/// ));
|
||||||
/// index_writer.add_document(doc!(
|
/// index_writer.add_document(doc!(
|
||||||
/// title => "The Diary of a Young Girl",
|
/// title => "The Diary of a Young Girl",
|
||||||
/// ))?;
|
/// ));
|
||||||
/// index_writer.commit()?;
|
/// index_writer.commit().unwrap();
|
||||||
/// }
|
/// }
|
||||||
///
|
///
|
||||||
/// let reader = index.reader()?;
|
/// let reader = index.reader()?;
|
||||||
@@ -217,11 +217,11 @@ mod tests {
|
|||||||
let text = schema_builder.add_text_field("text", TEXT);
|
let text = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut writer = index.writer_for_tests()?;
|
let mut writer = index.writer_for_tests().unwrap();
|
||||||
writer.add_document(doc!(text=>"b c"))?;
|
writer.add_document(doc!(text=>"b c"));
|
||||||
writer.add_document(doc!(text=>"a c"))?;
|
writer.add_document(doc!(text=>"a c"));
|
||||||
writer.add_document(doc!(text=>"a b"))?;
|
writer.add_document(doc!(text=>"a b"));
|
||||||
writer.add_document(doc!(text=>"a d"))?;
|
writer.add_document(doc!(text=>"a d"));
|
||||||
writer.commit()?;
|
writer.commit()?;
|
||||||
Ok(index)
|
Ok(index)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ mod boolean_query;
|
|||||||
mod boolean_weight;
|
mod boolean_weight;
|
||||||
|
|
||||||
pub(crate) use self::block_wand::block_wand;
|
pub(crate) use self::block_wand::block_wand;
|
||||||
pub(crate) use self::block_wand::block_wand_single_scorer;
|
|
||||||
pub use self::boolean_query::BooleanQuery;
|
pub use self::boolean_query::BooleanQuery;
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -26,75 +25,72 @@ mod tests {
|
|||||||
use crate::Index;
|
use crate::Index;
|
||||||
use crate::{DocAddress, DocId, Score};
|
use crate::{DocAddress, DocId, Score};
|
||||||
|
|
||||||
fn aux_test_helper() -> crate::Result<(Index, Field)> {
|
fn aux_test_helper() -> (Index, Field) {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
index_writer.add_document(doc!(text_field => "a b c"))?;
|
index_writer.add_document(doc!(text_field => "a b c"));
|
||||||
index_writer.add_document(doc!(text_field => "a c"))?;
|
index_writer.add_document(doc!(text_field => "a c"));
|
||||||
index_writer.add_document(doc!(text_field => "b c"))?;
|
index_writer.add_document(doc!(text_field => "b c"));
|
||||||
index_writer.add_document(doc!(text_field => "a b c d"))?;
|
index_writer.add_document(doc!(text_field => "a b c d"));
|
||||||
index_writer.add_document(doc!(text_field => "d"))?;
|
index_writer.add_document(doc!(text_field => "d"));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
}
|
}
|
||||||
Ok((index, text_field))
|
(index, text_field)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
pub fn test_boolean_non_all_term_disjunction() -> crate::Result<()> {
|
pub fn test_boolean_non_all_term_disjunction() {
|
||||||
let (index, text_field) = aux_test_helper()?;
|
let (index, text_field) = aux_test_helper();
|
||||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||||
let query = query_parser.parse_query("(+a +b) d")?;
|
let query = query_parser.parse_query("(+a +b) d").unwrap();
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
assert_eq!(query.count(&searcher)?, 3);
|
assert_eq!(query.count(&searcher).unwrap(), 3);
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
pub fn test_boolean_single_must_clause() -> crate::Result<()> {
|
pub fn test_boolean_single_must_clause() {
|
||||||
let (index, text_field) = aux_test_helper()?;
|
let (index, text_field) = aux_test_helper();
|
||||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||||
let query = query_parser.parse_query("+a")?;
|
let query = query_parser.parse_query("+a").unwrap();
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
let weight = query.weight(&searcher, true)?;
|
let weight = query.weight(&searcher, true).unwrap();
|
||||||
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
|
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
|
||||||
assert!(scorer.is::<TermScorer>());
|
assert!(scorer.is::<TermScorer>());
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
pub fn test_boolean_termonly_intersection() -> crate::Result<()> {
|
pub fn test_boolean_termonly_intersection() {
|
||||||
let (index, text_field) = aux_test_helper()?;
|
let (index, text_field) = aux_test_helper();
|
||||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
{
|
{
|
||||||
let query = query_parser.parse_query("+a +b +c")?;
|
let query = query_parser.parse_query("+a +b +c").unwrap();
|
||||||
let weight = query.weight(&searcher, true)?;
|
let weight = query.weight(&searcher, true).unwrap();
|
||||||
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
|
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
|
||||||
assert!(scorer.is::<Intersection<TermScorer>>());
|
assert!(scorer.is::<Intersection<TermScorer>>());
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let query = query_parser.parse_query("+a +(b c)")?;
|
let query = query_parser.parse_query("+a +(b c)").unwrap();
|
||||||
let weight = query.weight(&searcher, true)?;
|
let weight = query.weight(&searcher, true).unwrap();
|
||||||
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
|
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
|
||||||
assert!(scorer.is::<Intersection<Box<dyn Scorer>>>());
|
assert!(scorer.is::<Intersection<Box<dyn Scorer>>>());
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
pub fn test_boolean_reqopt() -> crate::Result<()> {
|
pub fn test_boolean_reqopt() {
|
||||||
let (index, text_field) = aux_test_helper()?;
|
let (index, text_field) = aux_test_helper();
|
||||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader().unwrap().searcher();
|
||||||
{
|
{
|
||||||
let query = query_parser.parse_query("+a b")?;
|
let query = query_parser.parse_query("+a b").unwrap();
|
||||||
let weight = query.weight(&searcher, true)?;
|
let weight = query.weight(&searcher, true).unwrap();
|
||||||
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
|
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
|
||||||
assert!(scorer.is::<RequiredOptionalScorer<
|
assert!(scorer.is::<RequiredOptionalScorer<
|
||||||
Box<dyn Scorer>,
|
Box<dyn Scorer>,
|
||||||
Box<dyn Scorer>,
|
Box<dyn Scorer>,
|
||||||
@@ -102,17 +98,16 @@ mod tests {
|
|||||||
>>());
|
>>());
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let query = query_parser.parse_query("+a b")?;
|
let query = query_parser.parse_query("+a b").unwrap();
|
||||||
let weight = query.weight(&searcher, false)?;
|
let weight = query.weight(&searcher, false).unwrap();
|
||||||
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
|
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0).unwrap();
|
||||||
assert!(scorer.is::<TermScorer>());
|
assert!(scorer.is::<TermScorer>());
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
pub fn test_boolean_query() -> crate::Result<()> {
|
pub fn test_boolean_query() {
|
||||||
let (index, text_field) = aux_test_helper()?;
|
let (index, text_field) = aux_test_helper();
|
||||||
|
|
||||||
let make_term_query = |text: &str| {
|
let make_term_query = |text: &str| {
|
||||||
let term_query = TermQuery::new(
|
let term_query = TermQuery::new(
|
||||||
@@ -123,7 +118,7 @@ mod tests {
|
|||||||
query
|
query
|
||||||
};
|
};
|
||||||
|
|
||||||
let reader = index.reader()?;
|
let reader = index.reader().unwrap();
|
||||||
|
|
||||||
let matching_docs = |boolean_query: &dyn Query| {
|
let matching_docs = |boolean_query: &dyn Query| {
|
||||||
reader
|
reader
|
||||||
@@ -170,12 +165,11 @@ mod tests {
|
|||||||
let boolean_query = BooleanQuery::new(vec![(Occur::MustNot, make_term_query("d"))]);
|
let boolean_query = BooleanQuery::new(vec![(Occur::MustNot, make_term_query("d"))]);
|
||||||
assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new());
|
assert_eq!(matching_docs(&boolean_query), Vec::<u32>::new());
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
pub fn test_boolean_query_two_excluded() -> crate::Result<()> {
|
pub fn test_boolean_query_two_excluded() {
|
||||||
let (index, text_field) = aux_test_helper()?;
|
let (index, text_field) = aux_test_helper();
|
||||||
|
|
||||||
let make_term_query = |text: &str| {
|
let make_term_query = |text: &str| {
|
||||||
let term_query = TermQuery::new(
|
let term_query = TermQuery::new(
|
||||||
@@ -186,7 +180,7 @@ mod tests {
|
|||||||
query
|
query
|
||||||
};
|
};
|
||||||
|
|
||||||
let reader = index.reader()?;
|
let reader = index.reader().unwrap();
|
||||||
|
|
||||||
let matching_topdocs = |query: &dyn Query| {
|
let matching_topdocs = |query: &dyn Query| {
|
||||||
reader
|
reader
|
||||||
@@ -219,21 +213,20 @@ mod tests {
|
|||||||
assert_eq!(top_doc, DocAddress::new(0, 4));
|
assert_eq!(top_doc, DocAddress::new(0, 4));
|
||||||
assert_eq!(top_score, score_doc_4);
|
assert_eq!(top_score, score_doc_4);
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
pub fn test_boolean_query_with_weight() -> crate::Result<()> {
|
pub fn test_boolean_query_with_weight() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
index_writer.add_document(doc!(text_field => "a b c"))?;
|
index_writer.add_document(doc!(text_field => "a b c"));
|
||||||
index_writer.add_document(doc!(text_field => "a c"))?;
|
index_writer.add_document(doc!(text_field => "a c"));
|
||||||
index_writer.add_document(doc!(text_field => "b c"))?;
|
index_writer.add_document(doc!(text_field => "b c"));
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
}
|
}
|
||||||
let term_a: Box<dyn Query> = Box::new(TermQuery::new(
|
let term_a: Box<dyn Query> = Box::new(TermQuery::new(
|
||||||
Term::from_field_text(text_field, "a"),
|
Term::from_field_text(text_field, "a"),
|
||||||
@@ -249,21 +242,24 @@ mod tests {
|
|||||||
BooleanQuery::new(vec![(Occur::Should, term_a), (Occur::Should, term_b)]);
|
BooleanQuery::new(vec![(Occur::Should, term_a), (Occur::Should, term_b)]);
|
||||||
let boolean_weight = boolean_query.weight(&searcher, true).unwrap();
|
let boolean_weight = boolean_query.weight(&searcher, true).unwrap();
|
||||||
{
|
{
|
||||||
let mut boolean_scorer = boolean_weight.scorer(searcher.segment_reader(0u32), 1.0)?;
|
let mut boolean_scorer = boolean_weight
|
||||||
|
.scorer(searcher.segment_reader(0u32), 1.0)
|
||||||
|
.unwrap();
|
||||||
assert_eq!(boolean_scorer.doc(), 0u32);
|
assert_eq!(boolean_scorer.doc(), 0u32);
|
||||||
assert_nearly_equals!(boolean_scorer.score(), 0.84163445);
|
assert_nearly_equals!(boolean_scorer.score(), 0.84163445);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let mut boolean_scorer = boolean_weight.scorer(searcher.segment_reader(0u32), 2.0)?;
|
let mut boolean_scorer = boolean_weight
|
||||||
|
.scorer(searcher.segment_reader(0u32), 2.0)
|
||||||
|
.unwrap();
|
||||||
assert_eq!(boolean_scorer.doc(), 0u32);
|
assert_eq!(boolean_scorer.doc(), 0u32);
|
||||||
assert_nearly_equals!(boolean_scorer.score(), 1.6832689);
|
assert_nearly_equals!(boolean_scorer.score(), 1.6832689);
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
pub fn test_intersection_score() -> crate::Result<()> {
|
pub fn test_intersection_score() {
|
||||||
let (index, text_field) = aux_test_helper()?;
|
let (index, text_field) = aux_test_helper();
|
||||||
|
|
||||||
let make_term_query = |text: &str| {
|
let make_term_query = |text: &str| {
|
||||||
let term_query = TermQuery::new(
|
let term_query = TermQuery::new(
|
||||||
@@ -273,7 +269,7 @@ mod tests {
|
|||||||
let query: Box<dyn Query> = Box::new(term_query);
|
let query: Box<dyn Query> = Box::new(term_query);
|
||||||
query
|
query
|
||||||
};
|
};
|
||||||
let reader = index.reader()?;
|
let reader = index.reader().unwrap();
|
||||||
let score_docs = |boolean_query: &dyn Query| {
|
let score_docs = |boolean_query: &dyn Query| {
|
||||||
let fruit = reader
|
let fruit = reader
|
||||||
.searcher()
|
.searcher()
|
||||||
@@ -291,7 +287,6 @@ mod tests {
|
|||||||
assert_nearly_equals!(scores[0], 0.977973);
|
assert_nearly_equals!(scores[0], 0.977973);
|
||||||
assert_nearly_equals!(scores[1], 0.84699446);
|
assert_nearly_equals!(scores[1], 0.84699446);
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -301,8 +296,8 @@ mod tests {
|
|||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 5_000_000)?;
|
let mut index_writer = index.writer_with_num_threads(1, 5_000_000)?;
|
||||||
index_writer.add_document(doc!(text=>"a"))?;
|
index_writer.add_document(doc!(text=>"a"));
|
||||||
index_writer.add_document(doc!(text=>"b"))?;
|
index_writer.add_document(doc!(text=>"b"));
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
let searcher = index.reader()?.searcher();
|
let searcher = index.reader()?.searcher();
|
||||||
let term_a: Box<dyn Query> = Box::new(TermQuery::new(
|
let term_a: Box<dyn Query> = Box::new(TermQuery::new(
|
||||||
|
|||||||
@@ -141,20 +141,19 @@ mod tests {
|
|||||||
use crate::{DocAddress, Document, Index};
|
use crate::{DocAddress, Document, Index};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_boost_query_explain() -> crate::Result<()> {
|
fn test_boost_query_explain() {
|
||||||
let schema = Schema::builder().build();
|
let schema = Schema::builder().build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
index_writer.add_document(Document::new())?;
|
index_writer.add_document(Document::new());
|
||||||
index_writer.commit()?;
|
assert!(index_writer.commit().is_ok());
|
||||||
let reader = index.reader()?;
|
let reader = index.reader().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
let query = BoostQuery::new(Box::new(AllQuery), 0.2);
|
let query = BoostQuery::new(Box::new(AllQuery), 0.2);
|
||||||
let explanation = query.explain(&searcher, DocAddress::new(0, 0u32)).unwrap();
|
let explanation = query.explain(&searcher, DocAddress::new(0, 0u32)).unwrap();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
explanation.to_pretty_json(),
|
explanation.to_pretty_json(),
|
||||||
"{\n \"value\": 0.2,\n \"description\": \"Boost x0.2 of ...\",\n \"details\": [\n {\n \"value\": 1.0,\n \"description\": \"AllQuery\",\n \"context\": []\n }\n ],\n \"context\": []\n}"
|
"{\n \"value\": 0.2,\n \"description\": \"Boost x0.2 of ...\",\n \"details\": [\n {\n \"value\": 1.0,\n \"description\": \"AllQuery\",\n \"context\": []\n }\n ],\n \"context\": []\n}"
|
||||||
);
|
)
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -67,17 +67,17 @@ static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Laz
|
|||||||
/// let mut index_writer = index.writer(3_000_000)?;
|
/// let mut index_writer = index.writer(3_000_000)?;
|
||||||
/// index_writer.add_document(doc!(
|
/// index_writer.add_document(doc!(
|
||||||
/// title => "The Name of the Wind",
|
/// title => "The Name of the Wind",
|
||||||
/// ))?;
|
/// ));
|
||||||
/// index_writer.add_document(doc!(
|
/// index_writer.add_document(doc!(
|
||||||
/// title => "The Diary of Muadib",
|
/// title => "The Diary of Muadib",
|
||||||
/// ))?;
|
/// ));
|
||||||
/// index_writer.add_document(doc!(
|
/// index_writer.add_document(doc!(
|
||||||
/// title => "A Dairy Cow",
|
/// title => "A Dairy Cow",
|
||||||
/// ))?;
|
/// ));
|
||||||
/// index_writer.add_document(doc!(
|
/// index_writer.add_document(doc!(
|
||||||
/// title => "The Diary of a Young Girl",
|
/// title => "The Diary of a Young Girl",
|
||||||
/// ))?;
|
/// ));
|
||||||
/// index_writer.commit()?;
|
/// index_writer.commit().unwrap();
|
||||||
/// }
|
/// }
|
||||||
/// let reader = index.reader()?;
|
/// let reader = index.reader()?;
|
||||||
/// let searcher = reader.searcher();
|
/// let searcher = reader.searcher();
|
||||||
@@ -129,18 +129,13 @@ impl FuzzyTermQuery {
|
|||||||
|
|
||||||
fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DfaWrapper>> {
|
fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DfaWrapper>> {
|
||||||
// LEV_BUILDER is a HashMap, whose `get` method returns an Option
|
// LEV_BUILDER is a HashMap, whose `get` method returns an Option
|
||||||
match LEV_BUILDER.get(&(self.distance, self.transposition_cost_one)) {
|
match LEV_BUILDER.get(&(self.distance, false)) {
|
||||||
// Unwrap the option and build the Ok(AutomatonWeight)
|
// Unwrap the option and build the Ok(AutomatonWeight)
|
||||||
Some(automaton_builder) => {
|
Some(automaton_builder) => {
|
||||||
let term_text = self.term.as_str().ok_or_else(|| {
|
|
||||||
crate::TantivyError::InvalidArgument(
|
|
||||||
"The fuzzy term query requires a string term.".to_string(),
|
|
||||||
)
|
|
||||||
})?;
|
|
||||||
let automaton = if self.prefix {
|
let automaton = if self.prefix {
|
||||||
automaton_builder.build_prefix_dfa(term_text)
|
automaton_builder.build_prefix_dfa(self.term.text())
|
||||||
} else {
|
} else {
|
||||||
automaton_builder.build_dfa(term_text)
|
automaton_builder.build_dfa(self.term.text())
|
||||||
};
|
};
|
||||||
Ok(AutomatonWeight::new(
|
Ok(AutomatonWeight::new(
|
||||||
self.term.field(),
|
self.term.field(),
|
||||||
@@ -169,7 +164,6 @@ impl Query for FuzzyTermQuery {
|
|||||||
mod test {
|
mod test {
|
||||||
use super::FuzzyTermQuery;
|
use super::FuzzyTermQuery;
|
||||||
use crate::assert_nearly_equals;
|
use crate::assert_nearly_equals;
|
||||||
use crate::collector::Count;
|
|
||||||
use crate::collector::TopDocs;
|
use crate::collector::TopDocs;
|
||||||
use crate::schema::Schema;
|
use crate::schema::Schema;
|
||||||
use crate::schema::TEXT;
|
use crate::schema::TEXT;
|
||||||
@@ -177,29 +171,32 @@ mod test {
|
|||||||
use crate::Term;
|
use crate::Term;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
pub fn test_fuzzy_term() -> crate::Result<()> {
|
pub fn test_fuzzy_term() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let country_field = schema_builder.add_text_field("country", TEXT);
|
let country_field = schema_builder.add_text_field("country", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
country_field => "japan",
|
country_field => "japan",
|
||||||
))?;
|
));
|
||||||
index_writer.add_document(doc!(
|
index_writer.add_document(doc!(
|
||||||
country_field => "korea",
|
country_field => "korea",
|
||||||
))?;
|
));
|
||||||
index_writer.commit()?;
|
index_writer.commit().unwrap();
|
||||||
}
|
}
|
||||||
let reader = index.reader()?;
|
let reader = index.reader().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
// passes because Levenshtein distance is 1 (substitute 'o' with 'a')
|
// passes because Levenshtein distance is 1 (substitute 'o' with 'a')
|
||||||
{
|
{
|
||||||
let term = Term::from_field_text(country_field, "japon");
|
let term = Term::from_field_text(country_field, "japon");
|
||||||
|
|
||||||
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
|
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
|
||||||
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
|
let top_docs = searcher
|
||||||
|
.search(&fuzzy_query, &TopDocs::with_limit(2))
|
||||||
|
.unwrap();
|
||||||
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
|
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
|
||||||
let (score, _) = top_docs[0];
|
let (score, _) = top_docs[0];
|
||||||
assert_nearly_equals!(1.0, score);
|
assert_nearly_equals!(1.0, score);
|
||||||
@@ -210,44 +207,23 @@ mod test {
|
|||||||
let term = Term::from_field_text(country_field, "jap");
|
let term = Term::from_field_text(country_field, "jap");
|
||||||
|
|
||||||
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
|
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
|
||||||
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
|
let top_docs = searcher
|
||||||
|
.search(&fuzzy_query, &TopDocs::with_limit(2))
|
||||||
|
.unwrap();
|
||||||
assert_eq!(top_docs.len(), 0, "Expected no document");
|
assert_eq!(top_docs.len(), 0, "Expected no document");
|
||||||
}
|
}
|
||||||
|
|
||||||
// passes because prefix Levenshtein distance is 0
|
// passes because prefix Levenshtein distance is 0
|
||||||
{
|
{
|
||||||
let term = Term::from_field_text(country_field, "jap");
|
let term = Term::from_field_text(country_field, "jap");
|
||||||
|
|
||||||
let fuzzy_query = FuzzyTermQuery::new_prefix(term, 1, true);
|
let fuzzy_query = FuzzyTermQuery::new_prefix(term, 1, true);
|
||||||
let top_docs = searcher.search(&fuzzy_query, &TopDocs::with_limit(2))?;
|
let top_docs = searcher
|
||||||
|
.search(&fuzzy_query, &TopDocs::with_limit(2))
|
||||||
|
.unwrap();
|
||||||
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
|
assert_eq!(top_docs.len(), 1, "Expected only 1 document");
|
||||||
let (score, _) = top_docs[0];
|
let (score, _) = top_docs[0];
|
||||||
assert_nearly_equals!(1.0, score);
|
assert_nearly_equals!(1.0, score);
|
||||||
}
|
}
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
pub fn test_fuzzy_term_transposition_cost_one() -> crate::Result<()> {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let country_field = schema_builder.add_text_field("country", TEXT);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
let index = Index::create_in_ram(schema);
|
|
||||||
let mut index_writer = index.writer_for_tests()?;
|
|
||||||
index_writer.add_document(doc!(country_field => "japan"))?;
|
|
||||||
index_writer.commit()?;
|
|
||||||
let reader = index.reader()?;
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
let term_jaapn = Term::from_field_text(country_field, "jaapn");
|
|
||||||
{
|
|
||||||
let fuzzy_query_transposition = FuzzyTermQuery::new(term_jaapn.clone(), 1, true);
|
|
||||||
let count = searcher.search(&fuzzy_query_transposition, &Count)?;
|
|
||||||
assert_eq!(count, 1);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
let fuzzy_query_transposition = FuzzyTermQuery::new(term_jaapn, 1, false);
|
|
||||||
let count = searcher.search(&fuzzy_query_transposition, &Count)?;
|
|
||||||
assert_eq!(count, 0);
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
70
src/query/intersection_two.rs
Normal file
70
src/query/intersection_two.rs
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
use docset::DocSet;
|
||||||
|
use query::Scorer;
|
||||||
|
use DocId;
|
||||||
|
use Score;
|
||||||
|
use SkipResult;
|
||||||
|
|
||||||
|
|
||||||
|
/// Creates a `DocSet` that iterate through the intersection of two `DocSet`s.
|
||||||
|
pub struct IntersectionTwoTerms<TDocSet> {
|
||||||
|
left: TDocSet,
|
||||||
|
right: TDocSet
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TDocSet: DocSet> IntersectionTwoTerms<TDocSet> {
|
||||||
|
pub fn new(left: TDocSet, right: TDocSet) -> IntersectionTwoTerms<TDocSet> {
|
||||||
|
IntersectionTwoTerms {
|
||||||
|
left,
|
||||||
|
right
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TDocSet: DocSet> DocSet for IntersectionTwoTerms<TDocSet> {
|
||||||
|
|
||||||
|
fn advance(&mut self) -> bool {
|
||||||
|
let (left, right) = (&mut self.left, &mut self.right);
|
||||||
|
if !left.advance() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
let mut candidate = left.doc();
|
||||||
|
loop {
|
||||||
|
match right.skip_next(candidate) {
|
||||||
|
SkipResult::Reached => {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
SkipResult::End => {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
SkipResult::OverStep => {
|
||||||
|
candidate = right.doc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
match left.skip_next(candidate) {
|
||||||
|
SkipResult::Reached => {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
SkipResult::End => {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
SkipResult::OverStep => {
|
||||||
|
candidate = left.doc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn doc(&self) -> DocId {
|
||||||
|
self.left.doc()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn size_hint(&self) -> u32 {
|
||||||
|
self.left.size_hint().min(self.right.size_hint())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<TScorer: Scorer> Scorer for IntersectionTwoTerms<TScorer> {
|
||||||
|
fn score(&mut self) -> Score {
|
||||||
|
self.left.score() + self.right.score()
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -180,7 +180,7 @@ impl MoreLikeThis {
|
|||||||
|
|
||||||
// extract the raw value, possibly tokenizing & filtering to update the term frequency map
|
// extract the raw value, possibly tokenizing & filtering to update the term frequency map
|
||||||
match field_entry.field_type() {
|
match field_entry.field_type() {
|
||||||
FieldType::Facet(_) => {
|
FieldType::HierarchicalFacet(_) => {
|
||||||
let facets: Vec<&str> = field_values
|
let facets: Vec<&str> = field_values
|
||||||
.iter()
|
.iter()
|
||||||
.map(|field_value| match *field_value.value() {
|
.map(|field_value| match *field_value.value() {
|
||||||
|
|||||||
@@ -61,11 +61,19 @@ impl Query for MoreLikeThisQuery {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// The builder for more-like-this query
|
/// The builder for more-like-this query
|
||||||
#[derive(Debug, Clone, Default)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct MoreLikeThisQueryBuilder {
|
pub struct MoreLikeThisQueryBuilder {
|
||||||
mlt: MoreLikeThis,
|
mlt: MoreLikeThis,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Default for MoreLikeThisQueryBuilder {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
mlt: MoreLikeThis::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl MoreLikeThisQueryBuilder {
|
impl MoreLikeThisQueryBuilder {
|
||||||
/// Sets the minimum document frequency.
|
/// Sets the minimum document frequency.
|
||||||
///
|
///
|
||||||
@@ -176,20 +184,20 @@ mod tests {
|
|||||||
use crate::DocAddress;
|
use crate::DocAddress;
|
||||||
use crate::Index;
|
use crate::Index;
|
||||||
|
|
||||||
fn create_test_index() -> crate::Result<Index> {
|
fn create_test_index() -> Index {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
let title = schema_builder.add_text_field("title", TEXT);
|
let title = schema_builder.add_text_field("title", TEXT);
|
||||||
let body = schema_builder.add_text_field("body", TEXT | STORED);
|
let body = schema_builder.add_text_field("body", TEXT | STORED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_for_tests().unwrap();
|
let mut index_writer = index.writer_for_tests().unwrap();
|
||||||
index_writer.add_document(doc!(title => "aaa", body => "the old man and the sea"))?;
|
index_writer.add_document(doc!(title => "aaa", body => "the old man and the sea"));
|
||||||
index_writer.add_document(doc!(title => "bbb", body => "an old man sailing on the sea"))?;
|
index_writer.add_document(doc!(title => "bbb", body => "an old man sailing on the sea"));
|
||||||
index_writer.add_document(doc!(title => "ccc", body=> "send this message to alice"))?;
|
index_writer.add_document(doc!(title => "ccc", body=> "send this message to alice"));
|
||||||
index_writer.add_document(doc!(title => "ddd", body=> "a lady was riding and old bike"))?;
|
index_writer.add_document(doc!(title => "ddd", body=> "a lady was riding and old bike"));
|
||||||
index_writer.add_document(doc!(title => "eee", body=> "Yes, my lady."))?;
|
index_writer.add_document(doc!(title => "eee", body=> "Yes, my lady."));
|
||||||
index_writer.commit()?;
|
index_writer.commit().unwrap();
|
||||||
Ok(index)
|
index
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -235,9 +243,9 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_more_like_this_query() -> crate::Result<()> {
|
fn test_more_like_this_query() {
|
||||||
let index = create_test_index()?;
|
let index = create_test_index();
|
||||||
let reader = index.reader()?;
|
let reader = index.reader().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
// search base 1st doc with words [sea, and] skipping [old]
|
// search base 1st doc with words [sea, and] skipping [old]
|
||||||
@@ -250,7 +258,7 @@ mod tests {
|
|||||||
.with_boost_factor(1.0)
|
.with_boost_factor(1.0)
|
||||||
.with_stop_words(vec!["old".to_string()])
|
.with_stop_words(vec!["old".to_string()])
|
||||||
.with_document(DocAddress::new(0, 0));
|
.with_document(DocAddress::new(0, 0));
|
||||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(5))?;
|
let top_docs = searcher.search(&query, &TopDocs::with_limit(5)).unwrap();
|
||||||
let mut doc_ids: Vec<_> = top_docs.iter().map(|item| item.1.doc_id).collect();
|
let mut doc_ids: Vec<_> = top_docs.iter().map(|item| item.1.doc_id).collect();
|
||||||
doc_ids.sort_unstable();
|
doc_ids.sort_unstable();
|
||||||
|
|
||||||
@@ -266,12 +274,11 @@ mod tests {
|
|||||||
.with_max_word_length(5)
|
.with_max_word_length(5)
|
||||||
.with_boost_factor(1.0)
|
.with_boost_factor(1.0)
|
||||||
.with_document(DocAddress::new(0, 4));
|
.with_document(DocAddress::new(0, 4));
|
||||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(5))?;
|
let top_docs = searcher.search(&query, &TopDocs::with_limit(5)).unwrap();
|
||||||
let mut doc_ids: Vec<_> = top_docs.iter().map(|item| item.1.doc_id).collect();
|
let mut doc_ids: Vec<_> = top_docs.iter().map(|item| item.1.doc_id).collect();
|
||||||
doc_ids.sort_unstable();
|
doc_ids.sort_unstable();
|
||||||
|
|
||||||
assert_eq!(doc_ids.len(), 2);
|
assert_eq!(doc_ids.len(), 2);
|
||||||
assert_eq!(doc_ids, vec![3, 4]);
|
assert_eq!(doc_ids, vec![3, 4]);
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -18,34 +18,34 @@ pub mod tests {
     use crate::DocId;
     use crate::{DocAddress, TERMINATED};
 
-    pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
+    pub fn create_index(texts: &[&'static str]) -> Index {
         let mut schema_builder = Schema::builder();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests()?;
+            let mut index_writer = index.writer_for_tests().unwrap();
             for &text in texts {
                 let doc = doc!(text_field=>text);
-                index_writer.add_document(doc)?;
+                index_writer.add_document(doc);
             }
-            index_writer.commit()?;
+            assert!(index_writer.commit().is_ok());
         }
-        Ok(index)
+        index
     }
 
     #[test]
-    pub fn test_phrase_query() -> crate::Result<()> {
+    pub fn test_phrase_query() {
         let index = create_index(&[
             "b b b d c g c",
             "a b b d c g c",
             "a b a b c",
             "c a b a d ga a",
             "a b c",
-        ])?;
+        ]);
         let schema = index.schema();
         let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let test_query = |texts: Vec<&str>| {
             let terms: Vec<Term> = texts
                 .iter()
@@ -54,7 +54,7 @@ pub mod tests {
             let phrase_query = PhraseQuery::new(terms);
             let test_fruits = searcher
                 .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
-                .unwrap();
+                .expect("search should succeed");
             test_fruits
                 .docs()
                 .iter()
@@ -66,12 +66,11 @@ pub mod tests {
         assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
         assert!(test_query(vec!["g", "ewrwer"]).is_empty());
         assert!(test_query(vec!["g", "a"]).is_empty());
-        Ok(())
     }
 
     #[test]
     pub fn test_phrase_query_simple() -> crate::Result<()> {
-        let index = create_index(&["a b b d c g c", "a b a b c"])?;
+        let index = create_index(&["a b b d c g c", "a b a b c"]);
         let text_field = index.schema().get_field("text").unwrap();
         let searcher = index.reader()?.searcher();
         let terms: Vec<Term> = vec!["a", "b", "c"]
@@ -87,17 +86,17 @@ pub mod tests {
     }
 
     #[test]
-    pub fn test_phrase_query_no_score() -> crate::Result<()> {
+    pub fn test_phrase_query_no_score() {
         let index = create_index(&[
             "b b b d c g c",
             "a b b d c g c",
             "a b a b c",
             "c a b a d ga a",
             "a b c",
-        ])?;
+        ]);
         let schema = index.schema();
         let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let test_query = |texts: Vec<&str>| {
             let terms: Vec<Term> = texts
                 .iter()
@@ -106,7 +105,7 @@ pub mod tests {
             let phrase_query = PhraseQuery::new(terms);
             let test_fruits = searcher
                 .search(&phrase_query, &TEST_COLLECTOR_WITHOUT_SCORE)
-                .unwrap();
+                .expect("search should succeed");
             test_fruits
                 .docs()
                 .iter()
@@ -118,11 +117,10 @@ pub mod tests {
         assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
         assert!(test_query(vec!["g", "ewrwer"]).is_empty());
         assert!(test_query(vec!["g", "a"]).is_empty());
-        Ok(())
     }
 
     #[test]
-    pub fn test_phrase_query_no_positions() -> crate::Result<()> {
+    pub fn test_phrase_query_no_positions() {
         let mut schema_builder = Schema::builder();
         use crate::schema::IndexRecordOption;
         use crate::schema::TextFieldIndexing;
@@ -137,34 +135,33 @@ pub mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests()?;
-            index_writer.add_document(doc!(text_field=>"a b c"))?;
-            index_writer.commit()?;
+            let mut index_writer = index.writer_for_tests().unwrap();
+            index_writer.add_document(doc!(text_field=>"a b c"));
+            assert!(index_writer.commit().is_ok());
         }
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let phrase_query = PhraseQuery::new(vec![
             Term::from_field_text(text_field, "a"),
             Term::from_field_text(text_field, "b"),
         ]);
 
-        let search_error = searcher
+        let search_result = searcher
             .search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
-            .err();
+            .map(|_| ());
         assert!(matches!(
-            search_error,
-            Some(crate::TantivyError::SchemaError(msg))
+            search_result,
+            Err(crate::TantivyError::SchemaError(msg))
             if msg == "Applied phrase query on field \"text\", which does not have positions \
                        indexed"
         ));
-        Ok(())
     }
 
     #[test]
-    pub fn test_phrase_score() -> crate::Result<()> {
-        let index = create_index(&["a b c", "a b c a b"])?;
+    pub fn test_phrase_score() {
+        let index = create_index(&["a b c", "a b c a b"]);
         let schema = index.schema();
         let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let test_query = |texts: Vec<&str>| {
             let terms: Vec<Term> = texts
                 .iter()
@@ -180,24 +177,23 @@ pub mod tests {
         let scores = test_query(vec!["a", "b"]);
         assert_nearly_equals!(scores[0], 0.40618482);
         assert_nearly_equals!(scores[1], 0.46844664);
-        Ok(())
     }
 
     #[test] // motivated by #234
-    pub fn test_phrase_query_docfreq_order() -> crate::Result<()> {
+    pub fn test_phrase_query_docfreq_order() {
         let mut schema_builder = Schema::builder();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests()?;
-            index_writer.add_document(doc!(text_field=>"b"))?;
-            index_writer.add_document(doc!(text_field=>"a b"))?;
-            index_writer.add_document(doc!(text_field=>"b a"))?;
-            index_writer.commit()?;
+            let mut index_writer = index.writer_for_tests().unwrap();
+            index_writer.add_document(doc!(text_field=>"b"));
+            index_writer.add_document(doc!(text_field=>"a b"));
+            index_writer.add_document(doc!(text_field=>"b a"));
+            assert!(index_writer.commit().is_ok());
         }
 
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let test_query = |texts: Vec<&str>| {
             let terms: Vec<Term> = texts
                 .iter()
@@ -212,19 +208,18 @@ pub mod tests {
         };
         assert_eq!(test_query(vec!["a", "b"]), vec![DocAddress::new(0, 1)]);
         assert_eq!(test_query(vec!["b", "a"]), vec![DocAddress::new(0, 2)]);
-        Ok(())
     }
 
     #[test] // motivated by #234
-    pub fn test_phrase_query_non_trivial_offsets() -> crate::Result<()> {
+    pub fn test_phrase_query_non_trivial_offsets() {
         let mut schema_builder = Schema::builder();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_for_tests()?;
-            index_writer.add_document(doc!(text_field=>"a b c d e f g h"))?;
-            index_writer.commit()?;
+            let mut index_writer = index.writer_for_tests().unwrap();
+            index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
+            assert!(index_writer.commit().is_ok());
         }
         let searcher = index.reader().unwrap().searcher();
         let test_query = |texts: Vec<(usize, &str)>| {
@@ -250,6 +245,5 @@ pub mod tests {
         assert_eq!(test_query(vec![(4, "e"), (0, "a"), (2, "c")]), vec![0]);
         assert!(test_query(vec![(0, "a"), (2, "d")]).is_empty());
         assert_eq!(test_query(vec![(1, "a"), (3, "c")]), vec![0]);
-        Ok(())
     }
 }
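The test_phrase_query_no_positions hunk above is really about a schema constraint rather than the test-style change: PhraseQuery needs positions in the postings, and a text field indexed without them yields the TantivyError::SchemaError that the test matches on. A rough sketch of the two field configurations involved, not part of the diff, using crate-internal paths as the tests do; the field names and the WithFreqs choice are illustrative:

// Sketch only: contrasts a field that cannot serve phrase queries with one that can.
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

#[test]
fn positions_vs_no_positions_schema() {
    let mut schema_builder = Schema::builder();

    // Indexed with term frequencies only: a PhraseQuery on this field fails
    // with a SchemaError, which is what the test above asserts.
    let freqs_only = TextOptions::default().set_indexing_options(
        TextFieldIndexing::default()
            .set_tokenizer("default")
            .set_index_option(IndexRecordOption::WithFreqs),
    );
    let _no_positions = schema_builder.add_text_field("no_positions", freqs_only);

    // Indexed with positions (what the TEXT shorthand uses): phrase queries work.
    let with_positions = TextOptions::default().set_indexing_options(
        TextFieldIndexing::default()
            .set_tokenizer("default")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions),
    );
    let _positions = schema_builder.add_text_field("with_positions", with_positions);

    let _schema = schema_builder.build();
}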
@@ -116,18 +116,19 @@ mod tests {
     use crate::{DocSet, Term};
 
     #[test]
-    pub fn test_phrase_count() -> crate::Result<()> {
-        let index = create_index(&["a c", "a a b d a b c", " a b"])?;
+    pub fn test_phrase_count() {
+        let index = create_index(&["a c", "a a b d a b c", " a b"]);
         let schema = index.schema();
         let text_field = schema.get_field("text").unwrap();
-        let searcher = index.reader()?.searcher();
+        let searcher = index.reader().unwrap().searcher();
         let phrase_query = PhraseQuery::new(vec![
             Term::from_field_text(text_field, "a"),
             Term::from_field_text(text_field, "b"),
         ]);
         let phrase_weight = phrase_query.phrase_weight(&searcher, true).unwrap();
         let mut phrase_scorer = phrase_weight
-            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
+            .phrase_scorer(searcher.segment_reader(0u32), 1.0)
+            .unwrap()
             .unwrap();
         assert_eq!(phrase_scorer.doc(), 1);
         assert_eq!(phrase_scorer.phrase_count(), 2);
@@ -135,6 +136,5 @@ mod tests {
         assert_eq!(phrase_scorer.doc(), 2);
         assert_eq!(phrase_scorer.phrase_count(), 1);
         assert_eq!(phrase_scorer.advance(), TERMINATED);
-        Ok(())
     }
 }
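For contrast with the scorer-level assertions in the hunk above, the same phrase can be checked through the public search path. A rough sketch, not part of the diff, assuming the "+"-side create_index helper (returning Index) is in scope and using the Count collector; the expected count of 2 follows from the same three documents used in test_phrase_count:

// Sketch only: counts matching documents; the PhraseScorer above additionally
// reports how many times the phrase occurs inside each matching document.
use crate::collector::Count;
use crate::query::PhraseQuery;
use crate::Term;

#[test]
fn phrase_match_count_via_collector() {
    // Same corpus as test_phrase_count above.
    let index = create_index(&["a c", "a a b d a b c", " a b"]);
    let text_field = index.schema().get_field("text").unwrap();
    let searcher = index.reader().unwrap().searcher();
    let phrase_query = PhraseQuery::new(vec![
        Term::from_field_text(text_field, "a"),
        Term::from_field_text(text_field, "b"),
    ]);
    let matching_docs = searcher.search(&phrase_query, &Count).unwrap();
    assert_eq!(matching_docs, 2);
}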
Some files were not shown because too many files have changed in this diff.