Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2025-12-28 04:52:55 +00:00
Compare commits
135 Commits
SHA1:
176f67a266, 19babff849, bf2576adf9, b8241c5603, a4745151c0, e2ce326a8c, bb21d12a70, 4565aba62a,
545a7ec8dd, e68775d71c, dcc92d287e, b48f81c051, a3042e956b, 1fa10f0a0b, 279a9eb5e3, 21a24672d8,
a3f1fbaae6, a6e767c877, 6af0488dbe, 07d87e154b, 8b0b0133dd, 7b9752f897, c92f41aea8, dea16f1d9d,
236cfbec08, edcafb69bb, 14908479d5, ab4593eeb7, e75bb1d6a1, 63b9d62237, 0098e3d428, 69d5e4b9b1,
e0cdd3114d, f32b4a2ebe, 6ff60b8ed8, 8da28fb6cf, 0df2a221da, 5449ec3c11, 10f6c07c53, 06e7bd18e7,
37e4280c0a, 0ba1cf93f7, 21a9940726, 8600b8ea25, 30f4f85d48, 82d25b8397, 2104c0277c, dd37e109f2,
cc23194c58, 63868733a3, 644d8a3a10, e32dba1a97, a78aa4c259, 7e5f697d00, a78f4cca37, 2e44f0f099,
9ccba9f864, 9101bf5753, 23e97da9f6, 1d439e96f5, 934933582e, 98c7fbdc6f, cec9956a01, c64972e039,
b3b2421e8a, f570fe37d4, 6704ab6987, a12d211330, ee681a4dd1, d15efd6635, 18814ba0c1, f247935bb9,
6a197e023e, 96a313c6dd, fb9b1c1f41, e1bca6db9d, 8438eda01a, b373f00840, 46decdb0ea, 835cdc2fe8,
19756bb7d6, 57e1f8ed28, 2649c8a715, ede97eded6, 4b7ff78c5a, 948758ad78, d71fa43ca3, 1e5266d4c9,
537fc27231, af593b1116, 3d73c0c240, 3a8e524f77, c0641c2b47, ef3a16a129, a0a284fe91, 0feeef2684,
cc50bdb06a, 23c2c3ae7c, 674524ba91, 60a9a7f837, 5b5c706581, 3e14a76623, 8cde1c81e5, 8d0a29b137,
cbfb2fe19d, 09e00f1d42, 290620fdee, f0d1b85bd8, aaef546f91, 811ddf2226, 79a339d353, e45e4c79d9,
848bf41bc9, d11cb087a7, 2dd7422f42, e8707c02c0, 55928d756a, a4370bca64, 5a5c5a8ca5, 1b470dd474,
52b4575245, ddd2d5b04c, fa22b4041a, 8faee143fa, 366ce98f08, 190e60a41c, b9558801a1, 36728215ac,
39551a0418, 39b98b2e76, 616162400d, 694d164db6, ef442cefb1, 14da241f35, 346a9e4287
19
.github/ISSUE_TEMPLATE/bug_report.md
vendored
Normal file
@@ -0,0 +1,19 @@
---
name: Bug report
about: Create a report to help us improve

---

**Describe the bug**
- What did you do?
- What happened?
- What was expected?

**Which version of tantivy are you using?**
If "master", ideally give the specific sha1 revision.

**To Reproduce**

If your bug is deterministic, can you give a minimal reproducing code?
Some bugs are not deterministic. Can you describe with precision in which context it happened?
If this is possible, can you share your code?
14
.github/ISSUE_TEMPLATE/feature_request.md
vendored
Normal file
@@ -0,0 +1,14 @@
---
name: Feature request
about: Suggest an idea for this project

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**[Optional] describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
7
.github/ISSUE_TEMPLATE/question.md
vendored
Normal file
@@ -0,0 +1,7 @@
---
name: Question
about: Ask any question about tantivy's usage...

---

Try to be specific about your use case...
1
.gitignore
vendored
@@ -1,3 +1,4 @@
tantivy.iml
*.swp
target
target/debug
107
.travis.yml
@@ -9,74 +9,42 @@ sudo: required
|
||||
env:
|
||||
global:
|
||||
- CRATE_NAME=tantivy
|
||||
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
|
||||
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
|
||||
addons:
|
||||
apt:
|
||||
sources:
|
||||
- ubuntu-toolchain-r-test
|
||||
- kalakris-cmake
|
||||
packages:
|
||||
- gcc-4.8
|
||||
- g++-4.8
|
||||
- libcurl4-openssl-dev
|
||||
- libelf-dev
|
||||
- libdw-dev
|
||||
- binutils-dev
|
||||
- cmake
|
||||
|
||||
matrix:
|
||||
include:
|
||||
# Android
|
||||
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
|
||||
- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
|
||||
- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
|
||||
- env: TARGET=i686-linux-android DISABLE_TESTS=1
|
||||
- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
|
||||
|
||||
# iOS
|
||||
#- env: TARGET=aarch64-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=armv7-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=armv7s-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
#- env: TARGET=i386-apple-ios DISABLE_TESTS=1
|
||||
# os: osx
|
||||
- env: TARGET=x86_64-apple-ios DISABLE_TESTS=1
|
||||
os: osx
|
||||
#- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
|
||||
#- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
|
||||
#- env: TARGET=i686-linux-android DISABLE_TESTS=1
|
||||
#- env: TARGET=x86_64-linux-android DISABLE_TESTS=1
|
||||
|
||||
# Linux
|
||||
- env: TARGET=aarch64-unknown-linux-gnu
|
||||
# - env: TARGET=arm-unknown-linux-gnueabi
|
||||
# - env: TARGET=armv7-unknown-linux-gnueabihf
|
||||
- env: TARGET=i686-unknown-linux-gnu
|
||||
#- env: TARGET=i686-unknown-linux-musl
|
||||
#- env: TARGET=mips-unknown-linux-gnu
|
||||
#- env: TARGET=mips64-unknown-linux-gnuabi64
|
||||
#- env: TARGET=mips64el-unknown-linux-gnuabi64
|
||||
#- env: TARGET=mipsel-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc64-unknown-linux-gnu
|
||||
#- env: TARGET=powerpc64le-unknown-linux-gnu
|
||||
#- env: TARGET=s390x-unknown-linux-gnu DISABLE_TESTS=1
|
||||
- env: TARGET=x86_64-unknown-linux-gnu
|
||||
- env: TARGET=x86_64-unknown-linux-musl
|
||||
#- env: TARGET=aarch64-unknown-linux-gnu
|
||||
#- env: TARGET=i686-unknown-linux-gnu
|
||||
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1
|
||||
# - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
|
||||
|
||||
# OSX
|
||||
#- env: TARGET=i686-apple-darwin
|
||||
# os: osx
|
||||
- env: TARGET=x86_64-apple-darwin
|
||||
os: osx
|
||||
|
||||
# *BSD
|
||||
#- env: TARGET=i686-unknown-freebsd DISABLE_TESTS=1
|
||||
#- env: TARGET=x86_64-unknown-freebsd DISABLE_TESTS=1
|
||||
#- env: TARGET=x86_64-unknown-netbsd DISABLE_TESTS=1
|
||||
|
||||
# Windows
|
||||
#- env: TARGET=x86_64-pc-windows-gnu
|
||||
|
||||
# Bare metal
|
||||
# These targets don't support std and as such are likely not suitable for
|
||||
# most crates.
|
||||
# - env: TARGET=thumbv6m-none-eabi
|
||||
# - env: TARGET=thumbv7em-none-eabi
|
||||
# - env: TARGET=thumbv7em-none-eabihf
|
||||
# - env: TARGET=thumbv7m-none-eabi
|
||||
|
||||
# Testing other channels
|
||||
#- env: TARGET=x86_64-unknown-linux-gnu
|
||||
# rust: nightly
|
||||
#- env: TARGET=x86_64-apple-darwin
|
||||
# os: osx
|
||||
# rust: nightly
|
||||
|
||||
before_install:
|
||||
- set -e
|
||||
- rustup self update
|
||||
@@ -85,31 +53,16 @@ install:
|
||||
- sh ci/install.sh
|
||||
- source ~/.cargo/env || true
|
||||
|
||||
before_script:
|
||||
- export PATH=$HOME/.cargo/bin:$PATH
|
||||
- cargo install cargo-update || echo "cargo-update already installed"
|
||||
- cargo install cargo-travis || echo "cargo-travis already installed"
|
||||
|
||||
script:
|
||||
- bash ci/script.sh
|
||||
|
||||
after_script: set +e
|
||||
|
||||
before_deploy:
|
||||
- sh ci/before_deploy.sh
|
||||
#
|
||||
#deploy:
|
||||
# # - Create a `public_repo` GitHub token. Go to: https://github.com/settings/tokens/new
|
||||
# # - Encrypt it: `travis encrypt 0123456789012345678901234567890123456789
|
||||
# # - Paste the output down here
|
||||
# api_key:
|
||||
# secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
# file_glob: true
|
||||
# file: $CRATE_NAME-$TRAVIS_TAG-$TARGET.*
|
||||
# on:
|
||||
# # TODO Here you can pick which targets will generate binary releases
|
||||
# # In this example, there are some targets that are tested using the stable
|
||||
# # and nightly channels. This condition makes sure there is only one release
|
||||
# # for such targets and that's generated using the stable channel
|
||||
# condition: $TRAVIS_RUST_VERSION = stable
|
||||
# tags: true
|
||||
# provider: releases
|
||||
# skip_cleanup: true
|
||||
|
||||
cache: cargo
|
||||
before_cache:
|
||||
@@ -124,4 +77,4 @@ before_cache:
|
||||
|
||||
notifications:
|
||||
email:
|
||||
on_success: never
|
||||
on_success: never
|
||||
22
CHANGELOG.md
@@ -1,3 +1,25 @@
Tantivy 0.8.0
=====================
*No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)
- Multithreaded search (@jwolfe, @fulmicoton)


Tantivy 0.7.1
=====================
*No change in the index format*
- Bugfix: NGramTokenizer panics on non ascii chars
- Added a space usage API

Tantivy 0.7
=====================
- Skip data for doc ids and positions (@fulmicoton),
greatly improving performance
- Tantivy error now rely on the failure crate (@drusellers)
- Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax
- Added a snippet generator with highlight (@vigneshsarma, @fulmicoton)
- Added a `TopFieldCollector` (@pentlander)

Tantivy 0.6.1
=========================
- Bugfix #324. GC removing was removing file that were still in useful
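The collector API break flagged for 0.8.0 is what turns `TopCollector` into `TopDocs` in the example diffs further down. A minimal, self-contained sketch of the new call shape, assembled from those examples (assuming tantivy 0.8 and the `doc!` macro; not an official migration guide):

```rust
#[macro_use]
extern crate tantivy;

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // Build a tiny in-RAM index so the example is self-contained.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(title => "Of Mice and Men"));
    index_writer.commit()?;
    index.load_searchers()?;

    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title]);
    let query = query_parser.parse_query("mice")?;

    // 0.7 style (removed in this release):
    //   let mut top_collector = TopCollector::with_limit(10);
    //   searcher.search(&*query, &mut top_collector)?;
    //   let doc_addresses = top_collector.docs();
    //
    // 0.8 style: the collector is passed by reference and `search`
    // returns its result ("fruit") directly.
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("{}", schema.to_json(&retrieved_doc));
    }
    Ok(())
}
```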
55
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.6.1"
|
||||
version = "0.8.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -12,67 +12,70 @@ readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.9.1"
|
||||
base64 = "0.10.0"
|
||||
byteorder = "1.0"
|
||||
lazy_static = "0.2.1"
|
||||
tinysegmenter = "0.1.0"
|
||||
regex = "0.2"
|
||||
lazy_static = "1"
|
||||
regex = "1.0"
|
||||
fst = {version="0.3", default-features=false}
|
||||
fst-regex = { version="0.2" }
|
||||
lz4 = {version="1.20", optional=true}
|
||||
snap = {version="0.2"}
|
||||
atomicwrites = {version="0.2.2", optional=true}
|
||||
tempfile = "2.1"
|
||||
log = "0.3.6"
|
||||
combine = "2.2"
|
||||
tempfile = "3.0"
|
||||
log = "0.4"
|
||||
combine = "3"
|
||||
tempdir = "0.3"
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
serde_json = "1.0"
|
||||
num_cpus = "1.2"
|
||||
itertools = "0.5.9"
|
||||
itertools = "0.8"
|
||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||
bit-set = "0.4.0"
|
||||
uuid = { version = "0.6", features = ["v4", "serde"] }
|
||||
chan = "0.1"
|
||||
crossbeam = "0.3"
|
||||
bit-set = "0.5"
|
||||
uuid = { version = "0.7", features = ["v4", "serde"] }
|
||||
crossbeam = "0.5"
|
||||
futures = "0.1"
|
||||
futures-cpupool = "0.1"
|
||||
error-chain = "0.8"
|
||||
owning_ref = "0.3"
|
||||
owning_ref = "0.4"
|
||||
stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "0.1.0"
|
||||
rust-stemmers = "1"
|
||||
downcast = { version="0.9" }
|
||||
matches = "0.1"
|
||||
bitpacking = "0.5"
|
||||
census = "0.1"
|
||||
fnv = "1.0.6"
|
||||
owned-read = "0.1"
|
||||
owned-read = "0.4"
|
||||
failure = "0.1"
|
||||
htmlescape = "0.3.1"
|
||||
fail = "0.2"
|
||||
scoped-pool = "1.0"
|
||||
murmurhash32 = "0.2"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.3"
|
||||
env_logger = "0.4"
|
||||
rand = "0.6"
|
||||
maplit = "1"
|
||||
|
||||
[profile.release]
|
||||
opt-level = 3
|
||||
debug = false
|
||||
lto = true
|
||||
debug-assertions = false
|
||||
|
||||
[profile.test]
|
||||
debug-assertions = true
|
||||
overflow-checks = true
|
||||
|
||||
[features]
|
||||
default = ["mmap"]
|
||||
# by default no-fail is disabled. We manually enable it when running test.
|
||||
default = ["mmap", "no_fail"]
|
||||
mmap = ["fst/mmap", "atomicwrites"]
|
||||
lz4-compression = ["lz4"]
|
||||
no_fail = ["fail/no_fail"]
|
||||
unstable = [] # useful for benches.
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
[[example]]
|
||||
name = "simple_search"
|
||||
required-features = ["mmap"]
|
||||
|
||||
[[example]]
|
||||
name = "custom_tokenizer"
|
||||
|
||||
34
README.md
@@ -1,14 +1,27 @@
|
||||

|
||||
|
||||
[](https://travis-ci.org/tantivy-search/tantivy)
|
||||
[](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
|
||||
[](https://codecov.io/gh/tantivy-search/tantivy)
|
||||
[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
|
||||
[](https://saythanks.io/to/fulmicoton)
|
||||
|
||||

|
||||
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/0)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/1)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/2)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/3)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/4)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/5)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
|
||||
|
||||
|
||||
|
||||
**Tantivy** is a **full text search engine library** written in rust.
|
||||
|
||||
It is closer to Lucene than to Elastic Search and Solr in the sense it is not
|
||||
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
|
||||
an off-the-shelf search engine server, but rather a crate that can be used
|
||||
to build such a search engine.
|
||||
|
||||
@@ -17,10 +30,11 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
# Features
|
||||
|
||||
- Full-text search
|
||||
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
|
||||
- Tiny startup time (<10ms), perfect for command line tools
|
||||
- BM25 scoring (the same as lucene)
|
||||
- Basic query language (`+michael +jackson`)
|
||||
- Phrase queries search (\"michael jackson\"`)
|
||||
- Natural query language `(michael AND jackson) OR "king of pop"`
|
||||
- Phrase queries search (`"michael jackson"`)
|
||||
- Incremental indexing
|
||||
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
|
||||
- Mmap directory
|
||||
@@ -30,12 +44,14 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
- LZ4 compressed document store
|
||||
- Range queries
|
||||
- Faceted search
|
||||
- Configurable indexing (optional term frequency and position indexing
|
||||
- Configurable indexing (optional term frequency and position indexing)
|
||||
- Cheesy logo with a horse
|
||||
|
||||
# Non-features
|
||||
|
||||
- Distributed search and will not be in the scope of tantivy.
|
||||
- Distributed search is out of the scope of tantivy. That being said, tantivy is meant as a
|
||||
library upon which one could build a distributed search. Serializable/mergeable collector state for instance,
|
||||
are within the scope of tantivy.
|
||||
|
||||
|
||||
# Supported OS and compiler
|
||||
@@ -64,6 +80,10 @@ To check out and run tests, you can simply run :
|
||||
cd tantivy
|
||||
cargo build
|
||||
|
||||
## Running tests
|
||||
|
||||
Some tests will not run with just `cargo test` because of `fail-rs`.
|
||||
To run the tests exhaustively, run `./run-tests.sh`.
|
||||
|
||||
# Contribute
|
||||
|
||||
|
||||
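The README feature list above advertises the basic `+`/`-` syntax, the natural `AND`/`OR` syntax and phrase queries. A small sketch of feeding those query strings to the `QueryParser`, assuming the 0.8 API used in the examples later in this diff (the exact parser behaviour is not verified here):

```rust
#[macro_use]
extern crate tantivy;

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let body = schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        title => "Michael Jackson",
        body => "the king of pop"
    ));
    index_writer.commit()?;
    index.load_searchers()?;
    let searcher = index.searcher();

    // Search both fields by default, as in the basic_search example.
    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    for q in vec![
        "+michael +jackson",                        // basic +/- syntax
        "(michael AND jackson) OR \"king of pop\"", // natural AND/OR syntax
        "\"michael jackson\"",                      // phrase query
    ] {
        let query = query_parser.parse_query(q)?;
        let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
        println!("{} -> {} hit(s)", q, top_docs.len());
    }
    Ok(())
}
```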
@@ -18,5 +18,5 @@ install:
build: false

test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --verbose
- REM SET RUST_BACKTRACE=1 & cargo run --example simple_search
- REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1
- REM SET RUST_BACKTRACE=1 & cargo build --examples
28
ci/script.sh
@@ -1,20 +1,26 @@
#!/usr/bin/env bash

# This script takes care of testing your crate

set -ex

main() {
cross build --target $TARGET
cross build --target $TARGET --release

if [ ! -z $DISABLE_TESTS ]; then
return
if [ ! -z $CODECOV ]; then
echo "Codecov"
cargo build --verbose && cargo coverage --verbose && bash <(curl -s https://codecov.io/bash) -s target/kcov
else
echo "Build"
cross build --target $TARGET
if [ ! -z $DISABLE_TESTS ]; then
return
fi
echo "Test"
cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1
fi

cross test --target $TARGET
# cross test --target $TARGET --release

# cross run --target $TARGET
# cross run --target $TARGET --release
for example in $(ls examples/*.rs)
do
cargo run --example $(basename $example .rs)
done
}

# we don't run the "test phase" when doing deploys
1
doc/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
book
5
doc/book.toml
Normal file
@@ -0,0 +1,5 @@
[book]
authors = ["Paul Masurel"]
multilingual = false
src = "src"
title = "Tantivy, the user guide"
15
doc/src/SUMMARY.md
Normal file
@@ -0,0 +1,15 @@
# Summary



[Avant Propos](./avant-propos.md)

- [Segments](./basis.md)
- [Defining your schema](./schema.md)
- [Facetting](./facetting.md)
- [Innerworkings](./innerworkings.md)
- [Inverted index](./inverted_index.md)
- [Best practise](./inverted_index.md)

[Frequently Asked Questions](./faq.md)
[Examples](./examples.md)
34
doc/src/avant-propos.md
Normal file
@@ -0,0 +1,34 @@
# Foreword, what is the scope of tantivy?

> Tantivy is a **search** engine **library** for Rust.

If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and
they both have the same scope and targetted use cases.

If you are not familiar with Lucene, let's break down our little tagline.

- **Search** here means full-text search : fundamentally, tantivy is here to help you
identify efficiently what are the documents matching a given query in your corpus.
But modern search UI are so much more : text processing, facetting, autocomplete, fuzzy search, good
relevancy, collapsing, highlighting, spatial search.

While some of these features are not available in tantivy yet, all of these are relevant
feature requests. Tantivy's objective is to offer a solid toolbox to create the best search
experience. But keep in mind this is just a toolbox.
Which bring us to the second keyword...

- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like elastic search for instance.

Sometimes a functionality will not be available in tantivy because it is too
specific to your use case. By design, tantivy should make it possible to extend
the available set of features using the existing rock-solid datastructures.

Most frequently this will mean writing your own `Collector`, your own `Scorer` or your own
`TokenFilter`... Some of your requirements may also be related to
something closer to architecture or operations. For instance, you may
want to build a large corpus on Hadoop, fine-tune the merge policy to keep your
index sharded in a time-wise fashion, or you may want to convert and existing
index from a different format.

Tantivy exposes a lot of low level API to do all of these things.
77
doc/src/basis.md
Normal file
@@ -0,0 +1,77 @@
# Anatomy of an index

## Straight from disk

Tantivy accesses its data using an abstracting trait called `Directory`.
In theory, one can come and override the data access logic. In practise, the
trait somewhat assumes that your data can be mapped to memory, and tantivy
seems deeply married to using `mmap` for its io [^1], and the only persisting
directory shipped with tantivy is the `MmapDirectory`.

While this design has some downsides, this greatly simplifies the source code of
tantivy. Caching is also entirely delegated to the OS.

`tantivy` works entirely (or almost) by directly reading the datastructures as they are layed on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.

This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.

In later chapters, we will discuss tantivy's inverted index data layout.
One key take away is that to achieve great performance, search indexes are extremely compact.
Of course this is crucial to reduce IO, and ensure that as much of our index can sit in RAM.

Also, whenever possible its data is accessed sequentially. Of course, this is an amazing property when tantivy needs to access the data from your spinning hard disk, but this is also
critical for performance, if your data is read from and an `SSD` or even already in your pagecache.


## Segments, and the log method

That kind of compact layout comes at one cost: it prevents our datastructures from being dynamic.
In fact, the `Directory` trait does not even allow you to modify part of a file.

To allow the addition / deletion of documents, and create the illusion that
your index is dynamic (i.e.: adding and deleting documents), tantivy uses a common database trick sometimes referred to as the *log method*.

Let's forget about deletes for a moment.

As you add documents, these documents are processed and stored in a dedicated datastructure, in a `RAM` buffer. This datastructure is not ready for search, but it is useful to receive your data and rearrange it very rapidly.

As you add documents, this buffer will reach its capacity and tantivy will transparently stop adding document to it and start converting this datastructure to its final read-only format on disk. Once written, an brand empty buffer is available to resume adding documents.

The resulting chunk of index obtained after this serialization is called a `Segment`.

> A segment is a self-contained atomic piece of index. It is identified with a UUID, and all of its files are identified using the naming scheme : `<UUID>.*`.

Which brings us to the nature of a tantivy `Index`.

> A tantivy `Index` is a collection of `Segments`.

Physically, this really just means and index is a bunch of segment files in a given `Directory`,
linked together by a `meta.json` file. This transparency can become extremely handy
to get tantivy to fit your use case:

*Example 1* You could for instance use hadoop to build a very large search index in a timely manner, copy all of the resulting segment files in the same directory and edit the `meta.json` to get a functional index.[^2]

*Example 2* You could also disable your merge policy and enforce daily segments. Removing data after one week can then be done very efficiently by just editing the `meta.json` and deleting the files associated to segment `D-7`.


# Merging

As you index more and more data, your index will accumulate more and more segments.
Having a lot of small segments is not really optimal. There is a bit of redundancy in having
all these term dictionary. Also when searching, we will need to do term lookups as many times as we have segments. It can hurt search performance a bit.

That's where merging or compacting comes into place. Tantivy will continuously consider merge
opportunities and start merging segments in the background.


# Indexing throughput, number of indexing threads


[^1]: This may eventually change.

[^2]: Be careful however. By default these files will not be considered as *managed* by tantivy. This means they will never be garbage collected by tantivy, regardless of whether they become obsolete or not.
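A short sketch of the segment layout described above: each commit flushes the RAM buffer into a new segment, and the directory ends up with `meta.json` plus `<UUID>.*` files. It only uses calls that appear in this diff's examples (`Index::create_in_dir`, `commit`, `wait_merging_threads`); the directory listing at the end is illustrative, not an API guarantee:

```rust
#[macro_use]
extern crate tantivy;
extern crate tempdir;

use tantivy::schema::{Schema, TEXT};
use tantivy::Index;
use tempdir::TempDir;

fn main() -> tantivy::Result<()> {
    let dir = TempDir::new("tantivy_segments_demo")?;

    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    let index = Index::create_in_dir(dir.path(), schema)?;
    let mut index_writer = index.writer(50_000_000)?;

    // Two commits -> (at least) two segments, until a background merge
    // compacts them.
    index_writer.add_document(doc!(body => "first batch"));
    index_writer.commit()?;
    index_writer.add_document(doc!(body => "second batch"));
    index_writer.commit()?;
    index_writer.wait_merging_threads()?;

    // `meta.json` ties the segment files together.
    for entry in std::fs::read_dir(dir.path())? {
        println!("{}", entry?.path().display());
    }
    Ok(())
}
```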
0
doc/src/best_practise.md.rs
Normal file
3
doc/src/examples.md
Normal file
@@ -0,0 +1,3 @@
# Examples

- [Basic search](/examples/basic_search.html)
5
doc/src/facetting.md
Normal file
@@ -0,0 +1,5 @@
# Facetting

wewew

## weeewe
0
doc/src/faq.md
Normal file
1
doc/src/innerworkings.md
Normal file
@@ -0,0 +1 @@
# Innerworkings
1
doc/src/inverted_index.md
Normal file
@@ -0,0 +1 @@
# Inverted index
1
doc/src/schema.md
Normal file
@@ -0,0 +1 @@
# Defining your schema
@@ -1,26 +1,32 @@
|
||||
extern crate tantivy;
|
||||
// # Basic Example
|
||||
//
|
||||
// This example covers the basic functionalities of
|
||||
// tantivy.
|
||||
//
|
||||
// We will :
|
||||
// - define our schema
|
||||
// = create an index in a directory
|
||||
// - index few documents in our index
|
||||
// - search for the best document matchings "sea whale"
|
||||
// - retrieve the best document original content.
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate serde_json;
|
||||
|
||||
use std::path::Path;
|
||||
use tantivy::collector::TopCollector;
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn main() {
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
if let Ok(dir) = TempDir::new("tantivy_example_dir") {
|
||||
run_example(dir.path()).unwrap();
|
||||
dir.close().unwrap();
|
||||
}
|
||||
}
|
||||
let index_path = TempDir::new("tantivy_example_dir")?;
|
||||
|
||||
fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
//
|
||||
// The Tantivy index requires a very strict schema.
|
||||
@@ -29,13 +35,13 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// be indexed".
|
||||
|
||||
// first we need to define a schema ...
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
// Our first field is title.
|
||||
// We want full-text search for it, and we also want
|
||||
// to be able to retrieve the document after the search.
|
||||
//
|
||||
// TEXT | STORED is some syntactic sugar to describe
|
||||
// `TEXT | STORED` is some syntactic sugar to describe
|
||||
// that.
|
||||
//
|
||||
// `TEXT` means the field should be tokenized and indexed,
|
||||
@@ -64,21 +70,22 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
//
|
||||
// This will actually just save a meta.json
|
||||
// with our schema in the directory.
|
||||
let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
let index = Index::create_in_dir(&index_path, schema.clone())?;
|
||||
|
||||
// To insert document we need an index writer.
|
||||
// There must be only one writer at a time.
|
||||
// This single `IndexWriter` is already
|
||||
// multithreaded.
|
||||
//
|
||||
// Here we use a buffer of 50MB per thread. Using a bigger
|
||||
// heap for the indexer can increase its throughput.
|
||||
// Here we give tantivy a budget of `50MB`.
|
||||
// Using a bigger heap for the indexer may increase
|
||||
// throughput, but 50 MB is already plenty.
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
// Let's index our documents!
|
||||
// We first need a handle on the title and the body field.
|
||||
|
||||
// ### Create a document "manually".
|
||||
// ### Adding documents
|
||||
//
|
||||
// We can create a document manually, by setting the fields
|
||||
// one by one in a Document object.
|
||||
@@ -96,46 +103,47 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// ... and add it to the `IndexWriter`.
|
||||
index_writer.add_document(old_man_doc);
|
||||
|
||||
// ### Create a document directly from json.
|
||||
//
|
||||
// Alternatively, we can use our schema to parse a
|
||||
// document object directly from json.
|
||||
// The document is a string, but we use the `json` macro
|
||||
// from `serde_json` for the convenience of multi-line support.
|
||||
let json = json!({
|
||||
"title": "Of Mice and Men",
|
||||
"body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
});
|
||||
let mice_and_men_doc = schema.parse_document(&json.to_string())?;
|
||||
// For convenience, tantivy also comes with a macro to
|
||||
// reduce the boilerplate above.
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
index_writer.add_document(mice_and_men_doc);
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
// Multi-valued field are allowed, they are
|
||||
// expressed in JSON by an array.
|
||||
// The following document has two titles.
|
||||
let json = json!({
|
||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
});
|
||||
let frankenstein_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(frankenstein_doc);
|
||||
// Multivalued field just need to be repeated.
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
title => "The Modern Prometheus",
|
||||
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
));
|
||||
|
||||
// This is an example, so we will only index 3 documents
|
||||
// here. You can check out tantivy's tutorial to index
|
||||
// the English wikipedia. Tantivy's indexing is rather fast.
|
||||
// Indexing 5 million articles of the English wikipedia takes
|
||||
// around 4 minutes on my computer!
|
||||
// around 3 minutes on my computer!
|
||||
|
||||
// ### Committing
|
||||
//
|
||||
@@ -160,17 +168,29 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
|
||||
// # Searching
|
||||
//
|
||||
// ### Searcher
|
||||
//
|
||||
// Let's search our index. Start by reloading
|
||||
// searchers in the index. This should be done
|
||||
// after every commit().
|
||||
// after every `commit()`.
|
||||
index.load_searchers()?;
|
||||
|
||||
// Afterwards create one (or more) searchers.
|
||||
// We now need to acquire a searcher.
|
||||
// Some search experience might require more than
|
||||
// one query.
|
||||
//
|
||||
// You should create a searcher
|
||||
// every time you start a "search query".
|
||||
// The searcher ensure that we get to work
|
||||
// with a consistent version of the index.
|
||||
//
|
||||
// Acquiring a `searcher` is very cheap.
|
||||
//
|
||||
// You should acquire a searcher every time you
|
||||
// start processing a request and
|
||||
// and release it right after your query is finished.
|
||||
let searcher = index.searcher();
|
||||
|
||||
// ### Query
|
||||
|
||||
// The query parser can interpret human queries.
|
||||
// Here, if the user does not specify which
|
||||
// field they want to search, tantivy will search
|
||||
@@ -193,15 +213,10 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
//
|
||||
// We are not interested in all of the documents but
|
||||
// only in the top 10. Keeping track of our top 10 best documents
|
||||
// is the role of the TopCollector.
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
// is the role of the TopDocs.
|
||||
|
||||
// We can now perform our query.
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
// Our top collector now contains the 10
|
||||
// most relevant doc ids...
|
||||
let doc_addresses = top_collector.docs();
|
||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||
|
||||
// The actual documents still need to be
|
||||
// retrieved from Tantivy's store.
|
||||
@@ -210,16 +225,10 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// the document returned will only contain
|
||||
// a title.
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
for (_score, doc_address) in top_docs {
|
||||
let retrieved_doc = searcher.doc(doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
// Wait for indexing and merging threads to shut down.
|
||||
// Usually this isn't needed, but in `main` we try to
|
||||
// delete the temporary directory and that fails on
|
||||
// Windows if the files are still open.
|
||||
index_writer.wait_merging_threads()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
187
examples/custom_collector.rs
Normal file
@@ -0,0 +1,187 @@
|
||||
// # Custom collector example
|
||||
//
|
||||
// This example shows how you can implement your own
|
||||
// collector. As an example, we will compute a collector
|
||||
// that computes the standard deviation of a given fast field.
|
||||
//
|
||||
// Of course, you can have a look at the tantivy's built-in collectors
|
||||
// such as the `CountCollector` for more examples.
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::{Collector, SegmentCollector};
|
||||
use tantivy::fastfield::FastFieldReader;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::{Schema, FAST, INT_INDEXED, TEXT};
|
||||
use tantivy::Index;
|
||||
use tantivy::SegmentReader;
|
||||
|
||||
#[derive(Default)]
|
||||
struct Stats {
|
||||
count: usize,
|
||||
sum: f64,
|
||||
squared_sum: f64,
|
||||
}
|
||||
|
||||
impl Stats {
|
||||
pub fn count(&self) -> usize {
|
||||
self.count
|
||||
}
|
||||
|
||||
pub fn mean(&self) -> f64 {
|
||||
self.sum / (self.count as f64)
|
||||
}
|
||||
|
||||
fn square_mean(&self) -> f64 {
|
||||
self.squared_sum / (self.count as f64)
|
||||
}
|
||||
|
||||
pub fn standard_deviation(&self) -> f64 {
|
||||
let mean = self.mean();
|
||||
(self.square_mean() - mean * mean).sqrt()
|
||||
}
|
||||
|
||||
fn non_zero_count(self) -> Option<Stats> {
|
||||
if self.count == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(self)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct StatsCollector {
|
||||
field: Field,
|
||||
}
|
||||
|
||||
impl StatsCollector {
|
||||
fn with_field(field: Field) -> StatsCollector {
|
||||
StatsCollector { field }
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for StatsCollector {
|
||||
// That's the type of our result.
|
||||
// Our standard deviation will be a float.
|
||||
type Fruit = Option<Stats>;
|
||||
|
||||
type Child = StatsSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: u32,
|
||||
segment: &SegmentReader,
|
||||
) -> tantivy::Result<StatsSegmentCollector> {
|
||||
let fast_field_reader = segment.fast_field_reader(self.field)?;
|
||||
Ok(StatsSegmentCollector {
|
||||
fast_field_reader,
|
||||
stats: Stats::default(),
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
// this collector does not care about score.
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
|
||||
let mut stats = Stats::default();
|
||||
for segment_stats_opt in segment_stats {
|
||||
if let Some(segment_stats) = segment_stats_opt {
|
||||
stats.count += segment_stats.count;
|
||||
stats.sum += segment_stats.sum;
|
||||
stats.squared_sum += segment_stats.squared_sum;
|
||||
}
|
||||
}
|
||||
Ok(stats.non_zero_count())
|
||||
}
|
||||
}
|
||||
|
||||
struct StatsSegmentCollector {
|
||||
fast_field_reader: FastFieldReader<u64>,
|
||||
stats: Stats,
|
||||
}
|
||||
|
||||
impl SegmentCollector for StatsSegmentCollector {
|
||||
type Fruit = Option<Stats>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let value = self.fast_field_reader.get(doc) as f64;
|
||||
self.stats.count += 1;
|
||||
self.stats.sum += value;
|
||||
self.stats.squared_sum += value * value;
|
||||
}
|
||||
|
||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||
self.stats.non_zero_count()
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
//
|
||||
// The Tantivy index requires a very strict schema.
|
||||
// The schema declares which fields are in the index,
|
||||
// and for each field, its type and "the way it should
|
||||
// be indexed".
|
||||
|
||||
// first we need to define a schema ...
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
// We'll assume a fictional index containing
|
||||
// products, and with a name, a description, and a price.
|
||||
let product_name = schema_builder.add_text_field("name", TEXT);
|
||||
let product_description = schema_builder.add_text_field("description", TEXT);
|
||||
let price = schema_builder.add_u64_field("price", INT_INDEXED | FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
//
|
||||
// Lets index a bunch of fake documents for the sake of
|
||||
// this example.
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
index_writer.add_document(doc!(
|
||||
product_name => "Super Broom 2000",
|
||||
product_description => "While it is ok for short distance travel, this broom \
|
||||
was designed quiditch. It will up your game.",
|
||||
price => 30_200u64
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
product_name => "Turbulobroom",
|
||||
product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
|
||||
You'll enjoy its sharp turns, and rapid acceleration",
|
||||
price => 29_240u64
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
product_name => "Broomio",
|
||||
product_description => "Great value for the price. This broom is a market favorite",
|
||||
price => 21_240u64
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
product_name => "Whack a Mole",
|
||||
product_description => "Prime quality bat.",
|
||||
price => 5_200u64
|
||||
));
|
||||
index_writer.commit()?;
|
||||
index.load_searchers()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);
|
||||
|
||||
// here we want to get a hit on the 'ken' in Frankenstein
|
||||
let query = query_parser.parse_query("broom")?;
|
||||
if let Some(stats) = searcher.search(&query, &StatsCollector::with_field(price))? {
|
||||
println!("count: {}", stats.count());
|
||||
println!("mean: {}", stats.mean());
|
||||
println!("standard deviation: {}", stats.standard_deviation());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,27 +1,17 @@
|
||||
extern crate tantivy;
|
||||
extern crate tempdir;
|
||||
// # Defining a tokenizer pipeline
|
||||
//
|
||||
// In this example, we'll see how to define a tokenizer pipeline
|
||||
// by aligning a bunch of `TokenFilter`.
|
||||
|
||||
#[macro_use]
|
||||
extern crate serde_json;
|
||||
|
||||
use std::path::Path;
|
||||
use tantivy::collector::TopCollector;
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::tokenizer::NgramTokenizer;
|
||||
use tantivy::Index;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn main() {
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
if let Ok(dir) = TempDir::new("tantivy_token_example_dir") {
|
||||
run_example(dir.path()).unwrap();
|
||||
dir.close().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
//
|
||||
// The Tantivy index requires a very strict schema.
|
||||
@@ -30,7 +20,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// be indexed".
|
||||
|
||||
// first we need to define a schema ...
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
// Our first field is title.
|
||||
// In this example we want to use NGram searching
|
||||
@@ -42,7 +32,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
let title = schema_builder.add_text_field("title", text_options);
|
||||
|
||||
// Our second field is body.
|
||||
// We want full-text search for it, but we do not
|
||||
@@ -51,17 +41,17 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
//
|
||||
// We can make our index lighter and
|
||||
// by omitting `STORED` flag.
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
let body = schema_builder.add_text_field("body", TEXT);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
//
|
||||
// Let's create a brand new index.
|
||||
//
|
||||
// This will actually just save a meta.json
|
||||
// with our schema in the directory.
|
||||
let index = Index::create_in_dir(index_path, schema.clone())?;
|
||||
// To simplify we will work entirely in RAM.
|
||||
// This is not what you want in reality, but it is very useful
|
||||
// for your unit tests... Or this example.
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
// here we are registering our custome tokenizer
|
||||
// this will store tokens of 3 characters each
|
||||
@@ -77,101 +67,32 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// Here we use a buffer of 50MB per thread. Using a bigger
|
||||
// heap for the indexer can increase its throughput.
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
// Let's index our documents!
|
||||
// We first need a handle on the title and the body field.
|
||||
|
||||
// ### Create a document "manually".
|
||||
//
|
||||
// We can create a document manually, by setting the fields
|
||||
// one by one in a Document object.
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
let mut old_man_doc = Document::default();
|
||||
old_man_doc.add_text(title, "The Old Man and the Sea");
|
||||
old_man_doc.add_text(
|
||||
body,
|
||||
"He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish.",
|
||||
);
|
||||
|
||||
// ... and add it to the `IndexWriter`.
|
||||
index_writer.add_document(old_man_doc);
|
||||
|
||||
// ### Create a document directly from json.
|
||||
//
|
||||
// Alternatively, we can use our schema to parse a
|
||||
// document object directly from json.
|
||||
// The document is a string, but we use the `json` macro
|
||||
// from `serde_json` for the convenience of multi-line support.
|
||||
let json = json!({
|
||||
"title": "Of Mice and Men",
|
||||
"body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
});
|
||||
let mice_and_men_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(mice_and_men_doc);
|
||||
|
||||
// Multi-valued field are allowed, they are
|
||||
// expressed in JSON by an array.
|
||||
// The following document has two titles.
|
||||
let json = json!({
|
||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
});
|
||||
let frankenstein_doc = schema.parse_document(&json.to_string())?;
|
||||
|
||||
index_writer.add_document(frankenstein_doc);
|
||||
|
||||
// This is an example, so we will only index 3 documents
|
||||
// here. You can check out tantivy's tutorial to index
|
||||
// the English wikipedia. Tantivy's indexing is rather fast.
|
||||
// Indexing 5 million articles of the English wikipedia takes
|
||||
// around 4 minutes on my computer!
|
||||
|
||||
// ### Committing
|
||||
//
|
||||
// At this point our documents are not searchable.
|
||||
//
|
||||
//
|
||||
// We need to call .commit() explicitly to force the
|
||||
// index_writer to finish processing the documents in the queue,
|
||||
// flush the current index to the disk, and advertise
|
||||
// the existence of new documents.
|
||||
//
|
||||
// This call is blocking.
|
||||
index_writer.add_document(doc!(
|
||||
title => "The Old Man and the Sea",
|
||||
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish."
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent
|
||||
limbs and branches that arch over the pool"#
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and
|
||||
increasing confidence in the success of my undertaking."#
|
||||
));
|
||||
index_writer.commit()?;
|
||||
|
||||
// If `.commit()` returns correctly, then all of the
|
||||
// documents that have been added are guaranteed to be
|
||||
// persistently indexed.
|
||||
//
|
||||
// In the scenario of a crash or a power failure,
|
||||
// tantivy behaves as if has rolled back to its last
|
||||
// commit.
|
||||
|
||||
// # Searching
|
||||
//
|
||||
// Let's search our index. Start by reloading
|
||||
// searchers in the index. This should be done
|
||||
// after every commit().
|
||||
index.load_searchers()?;
|
||||
|
||||
// Afterwards create one (or more) searchers.
|
||||
//
|
||||
// You should create a searcher
|
||||
// every time you start a "search query".
|
||||
let searcher = index.searcher();
|
||||
|
||||
// The query parser can interpret human queries.
|
||||
@@ -183,44 +104,12 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
|
||||
// here we want to get a hit on the 'ken' in Frankenstein
|
||||
let query = query_parser.parse_query("ken")?;
|
||||
|
||||
// A query defines a set of documents, as
|
||||
// well as the way they should be scored.
|
||||
//
|
||||
// A query created by the query parser is scored according
|
||||
// to a metric called Tf-Idf, and will consider
|
||||
// any document matching at least one of our terms.
|
||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||
|
||||
// ### Collectors
|
||||
//
|
||||
// We are not interested in all of the documents but
|
||||
// only in the top 10. Keeping track of our top 10 best documents
|
||||
// is the role of the TopCollector.
|
||||
let mut top_collector = TopCollector::with_limit(10);
|
||||
|
||||
// We can now perform our query.
|
||||
searcher.search(&*query, &mut top_collector)?;
|
||||
|
||||
// Our top collector now contains the 10
|
||||
// most relevant doc ids...
|
||||
let doc_addresses = top_collector.docs();
|
||||
|
||||
// The actual documents still need to be
|
||||
// retrieved from Tantivy's store.
|
||||
//
|
||||
// Since the body field was not configured as stored,
|
||||
// the document returned will only contain
|
||||
// a title.
|
||||
|
||||
for doc_address in doc_addresses {
|
||||
let retrieved_doc = searcher.doc(&doc_address)?;
|
||||
for (_, doc_address) in top_docs {
|
||||
let retrieved_doc = searcher.doc(doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
// Wait for indexing and merging threads to shut down.
|
||||
// Usually this isn't needed, but in `main` we try to
|
||||
// delete the temporary directory and that fails on
|
||||
// Windows if the files are still open.
|
||||
index_writer.wait_merging_threads()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
142
examples/deleting_updating_documents.rs
Normal file
@@ -0,0 +1,142 @@
// # Deleting and Updating (?) documents
//
// This example explains how to delete and update documents.
// There is actually no such thing as an update in tantivy.
//
// To update a document, you need to delete the document and then reinsert
// its new version.
//
// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::Index;

// A simple helper function to fetch a single document
// given its id from our index.
// It will be helpful to check our work.
fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Option<Document>> {
    let searcher = index.searcher();

    // This is the simplest query you can think of.
    // It matches all of the documents containing a specific term.
    //
    // The second argument is here to tell tantivy that we don't care
    // about decoding positions or term frequencies.
    let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic);
    let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1))?;

    if let Some((_score, doc_address)) = top_docs.first() {
        let doc = searcher.doc(*doc_address)?;
        Ok(Some(doc))
    } else {
        // No doc matching this ID.
        Ok(None)
    }
}

fn main() -> tantivy::Result<()> {
    // # Defining the schema
    //
    // Check out the *basic_search* example if this makes
    // little sense to you.
    let mut schema_builder = Schema::builder();

    // Tantivy does not really have a notion of a primary id.
    // This may change in the future.
    //
    // Still, we can create an `isbn` field and use it as an id. This
    // field can be a `u64` or a `text`, depending on your use case.
    // It just needs to be indexed.
    //
    // If it is `text`, let's make sure to keep it `raw` and avoid
    // running any text processing on it.
    // This is done by associating this field to the tokenizer named `raw`.
    // Rather than building our [`TextOptions`](//docs.rs/tantivy/~0/tantivy/schema/struct.TextOptions.html) manually,
    // we use the `STRING` shortcut. `STRING` stands for indexed (without term frequency or positions)
    // and untokenized.
    //
    // Because we also want to be able to see this `id` in our returned documents,
    // we also mark the field as stored.
    let isbn = schema_builder.add_text_field("isbn", STRING | STORED);
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;

    // Let's add a couple of documents, for the sake of the example.
    let mut old_man_doc = Document::default();
    old_man_doc.add_text(title, "The Old Man and the Sea");
    index_writer.add_document(doc!(
        isbn => "978-0099908401",
        title => "The old Man and the see"
    ));
    index_writer.add_document(doc!(
        isbn => "978-0140177398",
        title => "Of Mice and Men",
    ));
    index_writer.add_document(doc!(
        title => "Frankentein", //< Oops there is a typo here.
        isbn => "978-9176370711",
    ));
    index_writer.commit()?;
    index.load_searchers()?;

    let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");

    // Oops, our Frankenstein doc seems misspelled.
    let frankenstein_doc_misspelled = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap();
    assert_eq!(
        schema.to_json(&frankenstein_doc_misspelled),
        r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
    );

    // # Update = Delete + Insert
    //
    // Here we want to fix the typo in the `Frankenstein` book.
    //
    // Tantivy does not handle updates directly: we need to delete
    // and reinsert the document.
    //
    // This can be complicated, as it means you need to have access
    // to the entire document. It is good practice to integrate tantivy
    // with a key-value store for this reason.
    //
    // To remove one of the documents, we just call `delete_term`
    // on its id.
    //
    // Note that `tantivy` does nothing to enforce the idea that
    // there is only one document associated with this id.
    //
    // Also, you might have noticed that we apply the delete before
    // having committed. This does not really matter...
    index_writer.delete_term(frankenstein_isbn.clone());

    // We now need to reinsert our document without the typo.
    index_writer.add_document(doc!(
        title => "Frankenstein",
        isbn => "978-9176370711",
    ));

    // You are guaranteed that your clients will only observe your index in
    // the state it was in after a commit.
    // In this example, your search engine will at no point be missing the *Frankenstein* document.
    // Everything happens as if the document had been updated.
    index_writer.commit()?;
    // We reload our searchers to make the change visible to clients.
    index.load_searchers()?;

    // No more typo!
    let frankenstein_new_doc = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap();
    assert_eq!(
        schema.to_json(&frankenstein_new_doc),
        r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
    );

    Ok(())
}
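The comments above boil updates down to "delete the old version, reinsert the new one" and recommend keeping the authoritative copy of each document in an external key-value store. Below is a small sketch of that pattern with a plain `HashMap` standing in for the external store; the helper name, the schema and the ISBNs are illustrative assumptions, not part of this commit.

// Sketch: "update" as delete_term + add_document, with a HashMap as the
// stand-in source of truth. Only API calls shown in this changeset are used.
extern crate tantivy;

use std::collections::HashMap;
use tantivy::schema::*;
use tantivy::{Index, IndexWriter};

fn upsert_book(
    index_writer: &mut IndexWriter,
    store: &mut HashMap<String, String>, // isbn -> title: the authoritative copy
    isbn_field: Field,
    title_field: Field,
    isbn: &str,
    title: &str,
) {
    // Keep the external store up to date first...
    store.insert(isbn.to_string(), title.to_string());
    // ...then delete any previous version from the index and reinsert the new one.
    index_writer.delete_term(Term::from_field_text(isbn_field, isbn));
    let mut doc = Document::default();
    doc.add_text(isbn_field, isbn);
    doc.add_text(title_field, title);
    index_writer.add_document(doc);
}

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let isbn = schema_builder.add_text_field("isbn", STRING | STORED);
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer(50_000_000)?;
    let mut store = HashMap::new();

    // First insertion, then an "update" that silently deletes and reinserts.
    upsert_book(&mut index_writer, &mut store, isbn, title, "978-9176370711", "Frankentein");
    upsert_book(&mut index_writer, &mut store, isbn, title, "978-9176370711", "Frankenstein");

    // Both steps land in the same commit, so readers only ever see the fixed title.
    index_writer.commit()?;
    Ok(())
}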
80
examples/faceted_search.rs
Normal file
@@ -0,0 +1,80 @@
// # Faceted Search Example
//
// This example covers the basic functionalities of
// faceted search in tantivy.
//
// We will:
// - define our schema
// - create an index in a directory
// - index a few documents with facets into our index
// - count the documents under each facet of the `/pools` tree

extern crate tempdir;

// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::collector::FacetCollector;
use tantivy::query::AllQuery;
use tantivy::schema::*;
use tantivy::Index;
use tempdir::TempDir;

fn main() -> tantivy::Result<()> {
    // Let's create a temporary directory for the
    // sake of this example.
    let index_path = TempDir::new("tantivy_facet_example_dir")?;
    let mut schema_builder = Schema::builder();

    schema_builder.add_text_field("name", TEXT | STORED);

    // This is our faceted field.
    schema_builder.add_facet_field("tags");

    let schema = schema_builder.build();

    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    let name = schema.get_field("name").unwrap();
    let tags = schema.get_field("tags").unwrap();

    // The `doc!` macro keeps the indexing boilerplate short.
    index_writer.add_document(doc!(
        name => "the ditch",
        tags => Facet::from("/pools/north")
    ));

    index_writer.add_document(doc!(
        name => "little stacey",
        tags => Facet::from("/pools/south")
    ));

    index_writer.commit()?;

    index.load_searchers()?;

    let searcher = index.searcher();

    let mut facet_collector = FacetCollector::for_field(tags);
    facet_collector.add_facet("/pools");

    let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();

    // This lists all of the facet counts.
    let facets: Vec<(&Facet, u64)> = facet_counts.get("/pools").collect();
    assert_eq!(
        facets,
        vec![
            (&Facet::from("/pools/north"), 1),
            (&Facet::from("/pools/south"), 1),
        ]
    );

    Ok(())
}
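The collector above counts documents under each immediate child of "/pools". A document may also carry several facet values at once, and each value is counted under its own subtree. The sketch below illustrates this with an in-RAM index; the "/lang" and "/category" trees are illustrative assumptions, and only calls already shown in this example are used.

// Sketch: one document with two facet values, counted under the /lang subtree.
#[macro_use]
extern crate tantivy;

use tantivy::collector::FacetCollector;
use tantivy::query::AllQuery;
use tantivy::schema::*;
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let name = schema_builder.add_text_field("name", TEXT | STORED);
    let tags = schema_builder.add_facet_field("tags");
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    // This book gets two tags: one under /lang and one under /category.
    index_writer.add_document(doc!(
        name => "Frankenstein",
        tags => Facet::from("/lang/en"),
        tags => Facet::from("/category/novel")
    ));
    index_writer.add_document(doc!(
        name => "Of Mice and Men",
        tags => Facet::from("/lang/en")
    ));
    index_writer.commit()?;
    index.load_searchers()?;

    let searcher = index.searcher();
    let mut facet_collector = FacetCollector::for_field(tags);
    facet_collector.add_facet("/lang");
    let facet_counts = searcher.search(&AllQuery, &facet_collector)?;

    // Both documents are tagged under /lang/en, so its count is 2.
    let langs: Vec<(&Facet, u64)> = facet_counts.get("/lang").collect();
    assert_eq!(langs, vec![(&Facet::from("/lang/en"), 2)]);
    Ok(())
}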
@@ -1,2 +0,0 @@
#!/bin/bash
docco simple_search.rs -o html
@@ -1,518 +0,0 @@
Deleted docco.css stylesheet not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
375
examples/html/public/stylesheets/normalize.css
vendored
@@ -1,375 +0,0 @@
Deleted vendored normalize.css (v2.0.1) not shown.
@@ -1,542 +0,0 @@
Deleted docco-generated HTML page for simple_search.rs not shown; it duplicated the content of simple_search.rs.
133
examples/iterating_docs_and_positions.rs
Normal file
@@ -0,0 +1,133 @@
// # Iterating docs and positions.
//
// At its core, tantivy relies on a data structure
// called an inverted index.
//
// This example shows how to manually iterate through
// the list of documents containing a term, getting
// its term frequency, and accessing its positions.

// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::{DocId, DocSet, Postings};

fn main() -> tantivy::Result<()> {
    // We first create a schema for the sake of the
    // example. Check the `basic_search` example for more information.
    let mut schema_builder = Schema::builder();

    // For this example, we need to make sure to index positions for our title
    // field. `TEXT` precisely does this.
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
    index_writer.add_document(doc!(title => "The Old Man and the Sea"));
    index_writer.add_document(doc!(title => "Of Mice and Men"));
    index_writer.add_document(doc!(title => "The modern Promotheus"));
    index_writer.commit()?;

    index.load_searchers()?;

    let searcher = index.searcher();

    // A tantivy index is actually a collection of segments.
    // Similarly, a searcher just wraps a list of `segment_reader`s.
    //
    // (Because we indexed a very small number of documents over one thread,
    // there is actually only one segment here, but let's iterate through the list
    // anyway.)
    for segment_reader in searcher.segment_readers() {
        // A segment contains different data structures.
        // The inverted index stands for the combination of
        // - the term dictionary
        // - the inverted lists associated with each term, and their positions
        let inverted_index = segment_reader.inverted_index(title);

        // A `Term` is a text token associated with a field.
        // Let's go through all docs containing the term `title:the` and access their positions.
        let term_the = Term::from_field_text(title, "the");

        // This segment postings object is like a cursor over the documents matching the term.
        // The `IndexRecordOption` argument tells tantivy we will be interested in both term frequencies
        // and positions.
        //
        // If you don't need all this information, you may get better performance by decompressing less
        // information.
        if let Some(mut segment_postings) =
            inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)
        {
            // This buffer will be used to request positions.
            let mut positions: Vec<u32> = Vec::with_capacity(100);
            while segment_postings.advance() {
                // The doc id of the current document.
                let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once.

                // This MAY contain deleted documents as well.
                if segment_reader.is_deleted(doc_id) {
                    continue;
                }

                // The number of times the term appears in the document.
                let term_freq: u32 = segment_postings.term_freq();
                // Accessing positions is slightly expensive and lazy: do not request
                // them if you don't need them for some documents.
                segment_postings.positions(&mut positions);

                // By definition we should have `term_freq` positions.
                assert_eq!(positions.len(), term_freq as usize);

                // This prints:
                // ```
                // Doc 0: TermFreq 2: [0, 4]
                // Doc 2: TermFreq 1: [0]
                // ```
                println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions);
            }
        }
    }

    // A `Term` is a text token associated with a field.
    // This time, let's go through the docs containing `title:the` block by block.
    let term_the = Term::from_field_text(title, "the");

    // Some other powerful operations (especially `.skip_to`) may be useful to consume these
    // posting lists rapidly.
    // You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait
    // and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait.

    // Also, for some VERY specific high-performance use cases, like an OLAP analysis of logs,
    // you can get better performance by accessing the blocks of doc ids directly.
    for segment_reader in searcher.segment_readers() {
        // A segment contains different data structures.
        // The inverted index stands for the combination of
        // - the term dictionary
        // - the inverted lists associated with each term, and their positions
        let inverted_index = segment_reader.inverted_index(title);

        // This block postings object is like a cursor over blocks of documents matching the term.
        // Here the `IndexRecordOption::Basic` argument tells tantivy we need neither term frequencies
        // nor positions.
        if let Some(mut block_segment_postings) =
            inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
        {
            while block_segment_postings.advance() {
                // Once again these docs MAY contain deleted documents as well.
                let docs = block_segment_postings.docs();
                // Prints `Docs [0, 2].`
                println!("Docs {:?}", docs);
            }
        }
    }

    Ok(())
}
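The two loops above show the raw postings machinery. A small, natural application of the same calls is to derive corpus statistics; the sketch below sums `term_freq` over all matching documents to get the total number of occurrences of a term in the index. The schema and the two documents are illustrative assumptions, and only calls shown in this example are used.

// Sketch: counting total occurrences of a term by walking the postings.
#[macro_use]
extern crate tantivy;

use tantivy::schema::*;
use tantivy::{DocSet, Index, Postings};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
    index_writer.add_document(doc!(title => "The Old Man and the Sea"));
    index_writer.add_document(doc!(title => "The modern Promotheus"));
    index_writer.commit()?;
    index.load_searchers()?;

    let searcher = index.searcher();
    let term_the = Term::from_field_text(title, "the");

    // Walk every segment, and within each segment every document containing
    // the term, accumulating the per-document term frequencies.
    let mut total_occurrences: u64 = 0;
    for segment_reader in searcher.segment_readers() {
        let inverted_index = segment_reader.inverted_index(title);
        if let Some(mut postings) =
            inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqs)
        {
            while postings.advance() {
                // Skip documents that were deleted but not yet merged away.
                if segment_reader.is_deleted(postings.doc()) {
                    continue;
                }
                total_occurrences += u64::from(postings.term_freq());
            }
        }
    }

    // "the" appears twice in the first title and once in the second.
    assert_eq!(total_occurrences, 3);
    Ok(())
}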
87
examples/snippet.rs
Normal file
@@ -0,0 +1,87 @@
// # Snippet example
//
// This example shows how to return a representative snippet of
// your hit results.
// A snippet is an extract of the matched document, returned in HTML format.
// The keywords searched by the user are highlighted with a `<b>` tag.
extern crate tempdir;

// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::{Snippet, SnippetGenerator};
use tempdir::TempDir;

fn main() -> tantivy::Result<()> {
    // Let's create a temporary directory for the
    // sake of this example.
    let index_path = TempDir::new("tantivy_example_dir")?;

    // # Defining the schema
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let body = schema_builder.add_text_field("body", TEXT | STORED);
    let schema = schema_builder.build();

    // # Indexing documents
    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    // We'll only need one doc for this example.
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                 over the yellow sands in the sunlight before reaching the narrow pool. On one \
                 side of the river the golden foothill slopes curve up to the strong and rocky \
                 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                 fresh and green with every spring, carrying in their lower leaf junctures the \
                 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                 limbs and branches that arch over the pool"
    ));
    // ...
    index_writer.commit()?;

    index.load_searchers()?;

    let searcher = index.searcher();
    let query_parser = QueryParser::for_index(&index, vec![title, body]);
    let query = query_parser.parse_query("sycamore spring")?;

    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

    let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;

    for (score, doc_address) in top_docs {
        let doc = searcher.doc(doc_address)?;
        let snippet = snippet_generator.snippet_from_doc(&doc);
        println!("Document score {}:", score);
        println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
        println!("snippet: {}", snippet.to_html());
        println!("custom highlighting: {}", highlight(snippet));
    }

    Ok(())
}

fn highlight(snippet: Snippet) -> String {
    let mut result = String::new();
    let mut start_from = 0;

    for (start, end) in snippet.highlighted().iter().map(|h| h.bounds()) {
        result.push_str(&snippet.fragments()[start_from..start]);
        result.push_str(" --> ");
        result.push_str(&snippet.fragments()[start..end]);
        result.push_str(" <-- ");
        start_from = end;
    }

    result.push_str(&snippet.fragments()[start_from..]);
    result
}
117
examples/stop_words.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
// # Stop Words Example
|
||||
//
|
||||
// This example covers the basic usage of stop words
|
||||
// with tantivy
|
||||
//
|
||||
// We will:
|
||||
// - define our schema
|
||||
// - create an index in RAM
|
||||
// - add a few stop words
|
||||
// - index a few documents in our index
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::tokenizer::*;
|
||||
use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// this example assumes you understand the content in `basic_search`
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
// This configures your custom options for how tantivy will
|
||||
// store and process your content in the index. The key
|
||||
// point to note is that we are setting the tokenizer to `stoppy`
|
||||
// which will be defined and registered below.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
|
||||
// Our first field is title.
|
||||
schema_builder.add_text_field("title", text_options);
|
||||
|
||||
// Our second field is body.
|
||||
let text_field_indexing = TextFieldIndexing::default()
|
||||
.set_tokenizer("stoppy")
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||
let text_options = TextOptions::default()
|
||||
.set_indexing_options(text_field_indexing)
|
||||
.set_stored();
|
||||
schema_builder.add_text_field("body", text_options);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
// This tokenizer lowercases all of the text (to help with stop word matching)
|
||||
// then removes all instances of `the` and `and` from the corpus
|
||||
let tokenizer = SimpleTokenizer
|
||||
.filter(LowerCaser)
|
||||
.filter(StopWordFilter::remove(vec![
|
||||
"the".to_string(),
|
||||
"and".to_string(),
|
||||
]));
|
||||
|
||||
index.tokenizers().register("stoppy", tokenizer);
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let body = schema.get_field("body").unwrap();
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "The Old Man and the Sea",
|
||||
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
|
||||
he had gone eighty-four days now without taking a fish."
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
));
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
title => "Frankenstein",
|
||||
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
|
||||
enterprise which you have regarded with such evil forebodings. I arrived here \
|
||||
yesterday, and my first task is to assure my dear sister of my welfare and \
|
||||
increasing confidence in the success of my undertaking."
|
||||
));
|
||||
|
||||
index_writer.commit()?;
|
||||
|
||||
index.load_searchers()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
// Stop words are applied to the query as well.
|
||||
// The following will be equivalent to `title:frankenstein`
|
||||
let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
|
||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||
|
||||
for (score, doc_address) in top_docs {
|
||||
let retrieved_doc = searcher.doc(doc_address)?;
|
||||
println!("\n==\nDocument score {}:", score);
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
41
examples/working_with_json.rs
Normal file
@@ -0,0 +1,41 @@
|
||||
extern crate tantivy;
|
||||
use tantivy::schema::*;
|
||||
|
||||
// # Document from json
|
||||
//
|
||||
// For convenience, `Document` can be parsed directly from json.
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// Let's first define a schema and an index.
|
||||
// Check out the basic example if this is confusing to you.
|
||||
//
|
||||
// first we need to define a schema ...
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("title", TEXT | STORED);
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
schema_builder.add_u64_field("year", INT_INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// Let's assume we have a json-serialized document.
|
||||
let mice_and_men_doc_json = r#"{
|
||||
"title": "Of Mice and Men",
|
||||
"year": 1937
|
||||
}"#;
|
||||
|
||||
// We can parse our document
|
||||
let _mice_and_men_doc = schema.parse_document(&mice_and_men_doc_json)?;
|
||||
|
||||
// Multi-valued fields are allowed; they are
|
||||
// expressed in JSON as an array.
|
||||
// The following document has two titles.
|
||||
let frankenstein_json = r#"{
|
||||
"title": ["Frankenstein", "The Modern Prometheus"],
|
||||
"year": 1818
|
||||
}"#;
|
||||
let _frankenstein_doc = schema.parse_document(&frankenstein_json)?;
|
||||
|
||||
// Note that the schema is saved in your index directory.
|
||||
//
|
||||
// As a result, indexes are aware of their schema, and you can use this feature
|
||||
// just by opening an existing `Index` and calling `index.schema().parse_document(json)`.
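// As a sketch (hypothetical: it assumes an index already exists on disk at some
// `index_path`, and that this tantivy version exposes `Index::open_in_dir`):
//
//     let index = Index::open_in_dir(index_path)?;
//     let _doc = index
//         .schema()
//         .parse_document(r#"{"title": "Frankenstein", "year": 1818}"#)?;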
|
||||
Ok(())
|
||||
}
|
||||
2
run-tests.sh
Executable file
@@ -0,0 +1,2 @@
|
||||
#!/bin/bash
|
||||
cargo test --no-default-features --features mmap -- --test-threads 1
|
||||
@@ -1,142 +0,0 @@
|
||||
use collector::Collector;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
/// Collector that does nothing.
|
||||
/// This is used in the chain Collector and will hopefully
|
||||
/// be optimized away by the compiler.
|
||||
pub struct DoNothingCollector;
|
||||
impl Collector for DoNothingCollector {
|
||||
#[inline]
|
||||
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
#[inline]
|
||||
fn collect(&mut self, _doc: DocId, _score: Score) {}
|
||||
#[inline]
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero-cost abstraction used to collect on multiple collectors.
|
||||
/// This contraption is only usable if the types of your collectors
|
||||
/// are known at compile time.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, chain};
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct ChainedCollector<Left: Collector, Right: Collector> {
|
||||
left: Left,
|
||||
right: Right,
|
||||
}
|
||||
|
||||
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
|
||||
/// Adds a collector
|
||||
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
|
||||
ChainedCollector {
|
||||
left: self,
|
||||
right: new_collector,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
self.left.set_segment(segment_local_id, segment)?;
|
||||
self.right.set_segment(segment_local_id, segment)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.left.collect(doc, score);
|
||||
self.right.collect(doc, score);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.left.requires_scoring() || self.right.requires_scoring()
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a `ChainedCollector`
|
||||
pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
|
||||
ChainedCollector {
|
||||
left: DoNothingCollector,
|
||||
right: DoNothingCollector,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::{Collector, CountCollector, TopCollector};
|
||||
|
||||
#[test]
|
||||
fn test_chained_collector() {
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
{
|
||||
let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
}
|
||||
assert_eq!(count_collector.count(), 3);
|
||||
assert!(top_collector.at_capacity());
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,5 @@
|
||||
use super::Collector;
|
||||
use collector::SegmentCollector;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
@@ -11,14 +12,14 @@ use SegmentReader;
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::CountCollector;
|
||||
/// use tantivy::collector::Count;
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
@@ -43,59 +44,86 @@ use SegmentReader;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut count_collector).unwrap();
|
||||
/// let count = searcher.search(&query, &Count).unwrap();
|
||||
///
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert_eq!(count, 2);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Default)]
|
||||
pub struct CountCollector {
|
||||
count: usize,
|
||||
}
|
||||
pub struct Count;
|
||||
|
||||
impl CountCollector {
|
||||
/// Returns the count of documents that were
|
||||
/// collected.
|
||||
pub fn count(&self) -> usize {
|
||||
self.count
|
||||
}
|
||||
}
|
||||
impl Collector for Count {
|
||||
type Fruit = usize;
|
||||
|
||||
impl Collector for CountCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
type Child = SegmentCountCollector;
|
||||
|
||||
fn collect(&mut self, _: DocId, _: Score) {
|
||||
self.count += 1;
|
||||
fn for_segment(&self, _: SegmentLocalId, _: &SegmentReader) -> Result<SegmentCountCollector> {
|
||||
Ok(SegmentCountCollector::default())
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, segment_counts: Vec<usize>) -> Result<usize> {
|
||||
Ok(segment_counts.into_iter().sum())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SegmentCountCollector {
|
||||
count: usize,
|
||||
}
|
||||
|
||||
impl SegmentCollector for SegmentCountCollector {
|
||||
type Fruit = usize;
|
||||
|
||||
fn collect(&mut self, _: DocId, _: Score) {
|
||||
self.count += 1;
|
||||
}
|
||||
|
||||
fn harvest(self) -> usize {
|
||||
self.count
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use collector::{Collector, CountCollector};
|
||||
use super::{Count, SegmentCountCollector};
|
||||
use collector::Collector;
|
||||
use collector::SegmentCollector;
|
||||
|
||||
#[test]
|
||||
fn test_count_collector() {
|
||||
let mut count_collector = CountCollector::default();
|
||||
assert_eq!(count_collector.count(), 0);
|
||||
count_collector.collect(0u32, 1f32);
|
||||
assert_eq!(count_collector.count(), 1);
|
||||
assert_eq!(count_collector.count(), 1);
|
||||
count_collector.collect(1u32, 1f32);
|
||||
assert_eq!(count_collector.count(), 2);
|
||||
assert!(!count_collector.requires_scoring());
|
||||
fn test_count_collect_does_not_requires_scoring() {
|
||||
assert!(!Count.requires_scoring());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_segment_count_collector() {
|
||||
{
|
||||
let count_collector = SegmentCountCollector::default();
|
||||
assert_eq!(count_collector.harvest(), 0);
|
||||
}
|
||||
{
|
||||
let mut count_collector = SegmentCountCollector::default();
|
||||
count_collector.collect(0u32, 1f32);
|
||||
assert_eq!(count_collector.harvest(), 1);
|
||||
}
|
||||
{
|
||||
let mut count_collector = SegmentCountCollector::default();
|
||||
count_collector.collect(0u32, 1f32);
|
||||
assert_eq!(count_collector.harvest(), 1);
|
||||
}
|
||||
{
|
||||
let mut count_collector = SegmentCountCollector::default();
|
||||
count_collector.collect(0u32, 1f32);
|
||||
count_collector.collect(1u32, 1f32);
|
||||
assert_eq!(count_collector.harvest(), 2);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,20 +1,17 @@
|
||||
use collector::Collector;
|
||||
use collector::SegmentCollector;
|
||||
use docset::SkipResult;
|
||||
use fastfield::FacetReader;
|
||||
use schema::Facet;
|
||||
use schema::Field;
|
||||
use std::cell::UnsafeCell;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::btree_map;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::collections::Bound;
|
||||
use std::iter::Peekable;
|
||||
use std::mem;
|
||||
use std::{u64, usize};
|
||||
use termdict::TermMerger;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
@@ -46,12 +43,6 @@ impl<'a> Ord for Hit<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
struct SegmentFacetCounter {
|
||||
pub facet_reader: FacetReader,
|
||||
pub facet_ords: Vec<u64>,
|
||||
pub facet_counts: Vec<u64>,
|
||||
}
|
||||
|
||||
fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
if facet_bytes.is_empty() {
|
||||
0
|
||||
@@ -91,14 +82,14 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{Facet, SchemaBuilder, TEXT};
|
||||
/// use tantivy::schema::{Facet, Schema, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::FacetCollector;
|
||||
/// use tantivy::query::AllQuery;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
///
|
||||
/// // Facets have their own specific type.
|
||||
/// // It is not a bad practice to put all of your
|
||||
@@ -141,13 +132,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/lang");
|
||||
/// facet_collector.add_facet("/category");
|
||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
///
|
||||
/// // this object contains count aggregate for all of the facets.
|
||||
/// let counts = facet_collector.harvest();
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = counts
|
||||
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
||||
/// .get("/category")
|
||||
/// .collect();
|
||||
/// assert_eq!(facets, vec![
|
||||
@@ -159,13 +147,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/category/fiction");
|
||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
///
|
||||
/// // this object contains count aggregate for all of the facets.
|
||||
/// let counts = facet_collector.harvest();
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = counts
|
||||
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
||||
/// .get("/category/fiction")
|
||||
/// .collect();
|
||||
/// assert_eq!(facets, vec![
|
||||
@@ -178,13 +163,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/category/fiction");
|
||||
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
///
|
||||
/// // this object contains count aggregate for all of the facets.
|
||||
/// let counts = facet_collector.harvest();
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = counts.top_k("/category/fiction", 1);
|
||||
/// let facets: Vec<(&Facet, u64)> = facet_counts.top_k("/category/fiction", 1);
|
||||
/// assert_eq!(facets, vec![
|
||||
/// (&Facet::from("/category/fiction/fantasy"), 2)
|
||||
/// ]);
|
||||
@@ -194,28 +176,28 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// }
|
||||
/// ```
|
||||
pub struct FacetCollector {
|
||||
facet_ords: Vec<u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<UnsafeCell<FacetReader>>,
|
||||
segment_counters: Vec<SegmentFacetCounter>,
|
||||
|
||||
// facet_ord -> collapse facet_id
|
||||
current_segment_collapse_mapping: Vec<usize>,
|
||||
// collapse facet_id -> count
|
||||
current_segment_counts: Vec<u64>,
|
||||
// collapse facet_id -> facet_ord
|
||||
current_collapse_facet_ords: Vec<u64>,
|
||||
|
||||
facets: BTreeSet<Facet>,
|
||||
}
|
||||
|
||||
pub struct FacetSegmentCollector {
|
||||
reader: FacetReader,
|
||||
facet_ords_buf: Vec<u64>,
|
||||
// facet_ord -> collapse facet_id
|
||||
collapse_mapping: Vec<usize>,
|
||||
// collapse facet_id -> count
|
||||
counts: Vec<u64>,
|
||||
// collapse facet_id -> facet_ord
|
||||
collapse_facet_ords: Vec<u64>,
|
||||
}
|
||||
|
||||
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||
target: &[u8],
|
||||
collapse_it: &mut Peekable<I>,
|
||||
) -> SkipResult {
|
||||
loop {
|
||||
match collapse_it.peek() {
|
||||
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
|
||||
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
|
||||
Ordering::Less => {}
|
||||
Ordering::Greater => {
|
||||
return SkipResult::OverStep;
|
||||
@@ -240,15 +222,8 @@ impl FacetCollector {
|
||||
/// is of the proper type.
|
||||
pub fn for_field(field: Field) -> FacetCollector {
|
||||
FacetCollector {
|
||||
facet_ords: Vec::with_capacity(255),
|
||||
segment_counters: Vec::new(),
|
||||
field,
|
||||
ff_reader: None,
|
||||
facets: BTreeSet::new(),
|
||||
|
||||
current_segment_collapse_mapping: Vec::new(),
|
||||
current_collapse_facet_ords: Vec::new(),
|
||||
current_segment_counts: Vec::new(),
|
||||
facets: BTreeSet::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -278,141 +253,100 @@ impl FacetCollector {
|
||||
}
|
||||
self.facets.insert(facet);
|
||||
}
|
||||
|
||||
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
|
||||
self.current_segment_collapse_mapping.clear();
|
||||
self.current_collapse_facet_ords.clear();
|
||||
self.current_segment_counts.clear();
|
||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||
self.current_collapse_facet_ords.push(0);
|
||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||
if !facet_streamer.advance() {
|
||||
return;
|
||||
}
|
||||
'outer: loop {
|
||||
// at the beginning of this loop, facet_streamer
|
||||
// is positioned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Reached => {
|
||||
// we reach a facet we decided to collapse.
|
||||
let collapse_depth = facet_depth(facet_streamer.key());
|
||||
let mut collapsed_id = 0;
|
||||
self.current_segment_collapse_mapping.push(0);
|
||||
while facet_streamer.advance() {
|
||||
let depth = facet_depth(facet_streamer.key());
|
||||
if depth <= collapse_depth {
|
||||
continue 'outer;
|
||||
}
|
||||
if depth == collapse_depth + 1 {
|
||||
collapsed_id = self.current_collapse_facet_ords.len();
|
||||
self.current_collapse_facet_ords
|
||||
.push(facet_streamer.term_ord());
|
||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||
} else {
|
||||
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
SkipResult::End | SkipResult::OverStep => {
|
||||
self.current_segment_collapse_mapping.push(0);
|
||||
if !facet_streamer.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn finalize_segment(&mut self) {
|
||||
if self.ff_reader.is_some() {
|
||||
self.segment_counters.push(SegmentFacetCounter {
|
||||
facet_reader: self.ff_reader.take().unwrap().into_inner(),
|
||||
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
|
||||
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the results of the collection.
|
||||
///
|
||||
/// This method does not just return the counters,
|
||||
/// it also translates the facet ordinals of the last segment.
|
||||
pub fn harvest(mut self) -> FacetCounts {
|
||||
self.finalize_segment();
|
||||
|
||||
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_ords[..])
|
||||
.collect();
|
||||
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
|
||||
.iter()
|
||||
.map(|segment_counter| &segment_counter.facet_counts[..])
|
||||
.collect();
|
||||
|
||||
let facet_streams = self.segment_counters
|
||||
.iter()
|
||||
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut facet_merger = TermMerger::new(facet_streams);
|
||||
let mut facet_counts = BTreeMap::new();
|
||||
|
||||
while facet_merger.advance() {
|
||||
let count = facet_merger
|
||||
.current_kvs()
|
||||
.iter()
|
||||
.map(|it| {
|
||||
let seg_ord = it.segment_ord;
|
||||
let term_ord = it.streamer.term_ord();
|
||||
collapsed_facet_ords[seg_ord]
|
||||
.binary_search(&term_ord)
|
||||
.map(|collapsed_term_id| {
|
||||
if collapsed_term_id == 0 {
|
||||
0
|
||||
} else {
|
||||
collapsed_facet_counts[seg_ord][collapsed_term_id]
|
||||
}
|
||||
})
|
||||
.unwrap_or(0)
|
||||
})
|
||||
.sum();
|
||||
if count > 0u64 {
|
||||
let bytes: Vec<u8> = facet_merger.key().to_owned();
|
||||
// may create a corrupted facet if the term dictionary is corrupted
|
||||
let facet = unsafe { Facet::from_encoded(bytes) };
|
||||
facet_counts.insert(facet, count);
|
||||
}
|
||||
}
|
||||
FacetCounts { facet_counts }
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FacetCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.finalize_segment();
|
||||
type Fruit = FacetCounts;
|
||||
|
||||
type Child = FacetSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<FacetSegmentCollector> {
|
||||
let facet_reader = reader.facet_reader(self.field)?;
|
||||
self.set_collapse_mapping(&facet_reader);
|
||||
self.current_segment_counts
|
||||
.resize(self.current_collapse_facet_ords.len(), 0);
|
||||
self.ff_reader = Some(UnsafeCell::new(facet_reader));
|
||||
Ok(())
|
||||
|
||||
let mut collapse_mapping = Vec::new();
|
||||
let mut counts = Vec::new();
|
||||
let mut collapse_facet_ords = Vec::new();
|
||||
|
||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||
collapse_facet_ords.push(0);
|
||||
{
|
||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||
if facet_streamer.advance() {
|
||||
'outer: loop {
|
||||
// at the beginning of this loop, facet_streamer
|
||||
// is positioned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Reached => {
|
||||
// we reach a facet we decided to collapse.
|
||||
let collapse_depth = facet_depth(facet_streamer.key());
|
||||
let mut collapsed_id = 0;
|
||||
collapse_mapping.push(0);
|
||||
while facet_streamer.advance() {
|
||||
let depth = facet_depth(facet_streamer.key());
|
||||
if depth <= collapse_depth {
|
||||
continue 'outer;
|
||||
}
|
||||
if depth == collapse_depth + 1 {
|
||||
collapsed_id = collapse_facet_ords.len();
|
||||
collapse_facet_ords.push(facet_streamer.term_ord());
|
||||
collapse_mapping.push(collapsed_id);
|
||||
} else {
|
||||
collapse_mapping.push(collapsed_id);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
SkipResult::End | SkipResult::OverStep => {
|
||||
collapse_mapping.push(0);
|
||||
if !facet_streamer.advance() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
counts.resize(collapse_facet_ords.len(), 0);
|
||||
|
||||
Ok(FacetSegmentCollector {
|
||||
reader: facet_reader,
|
||||
facet_ords_buf: Vec::with_capacity(255),
|
||||
collapse_mapping,
|
||||
counts,
|
||||
collapse_facet_ords,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, segments_facet_counts: Vec<FacetCounts>) -> Result<FacetCounts> {
|
||||
let mut facet_counts: BTreeMap<Facet, u64> = BTreeMap::new();
|
||||
for segment_facet_counts in segments_facet_counts {
|
||||
for (facet, count) in segment_facet_counts.facet_counts {
|
||||
*(facet_counts.entry(facet).or_insert(0)) += count;
|
||||
}
|
||||
}
|
||||
Ok(FacetCounts { facet_counts })
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for FacetSegmentCollector {
|
||||
type Fruit = FacetCounts;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _: Score) {
|
||||
let facet_reader: &mut FacetReader = unsafe {
|
||||
&mut *self.ff_reader
|
||||
.as_ref()
|
||||
.expect("collect() was called before set_segment. This should never happen.")
|
||||
.get()
|
||||
};
|
||||
facet_reader.facet_ords(doc, &mut self.facet_ords);
|
||||
self.reader.facet_ords(doc, &mut self.facet_ords_buf);
|
||||
let mut previous_collapsed_ord: usize = usize::MAX;
|
||||
for &facet_ord in &self.facet_ords {
|
||||
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
|
||||
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
|
||||
{
|
||||
for &facet_ord in &self.facet_ords_buf {
|
||||
let collapsed_ord = self.collapse_mapping[facet_ord as usize];
|
||||
self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
@@ -421,8 +355,24 @@ impl Collector for FacetCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
/// Returns the results of the collection.
|
||||
///
|
||||
/// This method does not just return the counters,
|
||||
/// it also translates the facet ordinals of the last segment.
|
||||
fn harvest(self) -> FacetCounts {
|
||||
let mut facet_counts = BTreeMap::new();
|
||||
let facet_dict = self.reader.facet_dict();
|
||||
for (collapsed_facet_ord, count) in self.counts.iter().cloned().enumerate() {
|
||||
if count == 0 {
|
||||
continue;
|
||||
}
|
||||
let mut facet = vec![];
|
||||
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
|
||||
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
|
||||
// TODO
|
||||
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
|
||||
}
|
||||
FacetCounts { facet_counts }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -454,9 +404,9 @@ impl FacetCounts {
|
||||
let right_bound = if facet.is_root() {
|
||||
Bound::Unbounded
|
||||
} else {
|
||||
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
|
||||
facet_after_bytes.push(1u8);
|
||||
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
|
||||
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
|
||||
facet_after_bytes.push('\u{1}');
|
||||
let facet_after = Facet::from_encoded_string(facet_after_bytes);
|
||||
Bound::Excluded(facet_after)
|
||||
};
|
||||
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
|
||||
@@ -470,17 +420,24 @@ impl FacetCounts {
|
||||
let mut heap = BinaryHeap::with_capacity(k);
|
||||
let mut it = self.get(facet);
|
||||
|
||||
// push the first k elements to bring the heap
|
||||
// to capacity
|
||||
for (facet, count) in (&mut it).take(k) {
|
||||
heap.push(Hit { count, facet });
|
||||
}
|
||||
|
||||
let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN);
|
||||
let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value
|
||||
// is never used in that case.
|
||||
|
||||
for (facet, count) in it {
|
||||
if count > lowest_count {
|
||||
lowest_count = count;
|
||||
if let Some(mut head) = heap.peek_mut() {
|
||||
*head = Hit { count, facet };
|
||||
}
|
||||
// the heap gets reconstructed at this point
|
||||
if let Some(head) = heap.peek() {
|
||||
lowest_count = head.count;
|
||||
}
|
||||
}
|
||||
}
|
||||
heap.into_sorted_vec()
|
||||
@@ -495,14 +452,15 @@ mod tests {
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use core::Index;
|
||||
use query::AllQuery;
|
||||
use rand::distributions::Uniform;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::{thread_rng, Rng};
|
||||
use schema::Field;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
use schema::{Document, Facet, Field, Schema};
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_drilldown() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -527,12 +485,10 @@ mod tests {
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet(Facet::from("/top1"));
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
|
||||
let counts: FacetCounts = facet_collector.harvest();
|
||||
{
|
||||
let facets: Vec<(String, u64)> = counts
|
||||
.get("/top1")
|
||||
@@ -545,24 +501,47 @@ mod tests {
|
||||
("/top1/mid1", 50),
|
||||
("/top1/mid2", 50),
|
||||
("/top1/mid3", 50),
|
||||
].iter()
|
||||
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||
.collect::<Vec<_>>()
|
||||
]
|
||||
.iter()
|
||||
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(
|
||||
expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet."
|
||||
)]
|
||||
#[should_panic(expected = "Tried to add a facet which is a descendant of \
|
||||
an already added facet.")]
|
||||
fn test_misused_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
facet_collector.add_facet(Facet::from("/country"));
|
||||
facet_collector.add_facet(Facet::from("/country/europe"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_unsorted_multifacet() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facets");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/subjects/A/a"),
|
||||
facet_field => Facet::from_text(&"/subjects/B/a"),
|
||||
facet_field => Facet::from_text(&"/subjects/A/b"),
|
||||
facet_field => Facet::from_text(&"/subjects/B/b"),
|
||||
));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
assert_eq!(searcher.num_docs(), 1);
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet("/subjects");
|
||||
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
|
||||
assert_eq!(facets[0].1, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_non_used_facet_collector() {
|
||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||
@@ -572,20 +551,28 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_facet_collector_topk() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let uniform = Uniform::new_inclusive(1, 100_000);
|
||||
let mut docs: Vec<Document> = vec![("a", 10), ("b", 100), ("c", 7), ("d", 12), ("e", 21)]
|
||||
.into_iter()
|
||||
.flat_map(|(c, count)| {
|
||||
let facet = Facet::from(&format!("/facet_{}", c));
|
||||
let facet = Facet::from(&format!("/facet/{}", c));
|
||||
let doc = doc!(facet_field => facet);
|
||||
iter::repeat(doc).take(count)
|
||||
})
|
||||
.map(|mut doc| {
|
||||
doc.add_facet(
|
||||
facet_field,
|
||||
&format!("/facet/{}", thread_rng().sample(&uniform)),
|
||||
);
|
||||
doc
|
||||
})
|
||||
.collect();
|
||||
thread_rng().shuffle(&mut docs[..]);
|
||||
docs[..].shuffle(&mut thread_rng());
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for doc in docs {
|
||||
@@ -597,18 +584,17 @@ mod tests {
|
||||
let searcher = index.searcher();
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet("/");
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
facet_collector.add_facet("/facet");
|
||||
let counts: FacetCounts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
|
||||
let counts: FacetCounts = facet_collector.harvest();
|
||||
{
|
||||
let facets: Vec<(&Facet, u64)> = counts.top_k("/", 3);
|
||||
let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 3);
|
||||
assert_eq!(
|
||||
facets,
|
||||
vec![
|
||||
(&Facet::from("/facet_b"), 100),
|
||||
(&Facet::from("/facet_e"), 21),
|
||||
(&Facet::from("/facet_d"), 12),
|
||||
(&Facet::from("/facet/b"), 100),
|
||||
(&Facet::from("/facet/e"), 21),
|
||||
(&Facet::from("/facet/d"), 12),
|
||||
]
|
||||
);
|
||||
}
|
||||
@@ -623,13 +609,13 @@ mod bench {
|
||||
use query::AllQuery;
|
||||
use rand::{thread_rng, Rng};
|
||||
use schema::Facet;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Schema;
|
||||
use test::Bencher;
|
||||
use Index;
|
||||
|
||||
#[bench]
|
||||
fn bench_facet_collector(b: &mut Bencher) {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -653,8 +639,8 @@ mod bench {
|
||||
|
||||
b.iter(|| {
|
||||
let searcher = index.searcher();
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||
let facet_collector = FacetCollector::for_field(facet_field);
|
||||
searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,7 +79,7 @@ mod tests {
|
||||
// make sure we have facet counters correctly filled
|
||||
fn test_facet_collector_results() {
|
||||
|
||||
let mut schema_builder = schema::SchemaBuilder::new();
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
|
||||
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", STRING);
|
||||
|
||||
@@ -1,7 +1,91 @@
|
||||
/*!
|
||||
Defines how the documents matching a search query should be processed.
|
||||
|
||||
# Collectors
|
||||
|
||||
Collectors define the information you want to extract from the documents matching the queries.
|
||||
In tantivy jargon, we call this information your search "fruit".
|
||||
|
||||
Your fruit could, for instance, be:
|
||||
- [the count of matching documents](./struct.Count.html)
|
||||
- [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html)
|
||||
- [facet counts](./struct.FacetCollector.html)
|
||||
|
||||
At one point in your code, you will trigger the actual search operation by calling
|
||||
[the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search).
|
||||
This call will look like this.
|
||||
|
||||
```verbatim
|
||||
let fruit = searcher.search(&query, &collector)?;
|
||||
```
|
||||
|
||||
Here the type of fruit is actually determined as an associated type of the collector (`Collector::Fruit`).
|
||||
|
||||
|
||||
# Combining several collectors
|
||||
|
||||
A rich search experience often requires running several collectors on your search query.
|
||||
For instance,
|
||||
- selecting the top-K products matching your query
|
||||
- counting the matching documents
|
||||
- computing several facets
|
||||
- computing statistics about the matching product prices
|
||||
|
||||
A simple and efficient way to do that is to pass your collectors as one tuple.
|
||||
The resulting `Fruit` will then be a typed tuple with each collector's original fruits
|
||||
in their respective positions.
|
||||
|
||||
```rust
|
||||
# extern crate tantivy;
|
||||
# use tantivy::schema::*;
|
||||
# use tantivy::*;
|
||||
# use tantivy::query::*;
|
||||
use tantivy::collector::{Count, TopDocs};
|
||||
#
|
||||
# fn main() -> tantivy::Result<()> {
|
||||
# let mut schema_builder = Schema::builder();
|
||||
# let title = schema_builder.add_text_field("title", TEXT);
|
||||
# let schema = schema_builder.build();
|
||||
# let index = Index::create_in_ram(schema);
|
||||
# let mut index_writer = index.writer(3_000_000)?;
|
||||
# index_writer.add_document(doc!(
|
||||
# title => "The Name of the Wind",
|
||||
# ));
|
||||
# index_writer.add_document(doc!(
|
||||
# title => "The Diary of Muadib",
|
||||
# ));
|
||||
# index_writer.commit().unwrap();
|
||||
# index.load_searchers()?;
|
||||
# let searcher = index.searcher();
|
||||
# let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
# let query = query_parser.parse_query("diary")?;
|
||||
let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
|
||||
searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
|
||||
# Ok(())
|
||||
# }
|
||||
```
|
||||
|
||||
The `Collector` trait is implemented for up to 4 collectors.
|
||||
If you have more than 4 collectors, you can either group them into
|
||||
tuples of tuples `(a,(b,(c,d)))`, or rely on `MultiCollector`.
|
||||
|
||||
# Combining several collectors dynamically
|
||||
|
||||
Combining collectors into a tuple is a zero-cost abstraction: everything
|
||||
happens as if you had manually implemented a single collector
|
||||
combining all of their features.
|
||||
|
||||
Unfortunately, it requires you to know your collector types at compile time.
|
||||
If, on the other hand, the collectors depend on some query parameter,
|
||||
you can rely on `MultiCollector`, as sketched below.
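
Here is a hedged sketch of what that looks like (this fragment assumes a `searcher` and a
`query` as above; the handle-based names `add_collector` and `extract` are taken from
`MultiCollector`'s API and should be checked against `multi_collector.rs`):

```verbatim
use tantivy::collector::{Count, MultiCollector, TopDocs};

let mut collectors = MultiCollector::new();
let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
let count_handle = collectors.add_collector(Count);

let mut multi_fruit = searcher.search(&query, &collectors)?;
let count: usize = count_handle.extract(&mut multi_fruit);
let top_docs = top_docs_handle.extract(&mut multi_fruit);
```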
|
||||
|
||||
|
||||
# Implementing your own collectors.
|
||||
|
||||
See the `custom_collector` example.
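
For a feel of what that involves, here is a minimal sketch built directly on the
`Collector` and `SegmentCollector` traits defined in this module. It records the best
score seen in each segment and merges the per-segment maxima; the `MaxScore` and
`SegmentMaxScore` names are purely illustrative.

```rust
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::{DocId, Result, Score, SegmentLocalId, SegmentReader};

/// Keeps track of the highest score produced by the query.
struct MaxScore;

#[derive(Default)]
struct SegmentMaxScore {
    max: Score,
}

impl Collector for MaxScore {
    // The whole collection boils down to a single score.
    type Fruit = Score;
    type Child = SegmentMaxScore;

    fn for_segment(&self, _: SegmentLocalId, _: &SegmentReader) -> Result<SegmentMaxScore> {
        Ok(SegmentMaxScore::default())
    }

    fn requires_scoring(&self) -> bool {
        true
    }

    fn merge_fruits(&self, segment_maxima: Vec<Score>) -> Result<Score> {
        Ok(segment_maxima.into_iter().fold(0.0, Score::max))
    }
}

impl SegmentCollector for SegmentMaxScore {
    type Fruit = Score;

    fn collect(&mut self, _doc: DocId, score: Score) {
        self.max = self.max.max(score);
    }

    fn harvest(self) -> Score {
        self.max
    }
}
```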
|
||||
|
||||
*/
|
||||
|
||||
use downcast;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
@@ -9,238 +93,275 @@ use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
mod count_collector;
|
||||
pub use self::count_collector::CountCollector;
|
||||
pub use self::count_collector::Count;
|
||||
|
||||
mod multi_collector;
|
||||
pub use self::multi_collector::MultiCollector;
|
||||
|
||||
mod top_collector;
|
||||
pub use self::top_collector::TopCollector;
|
||||
|
||||
mod top_score_collector;
|
||||
pub use self::top_score_collector::TopDocs;
|
||||
|
||||
mod top_field_collector;
|
||||
pub use self::top_field_collector::TopDocsByField;
|
||||
|
||||
mod facet_collector;
|
||||
pub use self::facet_collector::FacetCollector;
|
||||
|
||||
mod chained_collector;
|
||||
pub use self::chained_collector::{chain, ChainedCollector};
|
||||
/// `Fruit` is the type for the result of our collection.
|
||||
/// e.g. `usize` for the `Count` collector.
|
||||
pub trait Fruit: Send + downcast::Any {}
|
||||
|
||||
impl<T> Fruit for T where T: Send + downcast::Any {}
|
||||
|
||||
/// Collectors are in charge of collecting and retaining relevant
|
||||
/// information from the document found and scored by the query.
|
||||
///
|
||||
///
|
||||
/// For instance,
|
||||
///
|
||||
/// - keeping track of the top 10 best documents
|
||||
/// - computing a breakdown over a fast field
|
||||
/// - computing the number of documents matching the query
|
||||
///
|
||||
/// Queries are in charge of pushing the `DocSet` to the collector.
|
||||
/// Our search index is in fact a collection of segments, so
|
||||
/// the `Collector` trait is actually more of a factory to instantiate
|
||||
/// `SegmentCollector`s for each segment.
|
||||
///
|
||||
/// As they work on multiple segments, they first inform
|
||||
/// the collector of a change in a segment and then
|
||||
/// call the `collect` method to push the document to the collector.
|
||||
///
|
||||
/// Temporally, our collector will receive calls
|
||||
/// - `.set_segment(0, segment_reader_0)`
|
||||
/// - `.collect(doc0_of_segment_0)`
|
||||
/// - `.collect(...)`
|
||||
/// - `.collect(last_doc_of_segment_0)`
|
||||
/// - `.set_segment(1, segment_reader_1)`
|
||||
/// - `.collect(doc0_of_segment_1)`
|
||||
/// - `.collect(...)`
|
||||
/// - `.collect(last_doc_of_segment_1)`
|
||||
/// - `...`
|
||||
/// - `.collect(last_doc_of_last_segment)`
|
||||
/// The collection logic itself is in the `SegmentCollector`.
|
||||
///
|
||||
/// Segments are not guaranteed to be visited in any specific order.
|
||||
pub trait Collector {
|
||||
pub trait Collector: Sync {
|
||||
/// `Fruit` is the type for the result of our collection.
|
||||
/// e.g. `usize` for the `Count` collector.
|
||||
type Fruit: Fruit;
|
||||
|
||||
/// Type of the `SegmentCollector` associated to this collector.
|
||||
type Child: SegmentCollector<Fruit = Self::Fruit>;
|
||||
|
||||
/// `set_segment` is called before beginning to enumerate
|
||||
/// on this segment.
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()>;
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
) -> Result<Self::Child>;
|
||||
|
||||
/// Returns true iff the collector requires to compute scores for documents.
|
||||
fn requires_scoring(&self) -> bool;
|
||||
|
||||
/// Combines the fruit associated to the collection of each segments
|
||||
/// into one fruit.
|
||||
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit>;
|
||||
}
|
||||
|
||||
impl<'a, C: Collector> Collector for &'a mut C {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
(*self).set_segment(segment_local_id, segment)
|
||||
}
|
||||
/// The `SegmentCollector` is the trait in charge of defining the
|
||||
/// collect operation at the scale of the segment.
|
||||
///
|
||||
/// `.collect(doc, score)` will be called for every document
|
||||
/// matching the query.
|
||||
pub trait SegmentCollector: 'static {
|
||||
/// `Fruit` is the type for the result of our collection.
|
||||
/// e.g. `usize` for the `Count` collector.
|
||||
type Fruit: Fruit;
|
||||
|
||||
/// The query pushes the scored document to the collector via this method.
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
C::collect(self, doc, score)
|
||||
fn collect(&mut self, doc: DocId, score: Score);
|
||||
|
||||
/// Extract the fruit of the collection from the `SegmentCollector`.
|
||||
fn harvest(self) -> Self::Fruit;
|
||||
}
|
||||
|
||||
// -----------------------------------------------
|
||||
// Tuple implementations.
|
||||
|
||||
impl<Left, Right> Collector for (Left, Right)
|
||||
where
|
||||
Left: Collector,
|
||||
Right: Collector,
|
||||
{
|
||||
type Fruit = (Left::Fruit, Right::Fruit);
|
||||
type Child = (Left::Child, Right::Child);
|
||||
|
||||
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
||||
let left = self.0.for_segment(segment_local_id, segment)?;
|
||||
let right = self.1.for_segment(segment_local_id, segment)?;
|
||||
Ok((left, right))
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
C::requires_scoring(self)
|
||||
self.0.requires_scoring() || self.1.requires_scoring()
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
children: Vec<(Left::Fruit, Right::Fruit)>,
|
||||
) -> Result<(Left::Fruit, Right::Fruit)> {
|
||||
let mut left_fruits = vec![];
|
||||
let mut right_fruits = vec![];
|
||||
for (left_fruit, right_fruit) in children {
|
||||
left_fruits.push(left_fruit);
|
||||
right_fruits.push(right_fruit);
|
||||
}
|
||||
Ok((
|
||||
self.0.merge_fruits(left_fruits)?,
|
||||
self.1.merge_fruits(right_fruits)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl<Left, Right> SegmentCollector for (Left, Right)
|
||||
where
|
||||
Left: SegmentCollector,
|
||||
Right: SegmentCollector,
|
||||
{
|
||||
type Fruit = (Left::Fruit, Right::Fruit);
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.0.collect(doc, score);
|
||||
self.1.collect(doc, score);
|
||||
}
|
||||
|
||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||
(self.0.harvest(), self.1.harvest())
|
||||
}
|
||||
}
|
||||
|
||||
// 3-Tuple
|
||||
|
||||
impl<One, Two, Three> Collector for (One, Two, Three)
|
||||
where
|
||||
One: Collector,
|
||||
Two: Collector,
|
||||
Three: Collector,
|
||||
{
|
||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
|
||||
type Child = (One::Child, Two::Child, Three::Child);
|
||||
|
||||
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
||||
let one = self.0.for_segment(segment_local_id, segment)?;
|
||||
let two = self.1.for_segment(segment_local_id, segment)?;
|
||||
let three = self.2.for_segment(segment_local_id, segment)?;
|
||||
Ok((one, two, three))
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring()
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
||||
let mut one_fruits = vec![];
|
||||
let mut two_fruits = vec![];
|
||||
let mut three_fruits = vec![];
|
||||
for (one_fruit, two_fruit, three_fruit) in children {
|
||||
one_fruits.push(one_fruit);
|
||||
two_fruits.push(two_fruit);
|
||||
three_fruits.push(three_fruit);
|
||||
}
|
||||
Ok((
|
||||
self.0.merge_fruits(one_fruits)?,
|
||||
self.1.merge_fruits(two_fruits)?,
|
||||
self.2.merge_fruits(three_fruits)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl<One, Two, Three> SegmentCollector for (One, Two, Three)
|
||||
where
|
||||
One: SegmentCollector,
|
||||
Two: SegmentCollector,
|
||||
Three: SegmentCollector,
|
||||
{
|
||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.0.collect(doc, score);
|
||||
self.1.collect(doc, score);
|
||||
self.2.collect(doc, score);
|
||||
}
|
||||
|
||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||
(self.0.harvest(), self.1.harvest(), self.2.harvest())
|
||||
}
|
||||
}
|
||||
|
||||
// 4-Tuple
|
||||
|
||||
impl<One, Two, Three, Four> Collector for (One, Two, Three, Four)
|
||||
where
|
||||
One: Collector,
|
||||
Two: Collector,
|
||||
Three: Collector,
|
||||
Four: Collector,
|
||||
{
|
||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
|
||||
type Child = (One::Child, Two::Child, Three::Child, Four::Child);
|
||||
|
||||
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
||||
let one = self.0.for_segment(segment_local_id, segment)?;
|
||||
let two = self.1.for_segment(segment_local_id, segment)?;
|
||||
let three = self.2.for_segment(segment_local_id, segment)?;
|
||||
let four = self.3.for_segment(segment_local_id, segment)?;
|
||||
Ok((one, two, three, four))
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.0.requires_scoring()
|
||||
|| self.1.requires_scoring()
|
||||
|| self.2.requires_scoring()
|
||||
|| self.3.requires_scoring()
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
||||
let mut one_fruits = vec![];
|
||||
let mut two_fruits = vec![];
|
||||
let mut three_fruits = vec![];
|
||||
let mut four_fruits = vec![];
|
||||
for (one_fruit, two_fruit, three_fruit, four_fruit) in children {
|
||||
one_fruits.push(one_fruit);
|
||||
two_fruits.push(two_fruit);
|
||||
three_fruits.push(three_fruit);
|
||||
four_fruits.push(four_fruit);
|
||||
}
|
||||
Ok((
|
||||
self.0.merge_fruits(one_fruits)?,
|
||||
self.1.merge_fruits(two_fruits)?,
|
||||
self.2.merge_fruits(three_fruits)?,
|
||||
self.3.merge_fruits(four_fruits)?,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl<One, Two, Three, Four> SegmentCollector for (One, Two, Three, Four)
|
||||
where
|
||||
One: SegmentCollector,
|
||||
Two: SegmentCollector,
|
||||
Three: SegmentCollector,
|
||||
Four: SegmentCollector,
|
||||
{
|
||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.0.collect(doc, score);
|
||||
self.1.collect(doc, score);
|
||||
self.2.collect(doc, score);
|
||||
self.3.collect(doc, score);
|
||||
}
|
||||
|
||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||
(
|
||||
self.0.harvest(),
|
||||
self.1.harvest(),
|
||||
self.2.harvest(),
|
||||
self.3.harvest(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(missing_docs)]
|
||||
mod downcast_impl {
|
||||
downcast!(super::Fruit);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use core::SegmentReader;
|
||||
use fastfield::BytesFastFieldReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
use DocId;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
/// It is unusable in practice, as it does not store
|
||||
/// the segment ordinals
|
||||
pub struct TestCollector {
|
||||
offset: DocId,
|
||||
segment_max_doc: DocId,
|
||||
docs: Vec<DocId>,
|
||||
scores: Vec<Score>,
|
||||
}
|
||||
|
||||
impl TestCollector {
|
||||
/// Return the exhaustive list of documents.
|
||||
pub fn docs(self) -> Vec<DocId> {
|
||||
self.docs
|
||||
}
|
||||
|
||||
pub fn scores(self) -> Vec<Score> {
|
||||
self.scores
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for TestCollector {
|
||||
fn default() -> TestCollector {
|
||||
TestCollector {
|
||||
offset: 0,
|
||||
segment_max_doc: 0,
|
||||
docs: Vec::new(),
|
||||
scores: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for TestCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.offset += self.segment_max_doc;
|
||||
self.segment_max_doc = reader.max_doc();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.docs.push(doc + self.offset);
|
||||
self.scores.push(score);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects in order all of the fast fields for all of the
|
||||
/// docs in the `DocSet`
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct FastFieldTestCollector {
|
||||
vals: Vec<u64>,
|
||||
field: Field,
|
||||
ff_reader: Option<FastFieldReader<u64>>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> FastFieldTestCollector {
|
||||
FastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vals(self) -> Vec<u64> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get(doc);
|
||||
self.vals.push(val);
|
||||
}
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects in order all of the fast field bytes for all of the
|
||||
/// docs in the `DocSet`
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct BytesFastFieldTestCollector {
|
||||
vals: Vec<u8>,
|
||||
field: Field,
|
||||
ff_reader: Option<BytesFastFieldReader>,
|
||||
}
|
||||
|
||||
impl BytesFastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
|
||||
BytesFastFieldTestCollector {
|
||||
vals: Vec::new(),
|
||||
field,
|
||||
ff_reader: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn vals(self) -> Vec<u8> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for BytesFastFieldTestCollector {
|
||||
fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
|
||||
self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let val = self.ff_reader.as_ref().unwrap().get_val(doc);
|
||||
self.vals.extend(val);
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use collector::{Collector, CountCollector};
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn build_collector(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut count_collector = CountCollector::default();
|
||||
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||
for doc in docs {
|
||||
count_collector.collect(doc, 1f32);
|
||||
}
|
||||
count_collector.count()
|
||||
});
|
||||
}
|
||||
}
|
||||
pub mod tests;
|
||||
|
||||
@@ -1,9 +1,97 @@
|
||||
use super::Collector;
|
||||
use super::SegmentCollector;
|
||||
use collector::Fruit;
|
||||
use downcast::Downcast;
|
||||
use std::marker::PhantomData;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use TantivyError;
|
||||
|
||||
pub struct MultiFruit {
|
||||
sub_fruits: Vec<Option<Box<Fruit>>>,
|
||||
}
|
||||
|
||||
pub struct CollectorWrapper<TCollector: Collector>(TCollector);
|
||||
|
||||
impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
||||
type Fruit = Box<Fruit>;
|
||||
type Child = Box<BoxableSegmentCollector>;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Box<BoxableSegmentCollector>> {
|
||||
let child = self.0.for_segment(segment_local_id, reader)?;
|
||||
Ok(Box::new(SegmentCollectorWrapper(child)))
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.0.requires_scoring()
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<Fruit>> {
|
||||
let typed_fruit: Vec<TCollector::Fruit> = children
|
||||
.into_iter()
|
||||
.map(|untyped_fruit| {
|
||||
Downcast::<TCollector::Fruit>::downcast(untyped_fruit)
|
||||
.map(|boxed_but_typed| *boxed_but_typed)
|
||||
.map_err(|e| {
|
||||
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
|
||||
TantivyError::InvalidArgument(err_msg)
|
||||
})
|
||||
})
|
||||
.collect::<Result<_>>()?;
|
||||
let merged_fruit = self.0.merge_fruits(typed_fruit)?;
|
||||
Ok(Box::new(merged_fruit))
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for Box<BoxableSegmentCollector> {
|
||||
type Fruit = Box<Fruit>;
|
||||
|
||||
fn collect(&mut self, doc: u32, score: f32) {
|
||||
self.as_mut().collect(doc, score);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Box<Fruit> {
|
||||
BoxableSegmentCollector::harvest_from_box(self)
|
||||
}
|
||||
}
|
||||
|
||||
pub trait BoxableSegmentCollector {
|
||||
fn collect(&mut self, doc: u32, score: f32);
|
||||
fn harvest_from_box(self: Box<Self>) -> Box<Fruit>;
|
||||
}
|
||||
|
||||
pub struct SegmentCollectorWrapper<TSegmentCollector: SegmentCollector>(TSegmentCollector);
|
||||
|
||||
impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
|
||||
for SegmentCollectorWrapper<TSegmentCollector>
|
||||
{
|
||||
fn collect(&mut self, doc: u32, score: f32) {
|
||||
self.0.collect(doc, score);
|
||||
}
|
||||
|
||||
fn harvest_from_box(self: Box<Self>) -> Box<Fruit> {
|
||||
Box::new(self.0.harvest())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FruitHandle<TFruit: Fruit> {
|
||||
pos: usize,
|
||||
_phantom: PhantomData<TFruit>,
|
||||
}
|
||||
|
||||
impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
|
||||
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("fruit was already extracted or never set");
*Downcast::<TFruit>::downcast(boxed_fruit).expect("failed to downcast the fruit to its concrete type")
|
||||
}
|
||||
}
|
||||
|
||||
/// `MultiCollector` makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector types are unknown
|
||||
@@ -13,14 +101,14 @@ use SegmentReader;
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector};
|
||||
/// use tantivy::collector::{Count, TopDocs, MultiCollector};
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
@@ -44,55 +132,115 @@ use SegmentReader;
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let mut count_collector = CountCollector::default();
|
||||
/// {
|
||||
/// let mut collectors =
|
||||
/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut collectors).unwrap();
|
||||
/// }
|
||||
/// assert_eq!(count_collector.count(), 2);
|
||||
/// assert!(top_collector.at_capacity());
|
||||
/// }
|
||||
/// let mut collectors = MultiCollector::new();
|
||||
/// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
|
||||
/// let count_handle = collectors.add_collector(Count);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// let mut multi_fruit = searcher.search(&query, &collectors)?;
|
||||
///
|
||||
/// let count = count_handle.extract(&mut multi_fruit);
|
||||
/// let top_docs = top_docs_handle.extract(&mut multi_fruit);
|
||||
///
|
||||
/// # assert_eq!(count, 2);
|
||||
/// # assert_eq!(top_docs.len(), 2);
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[allow(clippy::type_complexity)]
|
||||
#[derive(Default)]
|
||||
pub struct MultiCollector<'a> {
|
||||
collectors: Vec<&'a mut Collector>,
|
||||
collector_wrappers:
|
||||
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
|
||||
}
|
||||
|
||||
impl<'a> MultiCollector<'a> {
|
||||
/// Constructor
|
||||
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
|
||||
MultiCollector { collectors }
|
||||
/// Create a new `MultiCollector`
|
||||
pub fn new() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
|
||||
/// Add a new collector to our `MultiCollector`.
|
||||
pub fn add_collector<'b: 'a, TCollector: Collector + 'b>(
|
||||
&mut self,
|
||||
collector: TCollector,
|
||||
) -> FruitHandle<TCollector::Fruit> {
|
||||
let pos = self.collector_wrappers.len();
|
||||
self.collector_wrappers
|
||||
.push(Box::new(CollectorWrapper(collector)));
|
||||
FruitHandle {
|
||||
pos,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Collector for MultiCollector<'a> {
|
||||
fn set_segment(
|
||||
&mut self,
|
||||
type Fruit = MultiFruit;
|
||||
type Child = MultiCollectorChild;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<()> {
|
||||
for collector in &mut self.collectors {
|
||||
collector.set_segment(segment_local_id, segment)?;
|
||||
}
|
||||
Ok(())
|
||||
) -> Result<MultiCollectorChild> {
|
||||
let children = self
|
||||
.collector_wrappers
|
||||
.iter()
|
||||
.map(|collector_wrapper| collector_wrapper.for_segment(segment_local_id, segment))
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
Ok(MultiCollectorChild { children })
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collector_wrappers.iter().any(|c| c.requires_scoring())
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
|
||||
let mut segment_fruits_list: Vec<Vec<Box<Fruit>>> = (0..self.collector_wrappers.len())
|
||||
.map(|_| Vec::with_capacity(segments_multifruits.len()))
|
||||
.collect::<Vec<_>>();
|
||||
for segment_multifruit in segments_multifruits {
|
||||
for (idx, segment_fruit_opt) in segment_multifruit.sub_fruits.into_iter().enumerate() {
|
||||
if let Some(segment_fruit) = segment_fruit_opt {
|
||||
segment_fruits_list[idx].push(segment_fruit);
|
||||
}
|
||||
}
|
||||
}
|
||||
let sub_fruits = self
|
||||
.collector_wrappers
|
||||
.iter()
|
||||
.zip(segment_fruits_list)
|
||||
.map(|(child_collector, segment_fruits)| {
|
||||
Ok(Some(child_collector.merge_fruits(segment_fruits)?))
|
||||
})
|
||||
.collect::<Result<_>>()?;
|
||||
Ok(MultiFruit { sub_fruits })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MultiCollectorChild {
|
||||
children: Vec<Box<BoxableSegmentCollector>>,
|
||||
}
|
||||
|
||||
impl SegmentCollector for MultiCollectorChild {
|
||||
type Fruit = MultiFruit;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
for collector in &mut self.collectors {
|
||||
collector.collect(doc, score);
|
||||
for child in &mut self.children {
|
||||
child.collect(doc, score);
|
||||
}
|
||||
}
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collectors
|
||||
.iter()
|
||||
.any(|collector| collector.requires_scoring())
|
||||
|
||||
fn harvest(self) -> MultiFruit {
|
||||
MultiFruit {
|
||||
sub_fruits: self
|
||||
.children
|
||||
.into_iter()
|
||||
.map(|child| Some(child.harvest()))
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -100,20 +248,42 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::{Collector, CountCollector, TopCollector};
|
||||
use collector::{Count, TopDocs};
|
||||
use query::TermQuery;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Schema, TEXT};
|
||||
use Index;
|
||||
use Term;
|
||||
|
||||
#[test]
|
||||
fn test_multi_collector() {
|
||||
let mut top_collector = TopCollector::with_limit(2);
|
||||
let mut count_collector = CountCollector::default();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut collectors =
|
||||
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||
collectors.collect(1, 0.2);
|
||||
collectors.collect(2, 0.1);
|
||||
collectors.collect(3, 0.5);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.add_document(doc!(text=>"abc abc abc"));
|
||||
index_writer.add_document(doc!(text=>"abc abc"));
|
||||
index_writer.commit().unwrap();
|
||||
index_writer.add_document(doc!(text=>""));
|
||||
index_writer.add_document(doc!(text=>"abc abc abc abc"));
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
assert_eq!(count_collector.count(), 3);
|
||||
assert!(top_collector.at_capacity());
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let term = Term::from_field_text(text, "abc");
|
||||
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
|
||||
let mut collectors = MultiCollector::new();
|
||||
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
|
||||
let count_handler = collectors.add_collector(Count);
|
||||
let mut multifruits = searcher.search(&query, &mut collectors).unwrap();
|
||||
|
||||
assert_eq!(count_handler.extract(&mut multifruits), 5);
|
||||
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
|
||||
}
|
||||
}
|
||||
|
||||
201
src/collector/tests.rs
Normal file
@@ -0,0 +1,201 @@
|
||||
use super::*;
|
||||
use core::SegmentReader;
|
||||
use fastfield::BytesFastFieldReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
/// It is unusable in practice, as it does not store
/// the segment ordinals
|
||||
pub struct TestCollector;
|
||||
|
||||
pub struct TestSegmentCollector {
|
||||
segment_id: SegmentLocalId,
|
||||
fruit: TestFruit,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct TestFruit {
|
||||
docs: Vec<DocAddress>,
|
||||
scores: Vec<Score>,
|
||||
}
|
||||
|
||||
impl TestFruit {
|
||||
/// Return the list of matching documents exhaustively.
|
||||
pub fn docs(&self) -> &[DocAddress] {
|
||||
&self.docs[..]
|
||||
}
|
||||
|
||||
pub fn scores(&self) -> &[Score] {
|
||||
&self.scores[..]
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for TestCollector {
|
||||
type Fruit = TestFruit;
|
||||
type Child = TestSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_id: SegmentLocalId,
|
||||
_reader: &SegmentReader,
|
||||
) -> Result<TestSegmentCollector> {
|
||||
Ok(TestSegmentCollector {
|
||||
segment_id,
|
||||
fruit: TestFruit::default(),
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, mut children: Vec<TestFruit>) -> Result<TestFruit> {
|
||||
children.sort_by_key(|fruit| {
|
||||
if fruit.docs().is_empty() {
|
||||
0
|
||||
} else {
|
||||
fruit.docs()[0].segment_ord()
|
||||
}
|
||||
});
|
||||
let mut docs = vec![];
|
||||
let mut scores = vec![];
|
||||
for child in children {
|
||||
docs.extend(child.docs());
|
||||
scores.extend(child.scores);
|
||||
}
|
||||
Ok(TestFruit { docs, scores })
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for TestSegmentCollector {
|
||||
type Fruit = TestFruit;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.fruit.docs.push(DocAddress(self.segment_id, doc));
|
||||
self.fruit.scores.push(score);
|
||||
}
|
||||
|
||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||
self.fruit
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects in order all of the fast fields for all of the
|
||||
/// docs in the `DocSet`
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct FastFieldTestCollector {
|
||||
field: Field,
|
||||
}
|
||||
|
||||
pub struct FastFieldSegmentCollector {
|
||||
vals: Vec<u64>,
|
||||
reader: FastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> FastFieldTestCollector {
|
||||
FastFieldTestCollector { field }
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for FastFieldTestCollector {
|
||||
type Fruit = Vec<u64>;
|
||||
type Child = FastFieldSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<FastFieldSegmentCollector> {
|
||||
Ok(FastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader: reader.fast_field_reader(self.field)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, children: Vec<Vec<u64>>) -> Result<Vec<u64>> {
|
||||
Ok(children.into_iter().flat_map(|v| v.into_iter()).collect())
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for FastFieldSegmentCollector {
|
||||
type Fruit = Vec<u64>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let val = self.reader.get(doc);
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Vec<u64> {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects in order all of the fast field bytes for all of the
|
||||
/// docs in the `DocSet`
|
||||
///
|
||||
/// This collector is mainly useful for tests.
|
||||
pub struct BytesFastFieldTestCollector {
|
||||
field: Field,
|
||||
}
|
||||
|
||||
pub struct BytesFastFieldSegmentCollector {
|
||||
vals: Vec<u8>,
|
||||
reader: BytesFastFieldReader,
|
||||
}
|
||||
|
||||
impl BytesFastFieldTestCollector {
|
||||
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
|
||||
BytesFastFieldTestCollector { field }
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for BytesFastFieldTestCollector {
|
||||
type Fruit = Vec<u8>;
|
||||
type Child = BytesFastFieldSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: u32,
|
||||
segment: &SegmentReader,
|
||||
) -> Result<BytesFastFieldSegmentCollector> {
|
||||
Ok(BytesFastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader: segment.bytes_fast_field_reader(self.field)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, children: Vec<Vec<u8>>) -> Result<Vec<u8>> {
|
||||
Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for BytesFastFieldSegmentCollector {
|
||||
type Fruit = Vec<u8>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let data = self.reader.get_val(doc);
|
||||
self.vals.extend(data);
|
||||
}
|
||||
|
||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
||||
self.vals
|
||||
}
|
||||
}
|
||||
@@ -1,244 +1,224 @@
|
||||
use super::Collector;
|
||||
use serde::export::PhantomData;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
// Rust heap is a max-heap and we need a min heap.
|
||||
#[derive(Clone, Copy)]
|
||||
struct GlobalScoredDoc {
|
||||
score: Score,
|
||||
doc_address: DocAddress,
|
||||
/// Contains a feature (field, score, etc.) of a document along with the document address.
|
||||
///
|
||||
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
|
||||
/// default Rust heap is a max heap, whereas a min heap is needed.
|
||||
///
|
||||
/// WARNING: equality is not what you would expect here.
|
||||
/// Two elements are equal if their feature is equal, regardless of whether `doc`
|
||||
/// is equal. This should be perfectly fine for this usage, but let's make sure this
|
||||
/// struct is never public.
|
||||
struct ComparableDoc<T, D> {
|
||||
feature: T,
|
||||
doc: D,
|
||||
}
|
||||
|
||||
impl PartialOrd for GlobalScoredDoc {
|
||||
fn partial_cmp(&self, other: &GlobalScoredDoc) -> Option<Ordering> {
|
||||
impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for GlobalScoredDoc {
|
||||
impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
|
||||
#[inline]
|
||||
fn cmp(&self, other: &GlobalScoredDoc) -> Ordering {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other
|
||||
.score
|
||||
.partial_cmp(&self.score)
|
||||
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
|
||||
.feature
|
||||
.partial_cmp(&self.feature)
|
||||
.unwrap_or_else(|| Ordering::Equal)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for GlobalScoredDoc {
|
||||
fn eq(&self, other: &GlobalScoredDoc) -> bool {
|
||||
impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for GlobalScoredDoc {}
|
||||
impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}
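// Illustrative sketch, not part of this diff: the reversed comparison above turns
// Rust's max-heap `BinaryHeap` into a min-heap, so `peek()` exposes the weakest
// entry and a top-K collector can evict it whenever a better candidate arrives.
// The same effect, shown with `std::cmp::Reverse` over plain integer scores
// (this test is illustrative only):
#[test]
fn test_reversed_heap_sketch() {
    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    let mut heap = BinaryHeap::new();
    for score in vec![8u32, 2, 5] {
        heap.push(Reverse(score));
    }
    // The smallest score sits on top, ready to be compared and replaced.
    assert_eq!(heap.peek(), Some(&Reverse(2u32)));
}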
|
||||
|
||||
/// The Top Collector keeps track of the K documents
|
||||
/// with the best scores.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||
/// use tantivy::{Index, Result, DocId, Score};
|
||||
/// use tantivy::collector::TopCollector;
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = SchemaBuilder::new();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut top_collector = TopCollector::with_limit(2);
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// searcher.search(&*query, &mut top_collector).unwrap();
|
||||
///
|
||||
/// let score_docs: Vec<(Score, DocId)> = top_collector
|
||||
/// .score_docs()
|
||||
/// .into_iter()
|
||||
/// .map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
/// .collect();
|
||||
///
|
||||
/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]);
|
||||
/// }
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct TopCollector {
|
||||
pub(crate) struct TopCollector<T> {
|
||||
limit: usize,
|
||||
heap: BinaryHeap<GlobalScoredDoc>,
|
||||
segment_id: u32,
|
||||
_marker: PhantomData<T>,
|
||||
}
|
||||
|
||||
impl TopCollector {
|
||||
impl<T> TopCollector<T>
|
||||
where
|
||||
T: PartialOrd + Clone,
|
||||
{
|
||||
/// Creates a top collector, with a number of documents equal to "limit".
|
||||
///
|
||||
/// # Panics
|
||||
/// The method panics if limit is 0
|
||||
pub fn with_limit(limit: usize) -> TopCollector {
|
||||
pub fn with_limit(limit: usize) -> TopCollector<T> {
|
||||
if limit < 1 {
|
||||
panic!("Limit must be strictly greater than 0.");
|
||||
}
|
||||
TopCollector {
|
||||
limit,
|
||||
heap: BinaryHeap::with_capacity(limit),
|
||||
segment_id: 0,
|
||||
_marker: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns K best documents sorted in decreasing order.
|
||||
///
|
||||
/// Calling this method triggers the sort.
|
||||
/// The result of the sort is not cached.
|
||||
pub fn docs(&self) -> Vec<DocAddress> {
|
||||
self.score_docs()
|
||||
.into_iter()
|
||||
.map(|score_doc| score_doc.1)
|
||||
.collect()
|
||||
pub fn limit(&self) -> usize {
|
||||
self.limit
|
||||
}
|
||||
|
||||
/// Returns K best ScoredDocument sorted in decreasing order.
|
||||
///
|
||||
/// Calling this method triggers the sort.
|
||||
/// The result of the sort is not cached.
|
||||
pub fn score_docs(&self) -> Vec<(Score, DocAddress)> {
|
||||
let mut scored_docs: Vec<GlobalScoredDoc> = self.heap.iter().cloned().collect();
|
||||
scored_docs.sort();
|
||||
scored_docs
|
||||
pub fn merge_fruits(
|
||||
&self,
|
||||
children: Vec<Vec<(T, DocAddress)>>,
|
||||
) -> Result<Vec<(T, DocAddress)>> {
|
||||
if self.limit == 0 {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
let mut top_collector = BinaryHeap::new();
|
||||
for child_fruit in children {
|
||||
for (feature, doc) in child_fruit {
|
||||
if top_collector.len() < self.limit {
|
||||
top_collector.push(ComparableDoc { feature, doc });
|
||||
} else if let Some(mut head) = top_collector.peek_mut() {
|
||||
if head.feature < feature {
|
||||
*head = ComparableDoc { feature, doc };
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(top_collector
|
||||
.into_sorted_vec()
|
||||
.into_iter()
|
||||
.map(|GlobalScoredDoc { score, doc_address }| (score, doc_address))
|
||||
.map(|cdoc| (cdoc.feature, cdoc.doc))
|
||||
.collect())
|
||||
}
|
||||
|
||||
pub(crate) fn for_segment(
|
||||
&self,
|
||||
segment_id: SegmentLocalId,
|
||||
_: &SegmentReader,
|
||||
) -> Result<TopSegmentCollector<T>> {
|
||||
Ok(TopSegmentCollector::new(segment_id, self.limit))
|
||||
}
|
||||
}
|
||||
|
||||
/// The Top Collector keeps track of the K documents
|
||||
/// sorted by type `T`.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
pub(crate) struct TopSegmentCollector<T> {
|
||||
limit: usize,
|
||||
heap: BinaryHeap<ComparableDoc<T, DocId>>,
|
||||
segment_id: u32,
|
||||
}
|
||||
|
||||
impl<T: PartialOrd> TopSegmentCollector<T> {
|
||||
fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
|
||||
TopSegmentCollector {
|
||||
limit,
|
||||
heap: BinaryHeap::with_capacity(limit),
|
||||
segment_id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
pub fn harvest(self) -> Vec<(T, DocAddress)> {
|
||||
let segment_id = self.segment_id;
|
||||
self.heap
|
||||
.into_sorted_vec()
|
||||
.into_iter()
|
||||
.map(|comparable_doc| {
|
||||
(
|
||||
comparable_doc.feature,
|
||||
DocAddress(segment_id, comparable_doc.doc),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Return true iff at least K documents have gone through
|
||||
/// the collector.
|
||||
#[inline]
|
||||
pub fn at_capacity(&self) -> bool {
|
||||
#[inline(always)]
|
||||
pub(crate) fn at_capacity(&self) -> bool {
|
||||
self.heap.len() >= self.limit
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for TopCollector {
|
||||
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
|
||||
self.segment_id = segment_id;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
/// Collects a document scored by the given feature
|
||||
///
|
||||
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
|
||||
/// will compare the lowest scoring item with the given one and keep whichever is greater.
|
||||
#[inline(always)]
|
||||
pub fn collect(&mut self, doc: DocId, feature: T) {
|
||||
if self.at_capacity() {
|
||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||
let limit_doc: GlobalScoredDoc = *self.heap
|
||||
.peek()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
if limit_doc.score < score {
|
||||
let mut mut_head = self.heap
|
||||
.peek_mut()
|
||||
.expect("Top collector with size 0 is forbidden");
|
||||
mut_head.score = score;
|
||||
mut_head.doc_address = DocAddress(self.segment_id, doc);
|
||||
if let Some(limit_feature) = self.heap.peek().map(|head| head.feature.clone()) {
|
||||
if limit_feature < feature {
|
||||
if let Some(mut head) = self.heap.peek_mut() {
|
||||
head.feature = feature;
|
||||
head.doc = doc;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let wrapped_doc = GlobalScoredDoc {
|
||||
score,
|
||||
doc_address: DocAddress(self.segment_id, doc),
|
||||
};
|
||||
self.heap.push(wrapped_doc);
|
||||
// we have not reached capacity yet, so we can just push the
|
||||
// element.
|
||||
self.heap.push(ComparableDoc { feature, doc });
|
||||
}
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::Collector;
|
||||
use DocId;
|
||||
use super::{TopCollector, TopSegmentCollector};
|
||||
use DocAddress;
|
||||
use Score;
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
let mut top_collector = TopCollector::with_limit(4);
|
||||
let mut top_collector = TopSegmentCollector::new(0, 4);
|
||||
top_collector.collect(1, 0.8);
|
||||
top_collector.collect(3, 0.2);
|
||||
top_collector.collect(5, 0.3);
|
||||
assert!(!top_collector.at_capacity());
|
||||
let score_docs: Vec<(Score, DocId)> = top_collector
|
||||
.score_docs()
|
||||
.into_iter()
|
||||
.map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
.collect();
|
||||
assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]);
|
||||
assert_eq!(
|
||||
top_collector.harvest(),
|
||||
vec![
|
||||
(0.8, DocAddress(0, 1)),
|
||||
(0.3, DocAddress(0, 5)),
|
||||
(0.2, DocAddress(0, 3))
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_at_capacity() {
|
||||
let mut top_collector = TopCollector::with_limit(4);
|
||||
let mut top_collector = TopSegmentCollector::new(0, 4);
|
||||
top_collector.collect(1, 0.8);
|
||||
top_collector.collect(3, 0.2);
|
||||
top_collector.collect(5, 0.3);
|
||||
top_collector.collect(7, 0.9);
|
||||
top_collector.collect(9, -0.2);
|
||||
assert!(top_collector.at_capacity());
|
||||
{
|
||||
let score_docs: Vec<(Score, DocId)> = top_collector
|
||||
.score_docs()
|
||||
.into_iter()
|
||||
.map(|(score, doc_address)| (score, doc_address.doc()))
|
||||
.collect();
|
||||
assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]);
|
||||
}
|
||||
{
|
||||
let docs: Vec<DocId> = top_collector
|
||||
.docs()
|
||||
.into_iter()
|
||||
.map(|doc_address| doc_address.doc())
|
||||
.collect();
|
||||
assert_eq!(docs, vec![7, 1, 5, 3]);
|
||||
}
|
||||
assert_eq!(
|
||||
top_collector.harvest(),
|
||||
vec![
|
||||
(0.9, DocAddress(0, 7)),
|
||||
(0.8, DocAddress(0, 1)),
|
||||
(0.3, DocAddress(0, 5)),
|
||||
(0.2, DocAddress(0, 3))
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_top_0() {
|
||||
TopCollector::with_limit(0);
|
||||
let _collector: TopCollector<Score> = TopCollector::with_limit(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
250
src/collector/top_field_collector.rs
Normal file
@@ -0,0 +1,250 @@
|
||||
use super::Collector;
|
||||
use collector::top_collector::TopCollector;
|
||||
use collector::top_collector::TopSegmentCollector;
|
||||
use collector::SegmentCollector;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::FastValue;
|
||||
use schema::Field;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
/// The Top Field Collector keeps track of the K documents
|
||||
/// sorted by a fast field in the index
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// # use tantivy::schema::{Schema, Field, FAST, TEXT};
|
||||
/// # use tantivy::{Index, Result, DocAddress};
|
||||
/// # use tantivy::query::{Query, QueryParser};
|
||||
/// use tantivy::collector::TopDocs;
|
||||
///
|
||||
/// # fn main() {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
/// # let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// # let rating = schema_builder.add_u64_field("rating", FAST);
|
||||
/// # let schema = schema_builder.build();
|
||||
/// # let index = Index::create_in_ram(schema);
|
||||
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
/// # index_writer.add_document(doc!(
|
||||
/// # title => "The Name of the Wind",
|
||||
/// # rating => 92u64,
|
||||
/// # ));
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
|
||||
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
|
||||
/// # index_writer.commit().unwrap();
|
||||
/// # index.load_searchers().unwrap();
|
||||
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary").unwrap();
|
||||
/// # let top_docs = docs_sorted_by_rating(&index, &query, rating).unwrap();
|
||||
/// # assert_eq!(top_docs,
|
||||
/// # vec![(97u64, DocAddress(0u32, 1)),
|
||||
/// # (80u64, DocAddress(0u32, 3))]);
|
||||
/// # }
|
||||
/// #
|
||||
/// /// Searches the documents matching the given query, and
/// /// collects the top 10 documents, ordered by the `field`
/// /// given as an argument.
|
||||
/// ///
|
||||
/// /// `field` is required to be a FAST field.
|
||||
/// fn docs_sorted_by_rating(index: &Index, query: &Query, sort_by_field: Field)
|
||||
/// -> Result<Vec<(u64, DocAddress)>> {
|
||||
///
|
||||
/// // This is where we build our collector!
|
||||
/// let top_docs_by_rating = TopDocs::with_limit(2).order_by_field(sort_by_field);
|
||||
///
|
||||
/// // ... and here are our documents. Note this is a simple vec.
/// // The `u64` in the pair is the value of our fast field for each document.
|
||||
/// index.searcher()
|
||||
/// .search(query, &top_docs_by_rating)
|
||||
/// }
|
||||
/// ```
|
||||
pub struct TopDocsByField<T> {
|
||||
collector: TopCollector<T>,
|
||||
field: Field,
|
||||
}
|
||||
|
||||
impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
|
||||
/// Creates a top field collector, with a number of documents equal to "limit".
|
||||
///
|
||||
/// The given field name must be a fast field, otherwise the collector will return an error while
|
||||
/// collecting results.
|
||||
///
|
||||
/// # Panics
|
||||
/// The method panics if limit is 0
|
||||
pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
|
||||
TopDocsByField {
|
||||
collector: TopCollector::with_limit(limit),
|
||||
field,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
|
||||
type Fruit = Vec<(T, DocAddress)>;
|
||||
|
||||
type Child = TopFieldSegmentCollector<T>;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<TopFieldSegmentCollector<T>> {
|
||||
let collector = self.collector.for_segment(segment_local_id, reader)?;
|
||||
let reader = reader.fast_field_reader(self.field)?;
|
||||
Ok(TopFieldSegmentCollector { collector, reader })
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
segment_fruits: Vec<Vec<(T, DocAddress)>>,
|
||||
) -> Result<Vec<(T, DocAddress)>> {
|
||||
self.collector.merge_fruits(segment_fruits)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TopFieldSegmentCollector<T: FastValue + PartialOrd> {
|
||||
collector: TopSegmentCollector<T>,
|
||||
reader: FastFieldReader<T>,
|
||||
}
|
||||
|
||||
impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
|
||||
for TopFieldSegmentCollector<T>
|
||||
{
|
||||
type Fruit = Vec<(T, DocAddress)>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let field_value = self.reader.get(doc);
|
||||
self.collector.collect(doc, field_value);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Vec<(T, DocAddress)> {
|
||||
self.collector.harvest()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::TopDocsByField;
|
||||
use collector::Collector;
|
||||
use collector::TopDocs;
|
||||
use query::Query;
|
||||
use query::QueryParser;
|
||||
use schema::Field;
|
||||
use schema::IntOptions;
|
||||
use schema::{Schema, FAST, TEXT};
|
||||
use DocAddress;
|
||||
use Index;
|
||||
use IndexWriter;
|
||||
use TantivyError;
|
||||
|
||||
const TITLE: &str = "title";
|
||||
const SIZE: &str = "size";
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||
let schema = schema_builder.build();
|
||||
let (index, query) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "growler of beer",
|
||||
size => 64u64,
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "pint of beer",
|
||||
size => 16u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.searcher();
|
||||
|
||||
let top_collector = TopDocs::with_limit(4).order_by_field(size);
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
|
||||
assert_eq!(
|
||||
top_docs,
|
||||
vec![
|
||||
(64, DocAddress(0, 1)),
|
||||
(16, DocAddress(0, 2)),
|
||||
(12, DocAddress(0, 0))
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_field_does_not_exist() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||
let schema = schema_builder.build();
|
||||
let (index, _) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.searcher();
|
||||
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
top_collector
|
||||
.for_segment(0, segment_reader)
|
||||
.expect("should panic");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_field_not_fast_field() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, IntOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let (index, _) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
|
||||
assert_matches!(
|
||||
top_collector
|
||||
.for_segment(0, segment)
|
||||
.map(|_| ())
|
||||
.unwrap_err(),
|
||||
TantivyError::FastFieldError(_)
|
||||
);
|
||||
}
|
||||
|
||||
fn index(
|
||||
query: &str,
|
||||
query_field: Field,
|
||||
schema: Schema,
|
||||
mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
|
||||
) -> (Index, Box<Query>) {
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
doc_adder(&mut index_writer);
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![query_field]);
|
||||
let query = query_parser.parse_query(query).unwrap();
|
||||
(index, query)
|
||||
}
|
||||
}
|
||||
200
src/collector/top_score_collector.rs
Normal file
@@ -0,0 +1,200 @@
|
||||
use super::Collector;
|
||||
use collector::top_collector::TopCollector;
|
||||
use collector::top_collector::TopSegmentCollector;
|
||||
use collector::SegmentCollector;
|
||||
use collector::TopDocsByField;
|
||||
use fastfield::FastValue;
|
||||
use schema::Field;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
/// The Top Score Collector keeps track of the K documents
|
||||
/// sorted by their score.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::DocAddress;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::query::QueryParser;
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// let schema = schema_builder.build();
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
///
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
|
||||
///
|
||||
/// assert_eq!(&top_docs[0], &(0.7261542, DocAddress(0, 1)));
|
||||
/// assert_eq!(&top_docs[1], &(0.6099695, DocAddress(0, 3)));
|
||||
///
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub struct TopDocs(TopCollector<Score>);
|
||||
|
||||
impl TopDocs {
|
||||
/// Creates a top score collector, with a number of documents equal to "limit".
|
||||
///
|
||||
/// # Panics
|
||||
/// The method panics if limit is 0
|
||||
pub fn with_limit(limit: usize) -> TopDocs {
|
||||
TopDocs(TopCollector::with_limit(limit))
|
||||
}
|
||||
|
||||
/// Set top-K to rank documents by a given fast field.
|
||||
///
|
||||
/// (By default, `TopDocs` collects the top-K documents sorted by
|
||||
/// the similarity score.)
|
||||
pub fn order_by_field<T: PartialOrd + FastValue + Clone>(
|
||||
self,
|
||||
field: Field,
|
||||
) -> TopDocsByField<T> {
|
||||
TopDocsByField::new(field, self.0.limit())
|
||||
}
|
||||
}
|
||||
|
||||
impl Collector for TopDocs {
|
||||
type Fruit = Vec<(Score, DocAddress)>;
|
||||
|
||||
type Child = TopScoreSegmentCollector;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Self::Child> {
|
||||
let collector = self.0.for_segment(segment_local_id, reader)?;
|
||||
Ok(TopScoreSegmentCollector(collector))
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, child_fruits: Vec<Vec<(Score, DocAddress)>>) -> Result<Self::Fruit> {
|
||||
self.0.merge_fruits(child_fruits)
|
||||
}
|
||||
}
|
||||
|
||||
/// Segment Collector associated to `TopDocs`.
|
||||
pub struct TopScoreSegmentCollector(TopSegmentCollector<Score>);
|
||||
|
||||
impl SegmentCollector for TopScoreSegmentCollector {
|
||||
type Fruit = Vec<(Score, DocAddress)>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, score: Score) {
|
||||
self.0.collect(doc, score)
|
||||
}
|
||||
|
||||
fn harvest(self) -> Vec<(Score, DocAddress)> {
|
||||
self.0.harvest()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::TopDocs;
|
||||
use query::QueryParser;
|
||||
use schema::Schema;
|
||||
use schema::TEXT;
|
||||
use DocAddress;
|
||||
use Index;
|
||||
use Score;
|
||||
|
||||
fn make_index() -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
|
||||
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
|
||||
index_writer.add_document(doc!(text_field=>"I like Droopy"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
let index = make_index();
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let score_docs: Vec<(Score, DocAddress)> = index
|
||||
.searcher()
|
||||
.search(&text_query, &TopDocs::with_limit(4))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
score_docs,
|
||||
vec![
|
||||
(0.81221175, DocAddress(0u32, 1)),
|
||||
(0.5376842, DocAddress(0u32, 2)),
|
||||
(0.48527452, DocAddress(0, 0))
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_at_capacity() {
|
||||
let index = make_index();
|
||||
let field = index.schema().get_field("text").unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let score_docs: Vec<(Score, DocAddress)> = index
|
||||
.searcher()
|
||||
.search(&text_query, &TopDocs::with_limit(2))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
score_docs,
|
||||
vec![
|
||||
(0.81221175, DocAddress(0u32, 1)),
|
||||
(0.5376842, DocAddress(0u32, 2)),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_top_0() {
|
||||
TopDocs::with_limit(0);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,9 +1,6 @@
|
||||
use common::serialize::BinarySerializable;
|
||||
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::ops::Deref;
|
||||
use std::ptr;
|
||||
|
||||
pub(crate) struct BitPacker {
|
||||
mini_buffer: u64,
|
||||
@@ -18,7 +15,7 @@ impl BitPacker {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn write<TWrite: Write>(
|
||||
pub fn write<TWrite: io::Write>(
|
||||
&mut self,
|
||||
val: u64,
|
||||
num_bits: u8,
|
||||
@@ -28,14 +25,14 @@ impl BitPacker {
|
||||
let num_bits = num_bits as usize;
|
||||
if self.mini_buffer_written + num_bits > 64 {
|
||||
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
|
||||
self.mini_buffer.serialize(output)?;
|
||||
output.write_u64::<LittleEndian>(self.mini_buffer)?;
|
||||
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
|
||||
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
|
||||
} else {
|
||||
self.mini_buffer |= val_u64 << self.mini_buffer_written;
|
||||
self.mini_buffer_written += num_bits;
|
||||
if self.mini_buffer_written == 64 {
|
||||
self.mini_buffer.serialize(output)?;
|
||||
output.write_u64::<LittleEndian>(self.mini_buffer)?;
|
||||
self.mini_buffer_written = 0;
|
||||
self.mini_buffer = 0u64;
|
||||
}
|
||||
@@ -43,17 +40,18 @@ impl BitPacker {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
|
||||
let mut arr: [u8; 8] = [0u8; 8];
|
||||
LittleEndian::write_u64(&mut arr, self.mini_buffer);
|
||||
output.write_all(&arr[..num_bytes])?;
|
||||
self.mini_buffer_written = 0;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
self.flush(output)?;
|
||||
// Padding the write file to simplify reads.
|
||||
output.write_all(&[0u8; 7])?;
|
||||
@@ -102,8 +100,7 @@ where
|
||||
addr + 8 <= data.len(),
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 =
|
||||
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
|
||||
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
}
|
||||
@@ -125,8 +122,7 @@ where
|
||||
for output_val in output.iter_mut() {
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
let val_unshifted_unmasked: u64 =
|
||||
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
|
||||
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
*output_val = val_shifted & mask;
|
||||
addr_in_bits += num_bits;
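// Illustrative sketch, not part of this diff: a standalone version of the read
// path above. A value is recovered by loading 8 little-endian bytes at the
// containing byte address, shifting by the in-byte offset and masking down to
// `num_bits`; like the original, it relies on the 7 bytes of padding written by
// `close`. `read_packed` and the test are illustrative only.
fn read_packed(data: &[u8], idx: usize, num_bits: usize) -> u64 {
    let addr_in_bits = idx * num_bits;
    let addr = addr_in_bits / 8;
    let bit_shift = addr_in_bits % 8;
    let mut word = [0u8; 8];
    word.copy_from_slice(&data[addr..addr + 8]);
    let mask = if num_bits == 64 { u64::MAX } else { (1u64 << num_bits) - 1 };
    (u64::from_le_bytes(word) >> bit_shift) & mask
}

#[test]
fn test_read_packed_sketch() {
    // Two 5-bit values (9 and 27) packed into the low 10 bits, plus padding.
    let word: u64 = 9 | (27 << 5);
    let mut packed = word.to_le_bytes().to_vec();
    packed.extend_from_slice(&[0u8; 7]);
    assert_eq!(read_packed(&packed, 0, 5), 9);
    assert_eq!(read_packed(&packed, 1, 5), 27);
}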
|
||||
|
||||
@@ -34,17 +34,17 @@ impl TinySet {
|
||||
}
|
||||
|
||||
/// Returns the complement of the set in `[0, 64[`.
|
||||
fn complement(&self) -> TinySet {
|
||||
fn complement(self) -> TinySet {
|
||||
TinySet(!self.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` contains the element `el`.
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
pub fn contains(self, el: u32) -> bool {
|
||||
!self.intersect(TinySet::singleton(el)).is_empty()
|
||||
}
|
||||
|
||||
/// Returns the intersection of `self` and `other`
|
||||
pub fn intersect(&self, other: TinySet) -> TinySet {
|
||||
pub fn intersect(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 & other.0)
|
||||
}
|
||||
|
||||
@@ -77,7 +77,7 @@ impl TinySet {
|
||||
|
||||
/// Returns true iff the `TinySet` is empty.
|
||||
#[inline(always)]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
pub fn is_empty(self) -> bool {
|
||||
self.0 == 0u64
|
||||
}
|
||||
|
||||
@@ -114,7 +114,7 @@ impl TinySet {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
pub fn len(&self) -> u32 {
|
||||
pub fn len(self) -> u32 {
|
||||
self.0.count_ones()
|
||||
}
|
||||
}
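// Illustrative sketch, not part of this diff: `TinySet` is a bitset over [0, 64)
// backed by a single u64, so `contains` boils down to ANDing against a one-bit
// mask (the role played by `TinySet::singleton`, defined elsewhere in this file).
// `tiny_contains` and the test are illustrative only.
fn tiny_contains(set: u64, el: u32) -> bool {
    let singleton = 1u64 << el;
    set & singleton != 0
}

#[test]
fn test_tiny_contains_sketch() {
    let set = (1u64 << 3) | (1u64 << 10);
    assert!(tiny_contains(set, 3));
    assert!(tiny_contains(set, 10));
    assert!(!tiny_contains(set, 5));
}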
|
||||
@@ -266,14 +266,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_bitset_large() {
|
||||
let arr = generate_nonunique_unsorted(1_000_000, 50_000);
|
||||
let arr = generate_nonunique_unsorted(100_000, 5_000);
|
||||
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
|
||||
let mut bitset = BitSet::with_max_value(1_000_000);
|
||||
let mut bitset = BitSet::with_max_value(100_000);
|
||||
for el in arr {
|
||||
btreeset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for i in 0..1_000_000 {
|
||||
for i in 0..100_000 {
|
||||
assert_eq!(btreeset.contains(&i), bitset.contains(i));
|
||||
}
|
||||
assert_eq!(btreeset.len(), bitset.len());
|
||||
@@ -342,7 +342,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_bitset_clear() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
let els = tests::sample(1_000, 0.01f32);
|
||||
let els = tests::sample(1_000, 0.01f64);
|
||||
for &el in &els {
|
||||
bitset.insert(el);
|
||||
}
|
||||
|
||||
@@ -4,6 +4,8 @@ use common::VInt;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use space_usage::FieldUsage;
|
||||
use space_usage::PerFieldSpaceUsage;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::io::{self, Read};
|
||||
@@ -64,7 +66,7 @@ impl<W: Write> CompositeWrite<W> {
|
||||
&mut self.write
|
||||
}
|
||||
|
||||
/// Close the composite file.
|
||||
/// Close the composite file
|
||||
///
|
||||
/// An index of the different field offsets
|
||||
/// will be written as a footer.
|
||||
@@ -72,7 +74,8 @@ impl<W: Write> CompositeWrite<W> {
|
||||
let footer_offset = self.write.written_bytes();
|
||||
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
|
||||
|
||||
let mut offset_fields: Vec<_> = self.offsets
|
||||
let mut offset_fields: Vec<_> = self
|
||||
.offsets
|
||||
.iter()
|
||||
.map(|(file_addr, offset)| (*offset, *file_addr))
|
||||
.collect();
|
||||
@@ -112,7 +115,6 @@ impl CompositeFile {
|
||||
let end = data.len();
|
||||
let footer_len_data = data.slice_from(end - 4);
|
||||
let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
|
||||
|
||||
let footer_start = end - 4 - footer_len;
|
||||
let footer_data = data.slice(footer_start, footer_start + footer_len);
|
||||
let mut footer_buffer = footer_data.as_slice();
|
||||
@@ -166,6 +168,17 @@ impl CompositeFile {
|
||||
.get(&FileAddr { field, idx })
|
||||
.map(|&(from, to)| self.data.slice(from, to))
|
||||
}
|
||||
|
||||
pub fn space_usage(&self) -> PerFieldSpaceUsage {
|
||||
let mut fields = HashMap::new();
|
||||
for (&field_addr, &(start, end)) in self.offsets_index.iter() {
|
||||
fields
|
||||
.entry(field_addr.field)
|
||||
.or_insert_with(|| FieldUsage::empty(field_addr.field))
|
||||
.add_field_idx(field_addr.idx, end - start);
|
||||
}
|
||||
PerFieldSpaceUsage::new(fields)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -7,6 +7,8 @@ use std::io::Write;
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub struct VInt(pub u64);
|
||||
|
||||
const STOP_BIT: u8 = 128;
|
||||
|
||||
impl VInt {
|
||||
pub fn val(&self) -> u64 {
|
||||
self.0
|
||||
@@ -15,24 +17,34 @@ impl VInt {
|
||||
pub fn deserialize_u64<R: Read>(reader: &mut R) -> io::Result<u64> {
|
||||
VInt::deserialize(reader).map(|vint| vint.0)
|
||||
}
|
||||
|
||||
pub fn serialize_into_vec(&self, output: &mut Vec<u8>) {
|
||||
let mut buffer = [0u8; 10];
|
||||
let num_bytes = self.serialize_into(&mut buffer);
|
||||
output.extend(&buffer[0..num_bytes]);
|
||||
}
|
||||
|
||||
fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
|
||||
let mut remaining = self.0;
|
||||
for (i, b) in buffer.iter_mut().enumerate() {
|
||||
let next_byte: u8 = (remaining % 128u64) as u8;
|
||||
remaining /= 128u64;
|
||||
if remaining == 0u64 {
|
||||
*b = next_byte | STOP_BIT;
|
||||
return i + 1;
|
||||
} else {
|
||||
*b = next_byte;
|
||||
}
|
||||
}
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for VInt {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let mut remaining = self.0;
|
||||
let mut buffer = [0u8; 10];
|
||||
let mut i = 0;
|
||||
loop {
|
||||
let next_byte: u8 = (remaining % 128u64) as u8;
|
||||
remaining /= 128u64;
|
||||
if remaining == 0u64 {
|
||||
buffer[i] = next_byte | 128u8;
|
||||
return writer.write_all(&buffer[0..i + 1]);
|
||||
} else {
|
||||
buffer[i] = next_byte;
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
let num_bytes = self.serialize_into(&mut buffer);
|
||||
writer.write_all(&buffer[0..num_bytes])
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
@@ -42,20 +54,58 @@ impl BinarySerializable for VInt {
|
||||
loop {
|
||||
match bytes.next() {
|
||||
Some(Ok(b)) => {
|
||||
result += u64::from(b % 128u8) << shift;
|
||||
if b & 128u8 != 0u8 {
|
||||
break;
|
||||
result |= u64::from(b % 128u8) << shift;
|
||||
if b >= STOP_BIT {
|
||||
return Ok(VInt(result));
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
_ => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Reach end of buffer",
|
||||
"Reach end of buffer while reading VInt",
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(VInt(result))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::VInt;
|
||||
use common::BinarySerializable;
|
||||
|
||||
fn aux_test_vint(val: u64) {
|
||||
let mut v = [14u8; 10];
|
||||
let num_bytes = VInt(val).serialize_into(&mut v);
|
||||
for i in num_bytes..10 {
|
||||
assert_eq!(v[i], 14u8);
|
||||
}
|
||||
assert!(num_bytes > 0);
|
||||
if num_bytes < 10 {
|
||||
assert!(1u64 << (7 * num_bytes) > val);
|
||||
}
|
||||
if num_bytes > 1 {
|
||||
assert!(1u64 << (7 * (num_bytes - 1)) <= val);
|
||||
}
|
||||
let serdeser_val = VInt::deserialize(&mut &v[..]).unwrap();
|
||||
assert_eq!(val, serdeser_val.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vint() {
|
||||
aux_test_vint(0);
|
||||
aux_test_vint(1);
|
||||
aux_test_vint(5);
|
||||
aux_test_vint(u64::max_value());
|
||||
for i in 1..9 {
|
||||
let power_of_128 = 1u64 << (7 * i);
|
||||
aux_test_vint(power_of_128 - 1u64);
|
||||
aux_test_vint(power_of_128);
|
||||
aux_test_vint(power_of_128 + 1u64);
|
||||
}
|
||||
aux_test_vint(10);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,160 +0,0 @@
|
||||
use compression::compressed_block_size;
|
||||
use compression::BlockDecoder;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use directory::ReadOnlySource;
|
||||
use owned_read::OwnedRead;
|
||||
|
||||
/// Reads a stream of compressed ints.
|
||||
///
|
||||
/// Tantivy uses `CompressedIntStream` to read
|
||||
/// the position file.
|
||||
/// The `.skip(...)` makes it possible to avoid
|
||||
/// decompressing blocks that are not required.
|
||||
pub struct CompressedIntStream {
|
||||
buffer: OwnedRead,
|
||||
|
||||
block_decoder: BlockDecoder,
|
||||
cached_addr: usize, // address of the currently decoded block
|
||||
cached_next_addr: usize, // address following the currently decoded block
|
||||
|
||||
addr: usize, // address of the block associated to the current position
|
||||
inner_offset: usize,
|
||||
}
|
||||
|
||||
impl CompressedIntStream {
|
||||
/// Opens a compressed int stream.
|
||||
pub(crate) fn wrap(source: ReadOnlySource) -> CompressedIntStream {
|
||||
CompressedIntStream {
|
||||
buffer: OwnedRead::new(source),
|
||||
block_decoder: BlockDecoder::new(),
|
||||
cached_addr: usize::max_value(),
|
||||
cached_next_addr: usize::max_value(),
|
||||
|
||||
addr: 0,
|
||||
inner_offset: 0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Loads the block at the given address and return the address of the
|
||||
/// following block
|
||||
pub fn read_block(&mut self, addr: usize) -> usize {
|
||||
if self.cached_addr == addr {
|
||||
// we are already on this block.
|
||||
// no need to read.
|
||||
self.cached_next_addr
|
||||
} else {
|
||||
let next_addr = addr + self.block_decoder
|
||||
.uncompress_block_unsorted(self.buffer.slice_from(addr));
|
||||
self.cached_addr = addr;
|
||||
self.cached_next_addr = next_addr;
|
||||
next_addr
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills a buffer with the next `output.len()` integers.
|
||||
/// This does not consume / advance the stream.
|
||||
pub fn read(&mut self, output: &mut [u32]) {
|
||||
let mut cursor = self.addr;
|
||||
let mut inner_offset = self.inner_offset;
|
||||
let mut num_els: usize = output.len();
|
||||
let mut start = 0;
|
||||
loop {
|
||||
cursor = self.read_block(cursor);
|
||||
let block = &self.block_decoder.output_array()[inner_offset..];
|
||||
let block_len = block.len();
|
||||
if num_els >= block_len {
|
||||
output[start..start + block_len].clone_from_slice(&block);
|
||||
start += block_len;
|
||||
num_els -= block_len;
|
||||
inner_offset = 0;
|
||||
} else {
|
||||
output[start..].clone_from_slice(&block[..num_els]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Skip the next `skip_len` integer.
|
||||
///
|
||||
/// If a full block is skipped, calling
|
||||
/// `.skip(...)` will avoid decompressing it.
|
||||
///
|
||||
/// May panic if the end of the stream is reached.
|
||||
pub fn skip(&mut self, mut skip_len: usize) {
|
||||
loop {
|
||||
let available = COMPRESSION_BLOCK_SIZE - self.inner_offset;
|
||||
if available >= skip_len {
|
||||
self.inner_offset += skip_len;
|
||||
break;
|
||||
} else {
|
||||
skip_len -= available;
|
||||
// entirely skip decompressing some blocks.
|
||||
let num_bits: u8 = self.buffer.get(self.addr);
|
||||
let block_len = compressed_block_size(num_bits);
|
||||
self.addr += block_len;
|
||||
self.inner_offset = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use super::CompressedIntStream;
|
||||
use compression::compressed_block_size;
|
||||
use compression::BlockEncoder;
|
||||
use compression::COMPRESSION_BLOCK_SIZE;
|
||||
use directory::ReadOnlySource;
|
||||
|
||||
fn create_stream_buffer() -> ReadOnlySource {
|
||||
let mut buffer: Vec<u8> = vec![];
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let vals: Vec<u32> = (0u32..1152u32).collect();
|
||||
for chunk in vals.chunks(COMPRESSION_BLOCK_SIZE) {
|
||||
let compressed_block = encoder.compress_block_unsorted(chunk);
|
||||
let num_bits = compressed_block[0];
|
||||
assert_eq!(compressed_block_size(num_bits), compressed_block.len());
|
||||
buffer.extend_from_slice(compressed_block);
|
||||
}
|
||||
if cfg!(simd) {
|
||||
buffer.extend_from_slice(&[0u8; 7]);
|
||||
}
|
||||
ReadOnlySource::from(buffer)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compressed_int_stream() {
|
||||
let buffer = create_stream_buffer();
|
||||
let mut stream = CompressedIntStream::wrap(buffer);
|
||||
let mut block: [u32; COMPRESSION_BLOCK_SIZE] = [0u32; COMPRESSION_BLOCK_SIZE];
|
||||
|
||||
stream.read(&mut block[0..2]);
|
||||
assert_eq!(block[0], 0);
|
||||
assert_eq!(block[1], 1);
|
||||
|
||||
// reading does not consume the stream
|
||||
stream.read(&mut block[0..2]);
|
||||
assert_eq!(block[0], 0);
|
||||
assert_eq!(block[1], 1);
|
||||
stream.skip(2);
|
||||
|
||||
stream.skip(5);
|
||||
stream.read(&mut block[0..3]);
|
||||
stream.skip(3);
|
||||
|
||||
assert_eq!(block[0], 7);
|
||||
assert_eq!(block[1], 8);
|
||||
assert_eq!(block[2], 9);
|
||||
stream.skip(500);
|
||||
stream.read(&mut block[0..3]);
|
||||
stream.skip(3);
|
||||
|
||||
assert_eq!(block[0], 510);
|
||||
assert_eq!(block[1], 511);
|
||||
assert_eq!(block[2], 512);
|
||||
stream.skip(511);
|
||||
stream.read(&mut block[..1]);
|
||||
assert_eq!(block[0], 1024);
|
||||
}
|
||||
}
|
||||
src/core/executor.rs (new file, 137 lines)
@@ -0,0 +1,137 @@
|
||||
use crossbeam::channel;
|
||||
use scoped_pool::{Pool, ThreadConfig};
|
||||
use Result;
|
||||
|
||||
/// Search executor, whether search requests are single-threaded or multithreaded.
|
||||
///
|
||||
/// We don't expose Rayon thread pool directly here for several reasons.
|
||||
///
|
||||
/// First, dependency hell. It is not a good idea to expose the
|
||||
/// API of a dependency, knowing it might conflict with a different version
|
||||
/// used by the client. Second, we may stop using rayon in the future.
|
||||
pub enum Executor {
|
||||
SingleThread,
|
||||
ThreadPool(Pool),
|
||||
}
|
||||
|
||||
impl Executor {
|
||||
/// Creates an Executor that performs all tasks in the caller's thread.
|
||||
pub fn single_thread() -> Executor {
|
||||
Executor::SingleThread
|
||||
}
|
||||
|
||||
// Creates an Executor that dispatches the tasks in a thread pool.
|
||||
pub fn multi_thread(num_threads: usize, prefix: &'static str) -> Executor {
|
||||
let thread_config = ThreadConfig::new().prefix(prefix);
|
||||
let pool = Pool::with_thread_config(num_threads, thread_config);
|
||||
Executor::ThreadPool(pool)
|
||||
}
|
||||
|
||||
// Perform a map in the thread pool.
|
||||
//
|
||||
// Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task
|
||||
// will propagate to the caller.
|
||||
pub fn map<
|
||||
A: Send,
|
||||
R: Send,
|
||||
AIterator: Iterator<Item = A>,
|
||||
F: Sized + Sync + Fn(A) -> Result<R>,
|
||||
>(
|
||||
&self,
|
||||
f: F,
|
||||
args: AIterator,
|
||||
) -> Result<Vec<R>> {
|
||||
match self {
|
||||
Executor::SingleThread => args.map(f).collect::<Result<_>>(),
|
||||
Executor::ThreadPool(pool) => {
|
||||
let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
|
||||
let num_fruits = args_with_indices.len();
|
||||
let fruit_receiver = {
|
||||
let (fruit_sender, fruit_receiver) = channel::unbounded();
|
||||
pool.scoped(|scope| {
|
||||
for arg_with_idx in args_with_indices {
|
||||
scope.execute(|| {
|
||||
let (idx, arg) = arg_with_idx;
|
||||
let fruit = f(arg);
|
||||
if let Err(err) = fruit_sender.send((idx, fruit)) {
|
||||
error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
fruit_receiver
|
||||
// This ends the scope of fruit_sender.
|
||||
// This is important as it makes it possible for the fruit_receiver iteration to
|
||||
// terminate.
|
||||
};
|
||||
// This is lame, but it does not use unsafe code.
|
||||
let mut results_with_position = Vec::with_capacity(num_fruits);
|
||||
for (pos, fruit_res) in fruit_receiver {
|
||||
let fruit = fruit_res?;
|
||||
results_with_position.push((pos, fruit));
|
||||
}
|
||||
results_with_position.sort_by_key(|(pos, _)| *pos);
|
||||
assert_eq!(results_with_position.len(), num_fruits);
|
||||
Ok(results_with_position
|
||||
.into_iter()
|
||||
.map(|(_, fruit)| fruit)
|
||||
.collect::<Vec<_>>())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::Executor;
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "panic should propagate")]
|
||||
fn test_panic_propagates_single_thread() {
|
||||
let _result: Vec<usize> = Executor::single_thread()
|
||||
.map(
|
||||
|_| {
|
||||
panic!("panic should propagate");
|
||||
},
|
||||
vec![0].into_iter(),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic] //< unfortunately the panic message is not propagated
|
||||
fn test_panic_propagates_multi_thread() {
|
||||
let _result: Vec<usize> = Executor::multi_thread(1, "search-test")
|
||||
.map(
|
||||
|_| {
|
||||
panic!("panic should propagate");
|
||||
},
|
||||
vec![0].into_iter(),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_map_singlethread() {
|
||||
let result: Vec<usize> = Executor::single_thread()
|
||||
.map(|i| Ok(i * 2), 0..1_000)
|
||||
.unwrap();
|
||||
assert_eq!(result.len(), 1_000);
|
||||
for i in 0..1_000 {
|
||||
assert_eq!(result[i], i * 2);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_map_multithread() {
|
||||
let result: Vec<usize> = Executor::multi_thread(3, "search-test")
|
||||
.map(|i| Ok(i * 2), 0..10)
|
||||
.unwrap();
|
||||
assert_eq!(result.len(), 10);
|
||||
for i in 0..10 {
|
||||
assert_eq!(result[i], i * 2);
|
||||
}
|
||||
}
|
||||
@@ -1,18 +1,11 @@
|
||||
use core::SegmentId;
|
||||
use error::{ErrorKind, ResultExt};
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use Result;
|
||||
|
||||
use super::pool::LeasedItem;
|
||||
use super::pool::Pool;
|
||||
use super::segment::create_segment;
|
||||
use super::segment::Segment;
|
||||
use core::searcher::Searcher;
|
||||
use core::Executor;
|
||||
use core::IndexMeta;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use core::META_FILEPATH;
|
||||
@@ -20,32 +13,79 @@ use directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use error::DataCorruption;
|
||||
use error::TantivyError;
|
||||
use indexer::index_writer::open_index_writer;
|
||||
use indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use indexer::DirectoryLock;
|
||||
use indexer::LockType;
|
||||
use num_cpus;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::TokenizerManager;
|
||||
use IndexWriter;
|
||||
|
||||
const NUM_SEARCHERS: usize = 12;
|
||||
use Result;
|
||||
|
||||
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
|
||||
let meta_data = directory.atomic_read(&META_FILEPATH)?;
|
||||
let meta_string = String::from_utf8_lossy(&meta_data);
|
||||
serde_json::from_str(&meta_string).chain_err(|| ErrorKind::CorruptedFile(META_FILEPATH.clone()))
|
||||
serde_json::from_str(&meta_string)
|
||||
.map_err(|e| {
|
||||
DataCorruption::new(
|
||||
META_FILEPATH.clone(),
|
||||
format!("Meta file cannot be deserialized. {:?}.", e),
|
||||
)
|
||||
})
|
||||
.map_err(From::from)
|
||||
}
|
||||
|
||||
/// Search Index
|
||||
pub struct Index {
|
||||
directory: ManagedDirectory,
|
||||
schema: Schema,
|
||||
num_searchers: Arc<AtomicUsize>,
|
||||
searcher_pool: Arc<Pool<Searcher>>,
|
||||
executor: Arc<Executor>,
|
||||
tokenizers: TokenizerManager,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
/// Examines the directory to see if it contains an index
|
||||
pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
|
||||
dir.exists(&META_FILEPATH)
|
||||
}
|
||||
|
||||
/// Accessor to the search executor.
|
||||
///
|
||||
/// This pool is used by default when calling `searcher.search(...)`
|
||||
/// to perform search on the individual segments.
|
||||
///
|
||||
/// By default the executor is single thread, and simply runs in the calling thread.
|
||||
pub fn search_executor(&self) -> &Executor {
|
||||
self.executor.as_ref()
|
||||
}
|
||||
|
||||
/// Replace the default single thread search executor pool
|
||||
/// by a thread pool with a given number of threads.
|
||||
pub fn set_multithread_executor(&mut self, num_threads: usize) {
|
||||
self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-"));
|
||||
}
|
||||
|
||||
/// Replace the default single thread search executor pool
|
||||
/// by a thread pool with a given number of threads.
|
||||
pub fn set_default_multithread_executor(&mut self) {
|
||||
let default_num_threads = num_cpus::get();
|
||||
self.set_multithread_executor(default_num_threads);
|
||||
}
|
||||
|
||||
/// Creates a new index using the `RAMDirectory`.
|
||||
///
|
||||
/// The index will be allocated in anonymous memory.
|
||||
@@ -62,9 +102,30 @@ impl Index {
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
|
||||
let mmap_directory = MmapDirectory::open(directory_path)?;
|
||||
if Index::exists(&mmap_directory) {
|
||||
return Err(TantivyError::IndexAlreadyExists);
|
||||
}
|
||||
|
||||
Index::create(mmap_directory, schema)
|
||||
}
|
||||
|
||||
/// Opens or creates a new index in the provided directory
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
|
||||
if Index::exists(&dir) {
|
||||
let index = Index::open(dir)?;
|
||||
if index.schema() == schema {
|
||||
Ok(index)
|
||||
} else {
|
||||
Err(TantivyError::SchemaError(
|
||||
"An index exists but the schema does not match.".to_string(),
|
||||
))
|
||||
}
|
||||
} else {
|
||||
Index::create(dir, schema)
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
///
|
||||
/// The index will use the `MMapDirectory` in a newly created directory.
|
||||
@@ -81,13 +142,15 @@ impl Index {
|
||||
|
||||
/// Creates a new index given an implementation of the trait `Directory`
|
||||
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(dir)?;
|
||||
let directory = ManagedDirectory::wrap(dir)?;
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
/// Create a new index from a directory.
|
||||
///
|
||||
/// This will overwrite existing meta.json
|
||||
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
|
||||
save_new_metas(schema.clone(), directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
@@ -95,11 +158,14 @@ impl Index {
|
||||
/// Creates a new index given a directory and an `IndexMeta`.
|
||||
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
|
||||
let schema = metas.schema.clone();
|
||||
let n_cpus = num_cpus::get();
|
||||
let index = Index {
|
||||
directory,
|
||||
schema,
|
||||
num_searchers: Arc::new(AtomicUsize::new(n_cpus)),
|
||||
searcher_pool: Arc::new(Pool::new()),
|
||||
tokenizers: TokenizerManager::default(),
|
||||
executor: Arc::new(Executor::single_thread()),
|
||||
};
|
||||
index.load_searchers()?;
|
||||
Ok(index)
|
||||
@@ -110,6 +176,27 @@ impl Index {
|
||||
&self.tokenizers
|
||||
}
|
||||
|
||||
/// Helper to access the tokenizer associated to a specific field.
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let tokenizer_manager: &TokenizerManager = self.tokenizers();
|
||||
let tokenizer_name_opt: Option<Box<BoxedTokenizer>> = match field_type {
|
||||
FieldType::Str(text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
|
||||
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
|
||||
_ => None,
|
||||
};
|
||||
match tokenizer_name_opt {
|
||||
Some(tokenizer) => Ok(tokenizer),
|
||||
None => Err(TantivyError::SchemaError(format!(
|
||||
"{:?} is not a text field.",
|
||||
field_entry.name()
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
@@ -119,7 +206,7 @@ impl Index {
|
||||
|
||||
/// Open the index using the provided directory
|
||||
pub fn open<D: Directory>(directory: D) -> Result<Index> {
|
||||
let directory = ManagedDirectory::new(directory)?;
|
||||
let directory = ManagedDirectory::wrap(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
}
|
||||
@@ -153,7 +240,7 @@ impl Index {
|
||||
num_threads: usize,
|
||||
overall_heap_size_in_bytes: usize,
|
||||
) -> Result<IndexWriter> {
|
||||
let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
|
||||
let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
open_index_writer(
|
||||
self,
|
||||
@@ -191,7 +278,8 @@ impl Index {
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
@@ -226,27 +314,41 @@ impl Index {
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
|
||||
Ok(self.searchable_segment_metas()?
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Creates a new generation of searchers after
|
||||
|
||||
/// a change of the set of searchable indexes.
|
||||
/// Sets the number of searchers to use
|
||||
///
|
||||
/// This needs to be called when a new segment has been
|
||||
/// published or after a merge.
|
||||
/// Only works after the next call to `load_searchers`
|
||||
pub fn set_num_searchers(&mut self, num_searchers: usize) {
|
||||
self.num_searchers.store(num_searchers, Ordering::Release);
|
||||
}
|
||||
|
||||
/// Update searchers so that they reflect the state of the last
|
||||
/// `.commit()`.
|
||||
///
|
||||
/// If indexing happens in the same process as searching,
|
||||
/// you most likely want to call `.load_searchers()` right after each
|
||||
/// successful call to `.commit()`.
|
||||
///
|
||||
/// If indexing and searching happen in different processes, the way to
/// get the freshest `index` at all times is to watch `meta.json` and
/// call `load_searchers` whenever a change happens.
|
||||
pub fn load_searchers(&self) -> Result<()> {
|
||||
let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;
|
||||
let searchable_segments = self.searchable_segments()?;
|
||||
let segment_readers: Vec<SegmentReader> = searchable_segments
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<Result<_>>()?;
|
||||
let schema = self.schema();
|
||||
let searchers = (0..NUM_SEARCHERS)
|
||||
.map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
|
||||
let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
|
||||
let searchers = (0..num_searchers)
|
||||
.map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
|
||||
.collect();
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
@@ -256,7 +358,7 @@ impl Index {
|
||||
///
|
||||
/// This method should be called every single time a search
|
||||
/// query is performed.
|
||||
/// The searchers are taken from a pool of `NUM_SEARCHERS` searchers.
|
||||
/// The searchers are taken from a pool of `num_searchers` searchers.
|
||||
/// If no searcher is available
|
||||
/// this may block.
|
||||
///
|
||||
@@ -278,8 +380,82 @@ impl Clone for Index {
|
||||
Index {
|
||||
directory: self.directory.clone(),
|
||||
schema: self.schema.clone(),
|
||||
num_searchers: Arc::clone(&self.num_searchers),
|
||||
searcher_pool: Arc::clone(&self.searcher_pool),
|
||||
tokenizers: self.tokenizers.clone(),
|
||||
executor: self.executor.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use directory::RAMDirectory;
|
||||
use schema::{Schema, INT_INDEXED, TEXT};
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_indexer_for_field() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
|
||||
let body_field = schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
assert!(index.tokenizer_for_field(body_field).is_ok());
|
||||
assert_eq!(
|
||||
format!("{:?}", index.tokenizer_for_field(num_likes_field).err()),
|
||||
"Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_exists() {
|
||||
let directory = RAMDirectory::create();
|
||||
assert!(!Index::exists(&directory));
|
||||
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
|
||||
assert!(Index::exists(&directory));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn open_or_create_should_create() {
|
||||
let directory = RAMDirectory::create();
|
||||
assert!(!Index::exists(&directory));
|
||||
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
|
||||
assert!(Index::exists(&directory));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn open_or_create_should_open() {
|
||||
let directory = RAMDirectory::create();
|
||||
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
|
||||
assert!(Index::exists(&directory));
|
||||
assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn create_should_wipeoff_existing() {
|
||||
let directory = RAMDirectory::create();
|
||||
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
|
||||
assert!(Index::exists(&directory));
|
||||
assert!(Index::create(directory.clone(), Schema::builder().build()).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn open_or_create_exists_but_schema_does_not_match() {
|
||||
let directory = RAMDirectory::create();
|
||||
assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
|
||||
assert!(Index::exists(&directory));
|
||||
assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
|
||||
let err = Index::open_or_create(directory, Schema::builder().build());
|
||||
assert_eq!(
|
||||
format!("{:?}", err.unwrap_err()),
|
||||
"SchemaError(\"An index exists but the schema does not match.\")"
|
||||
);
|
||||
}
|
||||
|
||||
fn throw_away_schema() -> Schema {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
|
||||
schema_builder.build()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,19 +46,19 @@ impl fmt::Debug for IndexMeta {
|
||||
mod tests {
|
||||
|
||||
use super::IndexMeta;
|
||||
use schema::{SchemaBuilder, TEXT};
|
||||
use schema::{Schema, TEXT};
|
||||
use serde_json;
|
||||
|
||||
#[test]
|
||||
fn test_serialize_metas() {
|
||||
let schema = {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("text", TEXT);
|
||||
schema_builder.build()
|
||||
};
|
||||
let index_metas = IndexMeta {
|
||||
segments: Vec::new(),
|
||||
schema: schema,
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
payload: None,
|
||||
};
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
use common::BinarySerializable;
|
||||
use compression::CompressedIntStream;
|
||||
use directory::ReadOnlySource;
|
||||
use postings::FreqReadingOption;
|
||||
use owned_read::OwnedRead;
|
||||
use positions::PositionReader;
|
||||
use postings::TermInfo;
|
||||
use postings::{BlockSegmentPostings, SegmentPostings};
|
||||
use schema::FieldType;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use termdict::TermDictionary;
|
||||
use owned_read::OwnedRead;
|
||||
|
||||
/// The inverted index reader is in charge of accessing
|
||||
/// the inverted index associated to a specific field.
|
||||
@@ -27,15 +26,18 @@ pub struct InvertedIndexReader {
|
||||
termdict: TermDictionary,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
positions_idx_source: ReadOnlySource,
|
||||
record_option: IndexRecordOption,
|
||||
total_num_tokens: u64,
|
||||
}
|
||||
|
||||
impl InvertedIndexReader {
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symmetry
|
||||
pub(crate) fn new(
|
||||
termdict: TermDictionary,
|
||||
postings_source: ReadOnlySource,
|
||||
positions_source: ReadOnlySource,
|
||||
positions_idx_source: ReadOnlySource,
|
||||
record_option: IndexRecordOption,
|
||||
) -> InvertedIndexReader {
|
||||
let total_num_tokens_data = postings_source.slice(0, 8);
|
||||
@@ -45,6 +47,7 @@ impl InvertedIndexReader {
|
||||
termdict,
|
||||
postings_source: postings_source.slice_from(8),
|
||||
positions_source,
|
||||
positions_idx_source,
|
||||
record_option,
|
||||
total_num_tokens,
|
||||
}
|
||||
@@ -52,14 +55,15 @@ impl InvertedIndexReader {
|
||||
|
||||
/// Creates an empty `InvertedIndexReader` object, which
|
||||
/// contains no terms at all.
|
||||
pub fn empty(field_type: FieldType) -> InvertedIndexReader {
|
||||
pub fn empty(field_type: &FieldType) -> InvertedIndexReader {
|
||||
let record_option = field_type
|
||||
.get_index_record_option()
|
||||
.unwrap_or(IndexRecordOption::Basic);
|
||||
InvertedIndexReader {
|
||||
termdict: TermDictionary::empty(field_type),
|
||||
termdict: TermDictionary::empty(&field_type),
|
||||
postings_source: ReadOnlySource::empty(),
|
||||
positions_source: ReadOnlySource::empty(),
|
||||
positions_idx_source: ReadOnlySource::empty(),
|
||||
record_option,
|
||||
total_num_tokens: 0u64,
|
||||
}
|
||||
@@ -94,7 +98,20 @@ impl InvertedIndexReader {
|
||||
let end_source = self.postings_source.len();
|
||||
let postings_slice = self.postings_source.slice(offset, end_source);
|
||||
let postings_reader = OwnedRead::new(postings_slice);
|
||||
block_postings.reset(term_info.doc_freq as usize, postings_reader);
|
||||
block_postings.reset(term_info.doc_freq, postings_reader);
|
||||
}
|
||||
|
||||
/// Returns a block postings given a `Term`.
|
||||
/// This method is for an advanced usage only.
|
||||
///
|
||||
/// Most users should prefer using `read_postings` instead.
|
||||
pub fn read_block_postings(
|
||||
&self,
|
||||
term: &Term,
|
||||
option: IndexRecordOption,
|
||||
) -> Option<BlockSegmentPostings> {
|
||||
self.get_term_info(term)
|
||||
.map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
/// Returns a block postings given a `term_info`.
|
||||
@@ -108,15 +125,11 @@ impl InvertedIndexReader {
|
||||
) -> BlockSegmentPostings {
|
||||
let offset = term_info.postings_offset as usize;
|
||||
let postings_data = self.postings_source.slice_from(offset);
|
||||
let freq_reading_option = match (self.record_option, requested_option) {
|
||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||
(_, IndexRecordOption::Basic) => FreqReadingOption::SkipFreq,
|
||||
(_, _) => FreqReadingOption::ReadFreq,
|
||||
};
|
||||
BlockSegmentPostings::from_data(
|
||||
term_info.doc_freq as usize,
|
||||
term_info.doc_freq,
|
||||
OwnedRead::new(postings_data),
|
||||
freq_reading_option,
|
||||
self.record_option,
|
||||
requested_option,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -132,11 +145,11 @@ impl InvertedIndexReader {
|
||||
let block_postings = self.read_block_postings_from_terminfo(term_info, option);
|
||||
let position_stream = {
|
||||
if option.has_positions() {
|
||||
let position_offset = term_info.positions_offset;
|
||||
let positions_source = self.positions_source.slice_from(position_offset as usize);
|
||||
let mut stream = CompressedIntStream::wrap(positions_source);
|
||||
stream.skip(term_info.positions_inner_offset as usize);
|
||||
Some(stream)
|
||||
let position_reader = self.positions_source.clone();
|
||||
let skip_reader = self.positions_idx_source.clone();
|
||||
let position_reader =
|
||||
PositionReader::new(position_reader, skip_reader, term_info.positions_idx);
|
||||
Some(position_reader)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -161,8 +174,8 @@ impl InvertedIndexReader {
|
||||
/// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
|
||||
/// with `DocId`s and frequencies.
|
||||
pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
self.get_term_info(term)
|
||||
.map(move |term_info| self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
pub(crate) fn read_postings_no_deletes(
|
||||
@@ -170,8 +183,8 @@ impl InvertedIndexReader {
|
||||
term: &Term,
|
||||
option: IndexRecordOption,
|
||||
) -> Option<SegmentPostings> {
|
||||
let term_info = get!(self.get_term_info(term));
|
||||
Some(self.read_postings_from_terminfo(&term_info, option))
|
||||
self.get_term_info(term)
|
||||
.map(|term_info| self.read_postings_from_terminfo(&term_info, option))
|
||||
}
|
||||
|
||||
/// Returns the number of documents containing the term.
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
mod executor;
|
||||
pub mod index;
|
||||
mod index_meta;
|
||||
mod inverted_index_reader;
|
||||
@@ -9,6 +10,7 @@ mod segment_id;
|
||||
mod segment_meta;
|
||||
mod segment_reader;
|
||||
|
||||
pub use self::executor::Executor;
|
||||
pub use self::index::Index;
|
||||
pub use self::index_meta::IndexMeta;
|
||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||
@@ -33,10 +35,4 @@ lazy_static! {
|
||||
/// Removing this file is safe, but will prevent the garbage collection of all of the file that
|
||||
/// are currently in the directory
|
||||
pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
|
||||
|
||||
/// Only one process should be able to write tantivy's index at a time.
|
||||
/// This file, when present, is in charge of preventing other processes from opening an IndexWriter.
|
||||
///
|
||||
/// If the process is killed and this file remains, it is safe to remove it manually.
|
||||
pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crossbeam::sync::MsQueue;
|
||||
use crossbeam::queue::MsQueue;
|
||||
use std::mem;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
@@ -87,7 +87,8 @@ impl<T> Deref for LeasedItem<T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &T {
|
||||
&self.gen_item
|
||||
&self
|
||||
.gen_item
|
||||
.as_ref()
|
||||
.expect("Unwrapping a leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
@@ -96,7 +97,8 @@ impl<T> Deref for LeasedItem<T> {
|
||||
|
||||
impl<T> DerefMut for LeasedItem<T> {
|
||||
fn deref_mut(&mut self) -> &mut T {
|
||||
&mut self.gen_item
|
||||
&mut self
|
||||
.gen_item
|
||||
.as_mut()
|
||||
.expect("Unwrapping a mut leased item should never fail")
|
||||
.item // unwrap is safe here
|
||||
|
||||
@@ -1,16 +1,43 @@
|
||||
use collector::Collector;
|
||||
use collector::SegmentCollector;
|
||||
use core::Executor;
|
||||
use core::InvertedIndexReader;
|
||||
use core::SegmentReader;
|
||||
use query::Query;
|
||||
use query::Scorer;
|
||||
use query::Weight;
|
||||
use schema::Document;
|
||||
use schema::Schema;
|
||||
use schema::{Field, Term};
|
||||
use space_usage::SearcherSpaceUsage;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use store::StoreReader;
|
||||
use termdict::TermMerger;
|
||||
use DocAddress;
|
||||
use Index;
|
||||
use Result;
|
||||
|
||||
fn collect_segment<C: Collector>(
|
||||
collector: &C,
|
||||
weight: &Weight,
|
||||
segment_ord: u32,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Result<C::Fruit> {
|
||||
let mut scorer = weight.scorer(segment_reader)?;
|
||||
let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
|
||||
if let Some(delete_bitset) = segment_reader.delete_bitset() {
|
||||
scorer.for_each(&mut |doc, score| {
|
||||
if !delete_bitset.is_deleted(doc) {
|
||||
segment_collector.collect(doc, score);
|
||||
}
|
||||
});
|
||||
} else {
|
||||
scorer.for_each(&mut |doc, score| segment_collector.collect(doc, score));
|
||||
}
|
||||
Ok(segment_collector.harvest())
|
||||
}
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
/// It guarantees that the `Segment` will not be removed before
|
||||
@@ -18,25 +45,43 @@ use Result;
|
||||
///
|
||||
pub struct Searcher {
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
store_readers: Vec<StoreReader>,
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
/// Creates a new `Searcher`
|
||||
pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
|
||||
pub(crate) fn new(
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
) -> Searcher {
|
||||
let store_readers = segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.get_store_reader())
|
||||
.collect();
|
||||
Searcher {
|
||||
schema,
|
||||
index,
|
||||
segment_readers,
|
||||
store_readers,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the `Index` associated to the `Searcher`
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
}
|
||||
|
||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||
///
|
||||
/// The searcher uses the segment ordinal to route the
/// request to the right `Segment`.
|
||||
pub fn doc(&self, doc_address: &DocAddress) -> Result<Document> {
|
||||
let DocAddress(segment_local_id, doc_id) = *doc_address;
|
||||
let segment_reader = &self.segment_readers[segment_local_id as usize];
|
||||
segment_reader.doc(doc_id)
|
||||
pub fn doc(&self, doc_address: DocAddress) -> Result<Document> {
|
||||
let DocAddress(segment_local_id, doc_id) = doc_address;
|
||||
let store_reader = &self.store_readers[segment_local_id as usize];
|
||||
store_reader.get(doc_id)
|
||||
}
|
||||
|
||||
/// Access the schema associated to the index of this searcher.
|
||||
@@ -48,7 +93,7 @@ impl Searcher {
|
||||
pub fn num_docs(&self) -> u64 {
|
||||
self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.num_docs() as u64)
|
||||
.map(|segment_reader| u64::from(segment_reader.num_docs()))
|
||||
.sum::<u64>()
|
||||
}
|
||||
|
||||
@@ -57,7 +102,9 @@ impl Searcher {
|
||||
pub fn doc_freq(&self, term: &Term) -> u64 {
|
||||
self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term) as u64)
|
||||
.map(|segment_reader| {
|
||||
u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
|
||||
})
|
||||
.sum::<u64>()
|
||||
}
|
||||
|
||||
@@ -71,19 +118,78 @@ impl Searcher {
|
||||
&self.segment_readers[segment_ord as usize]
|
||||
}
|
||||
|
||||
/// Runs a query on the segment readers wrapped by the searcher
|
||||
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
|
||||
query.search(self, collector)
|
||||
/// Runs a query on the segment readers wrapped by the searcher.
|
||||
///
|
||||
/// Search works as follows:
|
||||
///
|
||||
/// First the weight object associated to the query is created.
|
||||
///
|
||||
/// Then, the query loops over the segments and, for each segment:
/// - sets up the collector and informs it that the segment being processed has changed.
|
||||
/// - creates a SegmentCollector for collecting documents associated to the segment
|
||||
/// - creates a `Scorer` object associated for this segment
|
||||
/// - iterate through the matched documents and push them to the segment collector.
|
||||
///
|
||||
/// Finally, the Collector merges each of the child collectors into itself for result usability
|
||||
/// by the caller.
|
||||
pub fn search<C: Collector>(&self, query: &Query, collector: &C) -> Result<C::Fruit> {
|
||||
let executor = self.index.search_executor();
|
||||
self.search_with_executor(query, collector, executor)
|
||||
}
|
||||
|
||||
/// Same as [`search(...)`](#method.search) but multithreaded.
|
||||
///
|
||||
/// The current implementation is rather naive:
/// multithreading is done by splitting the search into as many tasks
/// as there are segments.
|
||||
///
|
||||
/// It is powerless at making search faster if your index consists of
/// one large segment.
|
||||
///
|
||||
/// Also, keep in mind that multithreading a single query on several
/// threads will not improve your throughput. It can actually
/// hurt it. It will, however, decrease the average response time.
|
||||
pub fn search_with_executor<C: Collector>(
|
||||
&self,
|
||||
query: &Query,
|
||||
collector: &C,
|
||||
executor: &Executor,
|
||||
) -> Result<C::Fruit> {
|
||||
let scoring_enabled = collector.requires_scoring();
|
||||
let weight = query.weight(self, scoring_enabled)?;
|
||||
let segment_readers = self.segment_readers();
|
||||
let fruits = executor.map(
|
||||
|(segment_ord, segment_reader)| {
|
||||
collect_segment(
|
||||
collector,
|
||||
weight.as_ref(),
|
||||
segment_ord as u32,
|
||||
segment_reader,
|
||||
)
|
||||
},
|
||||
segment_readers.iter().enumerate(),
|
||||
)?;
|
||||
collector.merge_fruits(fruits)
|
||||
}
|
||||
|
||||
/// Return the field searcher associated to a `Field`.
|
||||
pub fn field(&self, field: Field) -> FieldSearcher {
|
||||
let inv_index_readers = self.segment_readers
|
||||
let inv_index_readers = self
|
||||
.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.inverted_index(field))
|
||||
.collect::<Vec<_>>();
|
||||
FieldSearcher::new(inv_index_readers)
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this searcher.
|
||||
pub fn space_usage(&self) -> SearcherSpaceUsage {
|
||||
let mut space_usage = SearcherSpaceUsage::new();
|
||||
for segment_reader in self.segment_readers.iter() {
|
||||
space_usage.add_segment(segment_reader.space_usage());
|
||||
}
|
||||
space_usage
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FieldSearcher {
|
||||
@@ -98,7 +204,8 @@ impl FieldSearcher {
|
||||
/// Returns a Stream over all of the sorted unique terms of
|
||||
/// for the given field.
|
||||
pub fn terms(&self) -> TermMerger {
|
||||
let term_streamers: Vec<_> = self.inv_index_readers
|
||||
let term_streamers: Vec<_> = self
|
||||
.inv_index_readers
|
||||
.iter()
|
||||
.map(|inverted_index| inverted_index.terms().stream())
|
||||
.collect();
|
||||
@@ -108,7 +215,8 @@ impl FieldSearcher {
|
||||
|
||||
impl fmt::Debug for Searcher {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let segment_ids = self.segment_readers
|
||||
let segment_ids = self
|
||||
.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.segment_id())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
@@ -10,6 +10,8 @@ pub enum SegmentComponent {
|
||||
POSTINGS,
|
||||
/// Positions of terms in each document.
|
||||
POSITIONS,
|
||||
/// Index to seek within the position file
|
||||
POSITIONSSKIP,
|
||||
/// Column-oriented random-access storage of fields.
|
||||
FASTFIELDS,
|
||||
/// Stores the sum of the length (in terms) of each field for each document.
|
||||
@@ -29,9 +31,10 @@ pub enum SegmentComponent {
|
||||
impl SegmentComponent {
|
||||
/// Iterates through the components.
|
||||
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
|
||||
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
|
||||
SegmentComponent::POSTINGS,
|
||||
SegmentComponent::POSITIONS,
|
||||
SegmentComponent::POSITIONSSKIP,
|
||||
SegmentComponent::FASTFIELDS,
|
||||
SegmentComponent::FIELDNORMS,
|
||||
SegmentComponent::TERMS,
|
||||
|
||||
@@ -52,12 +52,12 @@ impl SegmentId {
|
||||
/// Picking the first 8 chars is ok to identify
|
||||
/// segments in a display message.
|
||||
pub fn short_uuid_string(&self) -> String {
|
||||
(&self.0.simple().to_string()[..8]).to_string()
|
||||
(&self.0.to_simple_ref().to_string()[..8]).to_string()
|
||||
}
|
||||
|
||||
/// Returns a segment uuid string.
|
||||
pub fn uuid_string(&self) -> String {
|
||||
self.0.simple().to_string()
|
||||
self.0.to_simple_ref().to_string()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ impl<'a> serde::Deserialize<'a> for SegmentMeta {
|
||||
{
|
||||
let inner = InnerSegmentMeta::deserialize(deserializer)?;
|
||||
let tracked = INVENTORY.track(inner);
|
||||
Ok(SegmentMeta { tracked: tracked })
|
||||
Ok(SegmentMeta { tracked })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -110,8 +110,9 @@ impl SegmentMeta {
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
path.push_str(&*match component {
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSITIONSSKIP => ".posidx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
|
||||
@@ -4,8 +4,8 @@ use core::InvertedIndexReader;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use error::ErrorKind;
|
||||
use directory::ReadOnlySource;
|
||||
use error::TantivyError;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
@@ -13,10 +13,10 @@ use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use schema::Cardinality;
|
||||
use schema::Document;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use space_usage::SegmentSpaceUsage;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
@@ -44,15 +44,17 @@ pub struct SegmentReader {
|
||||
inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,
|
||||
|
||||
segment_id: SegmentId,
|
||||
segment_meta: SegmentMeta,
|
||||
max_doc: DocId,
|
||||
num_docs: DocId,
|
||||
|
||||
termdict_composite: CompositeFile,
|
||||
postings_composite: CompositeFile,
|
||||
positions_composite: CompositeFile,
|
||||
positions_idx_composite: CompositeFile,
|
||||
fast_fields_composite: CompositeFile,
|
||||
fieldnorms_composite: CompositeFile,
|
||||
|
||||
store_reader: StoreReader,
|
||||
store_source: ReadOnlySource,
|
||||
delete_bitset_opt: Option<DeleteBitSet>,
|
||||
schema: Schema,
|
||||
}
|
||||
@@ -63,7 +65,7 @@ impl SegmentReader {
|
||||
/// Today, `tantivy` does not handle deletes, so it happens
|
||||
/// to also be the number of documents in the index.
|
||||
pub fn max_doc(&self) -> DocId {
|
||||
self.segment_meta.max_doc()
|
||||
self.max_doc
|
||||
}
|
||||
|
||||
/// Returns the number of documents.
|
||||
@@ -72,7 +74,7 @@ impl SegmentReader {
|
||||
/// Today, `tantivy` does not handle deletes so max doc and
|
||||
/// num_docs are the same.
|
||||
pub fn num_docs(&self) -> DocId {
|
||||
self.segment_meta.num_docs()
|
||||
self.num_docs
|
||||
}
|
||||
|
||||
/// Returns the schema of the index this segment belongs to.
|
||||
@@ -152,15 +154,17 @@ impl SegmentReader {
|
||||
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
|
||||
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match field_entry.field_type() {
|
||||
&FieldType::Bytes => {}
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Bytes => {}
|
||||
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
|
||||
}
|
||||
let idx_reader = self.fast_fields_composite
|
||||
let idx_reader = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let values = self.fast_fields_composite
|
||||
let values = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
Ok(BytesFastFieldReader::open(idx_reader, values))
|
||||
@@ -170,22 +174,22 @@ impl SegmentReader {
|
||||
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if field_entry.field_type() != &FieldType::HierarchicalFacet {
|
||||
return Err(ErrorKind::InvalidArgument(format!(
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"The field {:?} is not a \
|
||||
hierarchical facet.",
|
||||
field_entry
|
||||
)).into());
|
||||
)));
|
||||
}
|
||||
let term_ords_reader = self.multi_fast_field_reader(field)?;
|
||||
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
|
||||
ErrorKind::InvalidArgument(format!(
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"The field \"{}\" is a hierarchical \
|
||||
but this segment does not seem to have the field term \
|
||||
dictionary.",
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
let termdict = TermDictionary::from_source(termdict_source);
|
||||
let termdict = TermDictionary::from_source(&termdict_source);
|
||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||
Ok(facet_reader)
|
||||
}
|
||||
@@ -193,8 +197,7 @@ impl SegmentReader {
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
///
|
||||
/// Field norms are the length (in tokens) of the fields.
|
||||
/// It is used in the computation of the [TfIdf]
|
||||
/// (https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
|
||||
/// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
|
||||
///
|
||||
/// They are simply stored as a fast field, serialized in
|
||||
/// the `.fieldnorm` file of the segment.
|
||||
@@ -212,8 +215,8 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `StoreReader`.
|
||||
pub fn get_store_reader(&self) -> &StoreReader {
|
||||
&self.store_reader
|
||||
pub fn get_store_reader(&self) -> StoreReader {
|
||||
StoreReader::from_source(self.store_source.clone())
|
||||
}
|
||||
|
||||
/// Open a new segment for reading.
|
||||
@@ -222,7 +225,8 @@ impl SegmentReader {
|
||||
let termdict_composite = CompositeFile::open(&termdict_source)?;
|
||||
|
||||
let store_source = segment.open_read(SegmentComponent::STORE)?;
|
||||
let store_reader = StoreReader::from_source(store_source);
|
||||
|
||||
fail_point!("SegmentReader::open#middle");
|
||||
|
||||
let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
|
||||
let postings_composite = CompositeFile::open(&postings_source)?;
|
||||
@@ -235,6 +239,14 @@ impl SegmentReader {
|
||||
}
|
||||
};
|
||||
|
||||
let positions_idx_composite = {
|
||||
if let Ok(source) = segment.open_read(SegmentComponent::POSITIONSSKIP) {
|
||||
CompositeFile::open(&source)?
|
||||
} else {
|
||||
CompositeFile::empty()
|
||||
}
|
||||
};
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
|
||||
@@ -251,15 +263,17 @@ impl SegmentReader {
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
segment_meta: segment.meta().clone(),
|
||||
max_doc: segment.meta().max_doc(),
|
||||
num_docs: segment.meta().num_docs(),
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_composite,
|
||||
fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_reader,
|
||||
store_source,
|
||||
delete_bitset_opt,
|
||||
positions_composite,
|
||||
positions_idx_composite,
|
||||
schema,
|
||||
})
|
||||
}
|
||||
@@ -272,7 +286,8 @@ impl SegmentReader {
|
||||
/// term dictionary associated to a specific field,
|
||||
/// and opening the posting list associated to any term.
|
||||
pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
|
||||
if let Some(inv_idx_reader) = self.inv_idx_reader_cache
|
||||
if let Some(inv_idx_reader) = self
|
||||
.inv_idx_reader_cache
|
||||
.read()
|
||||
.expect("Lock poisoned. This should never happen")
|
||||
.get(&field)
|
||||
@@ -296,23 +311,31 @@ impl SegmentReader {
|
||||
// As a result, no data is associated to the inverted index.
|
||||
//
|
||||
// Returns an empty inverted index.
|
||||
return Arc::new(InvertedIndexReader::empty(field_type.clone()));
|
||||
return Arc::new(InvertedIndexReader::empty(field_type));
|
||||
}
|
||||
|
||||
let postings_source = postings_source_opt.unwrap();
|
||||
|
||||
let termdict_source = self.termdict_composite
|
||||
let termdict_source = self
|
||||
.termdict_composite
|
||||
.open_read(field)
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
|
||||
let positions_source = self.positions_composite
|
||||
let positions_source = self
|
||||
.positions_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
let positions_idx_source = self
|
||||
.positions_idx_composite
|
||||
.open_read(field)
|
||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||
|
||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||
TermDictionary::from_source(termdict_source),
|
||||
TermDictionary::from_source(&termdict_source),
|
||||
postings_source,
|
||||
positions_source,
|
||||
positions_idx_source,
|
||||
record_option,
|
||||
));
|
||||
|
||||
@@ -326,14 +349,6 @@ impl SegmentReader {
|
||||
inv_idx_reader
|
||||
}
|
||||
|
||||
/// Returns the document (or to be accurate, its stored field)
|
||||
/// bearing the given doc id.
|
||||
/// This method is slow and should seldom be called from
|
||||
/// within a collector.
|
||||
pub fn doc(&self, doc_id: DocId) -> Result<Document> {
|
||||
self.store_reader.get(doc_id)
|
||||
}
|
||||
|
||||
/// Returns the segment id
|
||||
pub fn segment_id(&self) -> SegmentId {
|
||||
self.segment_id
|
||||
@@ -357,6 +372,24 @@ impl SegmentReader {
|
||||
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
|
||||
SegmentReaderAliveDocsIterator::new(&self)
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this segment.
|
||||
pub fn space_usage(&self) -> SegmentSpaceUsage {
|
||||
SegmentSpaceUsage::new(
|
||||
self.num_docs(),
|
||||
self.termdict_composite.space_usage(),
|
||||
self.postings_composite.space_usage(),
|
||||
self.positions_composite.space_usage(),
|
||||
self.positions_idx_composite.space_usage(),
|
||||
self.fast_fields_composite.space_usage(),
|
||||
self.fieldnorms_composite.space_usage(),
|
||||
self.get_store_reader().space_usage(),
|
||||
self.delete_bitset_opt
|
||||
.as_ref()
|
||||
.map(|x| x.space_usage())
|
||||
.unwrap_or(0),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentReader {
|
||||
@@ -376,7 +409,7 @@ pub struct SegmentReaderAliveDocsIterator<'a> {
|
||||
impl<'a> SegmentReaderAliveDocsIterator<'a> {
|
||||
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
|
||||
SegmentReaderAliveDocsIterator {
|
||||
reader: reader,
|
||||
reader,
|
||||
max_doc: reader.max_doc(),
|
||||
current: 0,
|
||||
}
|
||||
@@ -414,12 +447,12 @@ impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use core::Index;
|
||||
use schema::{SchemaBuilder, Term, STORED, TEXT};
|
||||
use schema::{Schema, Term, STORED, TEXT};
|
||||
use DocId;
|
||||
|
||||
#[test]
|
||||
fn test_alive_docs_iterator() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("name", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
@@ -17,7 +17,7 @@ use std::result;
/// - The [`RAMDirectory`](struct.RAMDirectory.html), which
/// should be used mostly for tests.
///
pub trait Directory: fmt::Debug + Send + Sync + 'static {
pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
    /// Opens a virtual file for read.
    ///
    /// Once a virtual file is open, its data may not

@@ -73,7 +73,19 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
    ///
    /// The file may or may not previously exist.
    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
}

/// DirectoryClone
pub trait DirectoryClone {
    /// Clones the directory and boxes the clone
    fn box_clone(&self) -> Box<Directory>;
}

impl<T> DirectoryClone for T
where
    T: 'static + Directory + Clone,
{
    fn box_clone(&self) -> Box<Directory> {
        Box::new(self.clone())
    }
}
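
The `DirectoryClone` trait above is the usual object-safe clone workaround: `Clone` itself cannot be required on a boxed trait object, so a `box_clone` method plus a blanket impl fills the gap. A minimal, self-contained sketch of the same pattern (the `Storage` names are made up for illustration, not tantivy's API):

```rust
use std::fmt;

trait Storage: StorageClone + fmt::Debug {
    fn read(&self, key: &str) -> Option<Vec<u8>>;
}

trait StorageClone {
    fn box_clone(&self) -> Box<dyn Storage>;
}

// Blanket impl: every concrete `Storage` that is `Clone` gets `box_clone` for free.
impl<T> StorageClone for T
where
    T: 'static + Storage + Clone,
{
    fn box_clone(&self) -> Box<dyn Storage> {
        Box::new(self.clone())
    }
}

// With that in place, a boxed trait object can itself be cloned.
impl Clone for Box<dyn Storage> {
    fn clone(&self) -> Box<dyn Storage> {
        self.box_clone()
    }
}

#[derive(Clone, Debug)]
struct InMemory(Vec<u8>);

impl Storage for InMemory {
    fn read(&self, _key: &str) -> Option<Vec<u8>> {
        Some(self.0.clone())
    }
}

fn main() {
    let boxed: Box<dyn Storage> = Box::new(InMemory(vec![1, 2, 3]));
    let copy = boxed.clone();
    assert_eq!(copy.read("any"), Some(vec![1, 2, 3]));
}
```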
|
||||
|
||||
@@ -1,7 +1,8 @@
use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use error::{ErrorKind, Result, ResultExt};
use error::DataCorruption;
use indexer::LockType;
use serde_json;
use std::collections::HashSet;
use std::io;

@@ -11,6 +12,18 @@ use std::result;
use std::sync::RwLockWriteGuard;
use std::sync::{Arc, RwLock};
use Directory;
use Result;

/// Returns true iff the file is "managed".
/// Non-managed files are not subject to garbage collection.
///
/// Filenames that start with a "." (typically locks)
/// are not managed.
fn is_managed(path: &Path) -> bool {
    path.to_str()
        .map(|p_str| !p_str.starts_with('.'))
        .unwrap_or(true)
}
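
A quick check of the rule documented above: dot-files (lock files) are excluded from management, and a path that is not valid UTF-8 is conservatively treated as managed. A standalone sketch reusing the same predicate:

```rust
use std::path::Path;

fn is_managed(path: &Path) -> bool {
    path.to_str()
        .map(|p_str| !p_str.starts_with('.'))
        .unwrap_or(true)
}

fn main() {
    assert!(is_managed(Path::new("meta.json")));
    assert!(is_managed(Path::new("segment.idx")));
    assert!(!is_managed(Path::new(".tantivy-meta.lock")));
}
```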
|
||||
|
||||
/// Wrapper of directories that keeps track of files created by Tantivy.
|
||||
///
|
||||
@@ -39,19 +52,24 @@ fn save_managed_paths(
|
||||
wlock: &RwLockWriteGuard<MetaInformation>,
|
||||
) -> io::Result<()> {
|
||||
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
|
||||
write!(&mut w, "\n")?;
|
||||
writeln!(&mut w)?;
|
||||
directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl ManagedDirectory {
|
||||
/// Wraps a directory as managed directory.
|
||||
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
|
||||
pub fn wrap<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
|
||||
match directory.atomic_read(&MANAGED_FILEPATH) {
|
||||
Ok(data) => {
|
||||
let managed_files_json = String::from_utf8_lossy(&data);
|
||||
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
|
||||
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
|
||||
.map_err(|e| {
|
||||
DataCorruption::new(
|
||||
MANAGED_FILEPATH.clone(),
|
||||
format!("Managed file cannot be deserialized: {:?}. ", e),
|
||||
)
|
||||
})?;
|
||||
Ok(ManagedDirectory {
|
||||
directory: Box::new(directory),
|
||||
meta_informations: Arc::new(RwLock::new(MetaInformation {
|
||||
@@ -81,25 +99,35 @@ impl ManagedDirectory {
|
||||
pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
|
||||
info!("Garbage collect");
|
||||
let mut files_to_delete = vec![];
|
||||
|
||||
// It is crucial to get the living files after acquiring the
|
||||
// read lock of meta informations. That way, we
|
||||
// avoid the following scenario.
|
||||
//
|
||||
// 1) we get the list of living files.
|
||||
// 2) someone creates a new file.
|
||||
// 3) we start garbage collection and remove this file
|
||||
// even though it is a living file.
|
||||
//
|
||||
// releasing the lock as .delete() will use it too.
|
||||
{
|
||||
// releasing the lock as .delete() will use it too.
|
||||
let meta_informations_rlock = self.meta_informations
|
||||
let meta_informations_rlock = self
|
||||
.meta_informations
|
||||
.read()
|
||||
.expect("Managed directory rlock poisoned in garbage collect.");
|
||||
|
||||
// It is crucial to get the living files after acquiring the
|
||||
// read lock of meta informations. That way, we
|
||||
// avoid the following scenario.
|
||||
//
|
||||
// 1) we get the list of living files.
|
||||
// 2) someone creates a new file.
|
||||
// 3) we start garbage collection and remove this file
|
||||
// even though it is a living file.
|
||||
let living_files = get_living_files();
|
||||
|
||||
for managed_path in &meta_informations_rlock.managed_paths {
|
||||
if !living_files.contains(managed_path) {
|
||||
files_to_delete.push(managed_path.clone());
|
||||
// The point of this second "file" lock is to enforce the following scenario
|
||||
// 1) process B tries to load a new set of searcher.
|
||||
// The list of segments is loaded
|
||||
// 2) writer change meta.json (for instance after a merge or a commit)
|
||||
// 3) gc kicks in.
|
||||
// 4) gc removes a file that was useful for process B, before process B opened it.
|
||||
if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) {
|
||||
let living_files = get_living_files();
|
||||
for managed_path in &meta_informations_rlock.managed_paths {
|
||||
if !living_files.contains(managed_path) {
|
||||
files_to_delete.push(managed_path.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
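
The ordering spelled out in the comments above matters: the meta lock is taken first, and only then is the set of living files computed, so a file registered between the snapshot and the deletion pass cannot be mistaken for garbage. A stripped-down sketch of that ordering, with hypothetical names and a plain `Mutex` standing in for tantivy's directory lock:

```rust
use std::collections::HashSet;
use std::sync::Mutex;

struct Gc {
    managed: Mutex<Vec<String>>,
}

impl Gc {
    fn garbage_collect<F: FnOnce() -> HashSet<String>>(&self, get_living_files: F) -> Vec<String> {
        // Take the lock first, *then* snapshot the living files while it is held.
        let managed = self.managed.lock().unwrap();
        let living_files = get_living_files();
        managed
            .iter()
            .filter(|path| !living_files.contains(*path))
            .cloned()
            .collect()
    }
}

fn main() {
    let gc = Gc {
        managed: Mutex::new(vec!["a.idx".to_string(), "b.idx".to_string()]),
    };
    // Only "a.idx" is still referenced, so "b.idx" is reported as garbage.
    let dead = gc.garbage_collect(|| ["a.idx".to_string()].into_iter().collect());
    assert_eq!(dead, vec!["b.idx".to_string()]);
}
```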
|
||||
@@ -133,7 +161,8 @@ impl ManagedDirectory {
|
||||
if !deleted_files.is_empty() {
|
||||
// update the list of managed files by removing
|
||||
// the file that were removed.
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
let mut meta_informations_wlock = self
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed directory wlock poisoned (2).");
|
||||
{
|
||||
@@ -155,8 +184,17 @@ impl ManagedDirectory {
|
||||
/// registering the filepath and creating the file
|
||||
/// will not lead to garbage files that will
|
||||
/// never get removed.
|
||||
///
|
||||
/// File starting by "." are reserved to locks.
|
||||
/// They are not managed and cannot be subjected
|
||||
/// to garbage collection.
|
||||
fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
|
||||
let mut meta_wlock = self.meta_informations
|
||||
// Files starting by "." (e.g. lock files) are not managed.
|
||||
if !is_managed(filepath) {
|
||||
return Ok(());
|
||||
}
|
||||
let mut meta_wlock = self
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
|
||||
@@ -194,10 +232,6 @@ impl Directory for ManagedDirectory {
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
self.directory.exists(path)
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for ManagedDirectory {
|
||||
@@ -231,7 +265,7 @@ mod tests {
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
{
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
@@ -257,7 +291,7 @@ mod tests {
|
||||
}
|
||||
{
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
@@ -281,7 +315,7 @@ mod tests {
|
||||
let living_files = HashSet::new();
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
|
||||
@@ -32,7 +32,8 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
|
||||
}
|
||||
})?;
|
||||
|
||||
let meta_data = file.metadata()
|
||||
let meta_data = file
|
||||
.metadata()
|
||||
.map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
|
||||
if meta_data.len() == 0 {
|
||||
// if the file size is 0, it will not be possible
|
||||
@@ -309,7 +310,8 @@ impl Directory for MmapDirectory {
|
||||
// when the last reference is gone.
|
||||
mmap_cache.cache.remove(&full_path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self.sync_directory()
|
||||
Ok(_) => self
|
||||
.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
@@ -352,10 +354,6 @@ impl Directory for MmapDirectory {
|
||||
meta_file.write(|f| f.write_all(data))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -366,6 +364,11 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_open_non_existant_path() {
|
||||
assert!(MmapDirectory::open(PathBuf::from("./nowhere")).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_open_empty() {
|
||||
// empty file is actually an edge case because those
|
||||
|
||||
@@ -18,7 +18,7 @@ pub mod error;
|
||||
|
||||
use std::io::{BufWriter, Seek, Write};
|
||||
|
||||
pub use self::directory::Directory;
|
||||
pub use self::directory::{Directory, DirectoryClone};
|
||||
pub use self::ram_directory::RAMDirectory;
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
|
||||
|
||||
@@ -170,10 +170,10 @@ impl Directory for RAMDirectory {
|
||||
let path_buf = PathBuf::from(path);
|
||||
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
|
||||
let exists = self.fs
|
||||
let exists = self
|
||||
.fs
|
||||
.write(path_buf.clone(), &Vec::new())
|
||||
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
|
||||
|
||||
// force the creation of the file to mimic the MMap directory.
|
||||
if exists {
|
||||
Err(OpenWriteError::FileAlreadyExists(path_buf))
|
||||
@@ -196,6 +196,10 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
msg.unwrap_or("Undefined".to_string())
|
||||
)));
|
||||
let path_buf = PathBuf::from(path);
|
||||
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
self.fs.write(path_buf, &Vec::new())?;
|
||||
@@ -203,8 +207,4 @@ impl Directory for RAMDirectory {
|
||||
vec_writer.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,6 @@ use fst::raw::MmapReadOnly;
|
||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
use std::ops::Deref;
|
||||
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
/// These read objects are only in charge to deliver
|
||||
|
||||
217 src/error.rs
@@ -4,135 +4,168 @@ use std::io;
|
||||
|
||||
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use fastfield::FastFieldNotAvailableError;
|
||||
use indexer::LockType;
|
||||
use query;
|
||||
use schema;
|
||||
use serde_json;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::PoisonError;
|
||||
|
||||
error_chain!(
|
||||
errors {
|
||||
/// Path does not exist.
|
||||
PathDoesNotExist(buf: PathBuf) {
|
||||
description("path does not exist")
|
||||
display("path does not exist: '{:?}'", buf)
|
||||
}
|
||||
/// File already exists, this is a problem when we try to write into a new file.
|
||||
FileAlreadyExists(buf: PathBuf) {
|
||||
description("file already exists")
|
||||
display("file already exists: '{:?}'", buf)
|
||||
}
|
||||
/// IO Error.
|
||||
IOError(err: IOError) {
|
||||
description("an IO error occurred")
|
||||
display("an IO error occurred: '{}'", err)
|
||||
}
|
||||
/// The data within is corrupted.
|
||||
///
|
||||
/// For instance, it contains invalid JSON.
|
||||
CorruptedFile(buf: PathBuf) {
|
||||
description("file contains corrupted data")
|
||||
display("file contains corrupted data: '{:?}'", buf)
|
||||
}
|
||||
/// A thread holding the locked panicked and poisoned the lock.
|
||||
Poisoned {
|
||||
description("a thread holding the locked panicked and poisoned the lock")
|
||||
}
|
||||
/// Invalid argument was passed by the user.
|
||||
InvalidArgument(arg: String) {
|
||||
description("an invalid argument was passed")
|
||||
display("an invalid argument was passed: '{}'", arg)
|
||||
}
|
||||
/// An Error happened in one of the thread.
|
||||
ErrorInThread(err: String) {
|
||||
description("an error occurred in a thread")
|
||||
display("an error occurred in a thread: '{}'", err)
|
||||
}
|
||||
/// An Error appeared related to the schema.
|
||||
SchemaError(message: String) {
|
||||
description("the schema is not matching expectations.")
|
||||
display("Schema error: '{}'", message)
|
||||
}
|
||||
/// Tried to access a fastfield reader for a field not configured accordingly.
|
||||
FastFieldError(err: FastFieldNotAvailableError) {
|
||||
description("fast field not available")
|
||||
display("fast field not available: '{:?}'", err)
|
||||
pub struct DataCorruption {
|
||||
filepath: Option<PathBuf>,
|
||||
comment: String,
|
||||
}
|
||||
|
||||
impl DataCorruption {
|
||||
pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
|
||||
DataCorruption {
|
||||
filepath: Some(filepath),
|
||||
comment,
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
impl From<FastFieldNotAvailableError> for Error {
|
||||
fn from(fastfield_error: FastFieldNotAvailableError) -> Error {
|
||||
ErrorKind::FastFieldError(fastfield_error).into()
|
||||
pub fn comment_only(comment: String) -> DataCorruption {
|
||||
DataCorruption {
|
||||
filepath: None,
|
||||
comment,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<IOError> for Error {
|
||||
fn from(io_error: IOError) -> Error {
|
||||
ErrorKind::IOError(io_error).into()
|
||||
impl fmt::Debug for DataCorruption {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
write!(f, "Data corruption: ")?;
|
||||
if let Some(ref filepath) = &self.filepath {
|
||||
write!(f, "(in file `{:?}`)", filepath)?;
|
||||
}
|
||||
write!(f, ": {}.", self.comment)?;
|
||||
Ok(())
|
||||
}
|
||||
}
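
The `Debug` impl above builds the message from an optional file path plus a free-form comment. A tiny standalone sketch of the same idea (the `Corruption` name is made up) to show what the rendered error looks like:

```rust
use std::fmt;
use std::path::PathBuf;

struct Corruption {
    filepath: Option<PathBuf>,
    comment: String,
}

impl fmt::Debug for Corruption {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Data corruption")?;
        if let Some(ref filepath) = self.filepath {
            write!(f, " (in file `{:?}`)", filepath)?;
        }
        write!(f, ": {}.", self.comment)
    }
}

fn main() {
    let err = Corruption {
        filepath: Some(PathBuf::from("managed.json")),
        comment: "Managed file cannot be deserialized".to_string(),
    };
    // Data corruption (in file `"managed.json"`): Managed file cannot be deserialized.
    println!("{:?}", err);
}
```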
|
||||
|
||||
impl From<io::Error> for Error {
|
||||
fn from(io_error: io::Error) -> Error {
|
||||
ErrorKind::IOError(io_error.into()).into()
|
||||
/// The library's failure based error enum
|
||||
#[derive(Debug, Fail)]
|
||||
pub enum TantivyError {
|
||||
/// Path does not exist.
|
||||
#[fail(display = "Path does not exist: '{:?}'", _0)]
|
||||
PathDoesNotExist(PathBuf),
|
||||
/// File already exists, this is a problem when we try to write into a new file.
|
||||
#[fail(display = "File already exists: '{:?}'", _0)]
|
||||
FileAlreadyExists(PathBuf),
|
||||
/// Index already exists in this directory
|
||||
#[fail(display = "Index already exists")]
|
||||
IndexAlreadyExists,
|
||||
/// Failed to acquire file lock
|
||||
#[fail(
|
||||
display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
|
||||
_0
|
||||
)]
|
||||
LockFailure(LockType),
|
||||
/// IO Error.
|
||||
#[fail(display = "An IO error occurred: '{}'", _0)]
|
||||
IOError(#[cause] IOError),
|
||||
/// Data corruption.
|
||||
#[fail(display = "{:?}", _0)]
|
||||
DataCorruption(DataCorruption),
|
||||
/// A thread holding the locked panicked and poisoned the lock.
|
||||
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
|
||||
Poisoned,
|
||||
/// Invalid argument was passed by the user.
|
||||
#[fail(display = "An invalid argument was passed: '{}'", _0)]
|
||||
InvalidArgument(String),
|
||||
/// An Error happened in one of the thread.
|
||||
#[fail(display = "An error occurred in a thread: '{}'", _0)]
|
||||
ErrorInThread(String),
|
||||
/// An Error appeared related to the schema.
|
||||
#[fail(display = "Schema error: '{}'", _0)]
|
||||
SchemaError(String),
|
||||
/// Tried to access a fastfield reader for a field not configured accordingly.
|
||||
#[fail(display = "Fast field not available: '{:?}'", _0)]
|
||||
FastFieldError(#[cause] FastFieldNotAvailableError),
|
||||
/// System error. (e.g.: We failed spawning a new thread)
|
||||
#[fail(display = "System error.'{}'", _0)]
|
||||
SystemError(String),
|
||||
}
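
A minimal sketch of the failure-style pattern the diff moves to, assuming the `failure` crate (0.1) with its derive feature; `SketchError` is a made-up name and only two variants are shown:

```rust
#[macro_use]
extern crate failure;

use std::io;

#[derive(Debug, Fail)]
pub enum SketchError {
    #[fail(display = "An IO error occurred: '{}'", _0)]
    Io(#[cause] io::Error),
    #[fail(display = "An invalid argument was passed: '{}'", _0)]
    InvalidArgument(String),
}

impl From<io::Error> for SketchError {
    fn from(err: io::Error) -> SketchError {
        SketchError::Io(err)
    }
}

fn main() {
    let err: SketchError = io::Error::new(io::ErrorKind::Other, "disk full").into();
    // Display is generated from the #[fail(display = ...)] attributes.
    println!("{}", err);
    let _ = SketchError::InvalidArgument("bad query".to_string());
}
```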
|
||||
|
||||
impl From<DataCorruption> for TantivyError {
|
||||
fn from(data_corruption: DataCorruption) -> TantivyError {
|
||||
TantivyError::DataCorruption(data_corruption)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<query::QueryParserError> for Error {
|
||||
fn from(parsing_error: query::QueryParserError) -> Error {
|
||||
ErrorKind::InvalidArgument(format!("Query is invalid. {:?}", parsing_error)).into()
|
||||
impl From<FastFieldNotAvailableError> for TantivyError {
|
||||
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
|
||||
TantivyError::FastFieldError(fastfield_error)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Guard> From<PoisonError<Guard>> for Error {
|
||||
fn from(_: PoisonError<Guard>) -> Error {
|
||||
ErrorKind::Poisoned.into()
|
||||
impl From<IOError> for TantivyError {
|
||||
fn from(io_error: IOError) -> TantivyError {
|
||||
TantivyError::IOError(io_error)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OpenReadError> for Error {
|
||||
fn from(error: OpenReadError) -> Error {
|
||||
impl From<io::Error> for TantivyError {
|
||||
fn from(io_error: io::Error) -> TantivyError {
|
||||
TantivyError::IOError(io_error.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl From<query::QueryParserError> for TantivyError {
|
||||
fn from(parsing_error: query::QueryParserError) -> TantivyError {
|
||||
TantivyError::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
|
||||
}
|
||||
}
|
||||
|
||||
impl<Guard> From<PoisonError<Guard>> for TantivyError {
|
||||
fn from(_: PoisonError<Guard>) -> TantivyError {
|
||||
TantivyError::Poisoned
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OpenReadError> for TantivyError {
|
||||
fn from(error: OpenReadError) -> TantivyError {
|
||||
match error {
|
||||
OpenReadError::FileDoesNotExist(filepath) => {
|
||||
ErrorKind::PathDoesNotExist(filepath).into()
|
||||
OpenReadError::FileDoesNotExist(filepath) => TantivyError::PathDoesNotExist(filepath),
|
||||
OpenReadError::IOError(io_error) => TantivyError::IOError(io_error),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<schema::DocParsingError> for TantivyError {
|
||||
fn from(error: schema::DocParsingError) -> TantivyError {
|
||||
TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error))
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OpenWriteError> for TantivyError {
|
||||
fn from(error: OpenWriteError) -> TantivyError {
|
||||
match error {
|
||||
OpenWriteError::FileAlreadyExists(filepath) => {
|
||||
TantivyError::FileAlreadyExists(filepath)
|
||||
}
|
||||
OpenReadError::IOError(io_error) => ErrorKind::IOError(io_error).into(),
|
||||
OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<schema::DocParsingError> for Error {
|
||||
fn from(error: schema::DocParsingError) -> Error {
|
||||
ErrorKind::InvalidArgument(format!("Failed to parse document {:?}", error)).into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OpenWriteError> for Error {
|
||||
fn from(error: OpenWriteError) -> Error {
|
||||
match error {
|
||||
OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath),
|
||||
OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error),
|
||||
}.into()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OpenDirectoryError> for Error {
|
||||
fn from(error: OpenDirectoryError) -> Error {
|
||||
impl From<OpenDirectoryError> for TantivyError {
|
||||
fn from(error: OpenDirectoryError) -> TantivyError {
|
||||
match error {
|
||||
OpenDirectoryError::DoesNotExist(directory_path) => {
|
||||
ErrorKind::PathDoesNotExist(directory_path).into()
|
||||
TantivyError::PathDoesNotExist(directory_path)
|
||||
}
|
||||
OpenDirectoryError::NotADirectory(directory_path) => {
|
||||
TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
|
||||
}
|
||||
OpenDirectoryError::NotADirectory(directory_path) => ErrorKind::InvalidArgument(
|
||||
format!("{:?} is not a directory", directory_path),
|
||||
).into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for Error {
|
||||
fn from(error: serde_json::Error) -> Error {
|
||||
impl From<serde_json::Error> for TantivyError {
|
||||
fn from(error: serde_json::Error) -> TantivyError {
|
||||
let io_err = io::Error::from(error);
|
||||
ErrorKind::IOError(io_err.into()).into()
|
||||
TantivyError::IOError(io_err.into())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,12 +6,12 @@ pub use self::writer::BytesFastFieldWriter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Schema;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_bytes() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_bytes_field("bytesfield");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
@@ -51,7 +51,7 @@ impl BytesFastFieldWriter {
|
||||
self.next_doc();
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field() == self.field {
|
||||
if let &Value::Bytes(ref bytes) = field_value.value() {
|
||||
if let Value::Bytes(ref bytes) = *field_value.value() {
|
||||
self.vals.extend_from_slice(bytes);
|
||||
} else {
|
||||
panic!(
|
||||
|
||||
@@ -2,6 +2,7 @@ use bit_set::BitSet;
|
||||
use common::HasLen;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use space_usage::ByteCount;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use DocId;
|
||||
@@ -41,7 +42,8 @@ pub struct DeleteBitSet {
|
||||
impl DeleteBitSet {
|
||||
/// Opens a delete bitset given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> DeleteBitSet {
|
||||
let num_deleted: usize = data.as_slice()
|
||||
let num_deleted: usize = data
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|b| b.count_ones() as usize)
|
||||
.sum();
|
||||
@@ -62,6 +64,11 @@ impl DeleteBitSet {
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this bitset.
|
||||
pub fn space_usage(&self) -> ByteCount {
|
||||
self.data.len()
|
||||
}
|
||||
}
|
||||
|
||||
impl HasLen for DeleteBitSet {
|
||||
|
||||
@@ -4,7 +4,8 @@ use std::result;
|
||||
/// `FastFieldNotAvailableError` is returned when the
|
||||
/// user requested for a fast field reader, and the field was not
|
||||
/// defined in the schema as a fast field.
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Fail)]
|
||||
#[fail(display = "field not available: '{:?}'", field_name)]
|
||||
pub struct FastFieldNotAvailableError {
|
||||
field_name: String,
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use super::MultiValueIntFastFieldReader;
|
||||
use schema::Facet;
|
||||
use std::str;
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
@@ -20,6 +21,7 @@ use DocId;
|
||||
pub struct FacetReader {
|
||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||
term_dict: TermDictionary,
|
||||
buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
impl FacetReader {
|
||||
@@ -37,6 +39,7 @@ impl FacetReader {
|
||||
FacetReader {
|
||||
term_ords,
|
||||
term_dict,
|
||||
buffer: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,10 +58,18 @@ impl FacetReader {
|
||||
}
|
||||
|
||||
    /// Given a term ordinal returns the term associated to it.
    pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
        let found_term = self.term_dict
            .ord_to_term(facet_ord as u64, output.inner_buffer_mut());
    pub fn facet_from_ord(
        &mut self,
        facet_ord: TermOrdinal,
        output: &mut Facet,
    ) -> Result<(), str::Utf8Error> {
        let found_term = self
            .term_dict
            .ord_to_term(facet_ord as u64, &mut self.buffer);
        assert!(found_term, "Term ordinal {} not found.", facet_ord);
        let facet_str = str::from_utf8(&self.buffer[..])?;
        output.set_facet_str(facet_str);
        Ok(())
    }
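
The new signature reuses an internal byte buffer and surfaces the UTF-8 check to the caller instead of writing into the `Facet` directly. A minimal sketch of that buffer-reuse idiom; `term_for_ord` is a hypothetical stand-in for `term_dict.ord_to_term`:

```rust
use std::str;

// Decode the bytes for an ordinal into a reusable buffer, then validate UTF-8
// before handing the string back to the caller.
fn term_for_ord(ord: u64, buffer: &mut Vec<u8>) -> Result<&str, str::Utf8Error> {
    buffer.clear();
    buffer.extend_from_slice(format!("/category/cat{}", ord).as_bytes());
    str::from_utf8(&buffer[..])
}

fn main() {
    let mut buffer = Vec::new();
    let facet = term_for_ord(2, &mut buffer).expect("facet terms are valid UTF-8");
    assert_eq!(facet, "/category/cat2");
}
```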
|
||||
|
||||
/// Return the list of facet ordinals associated to a document.
|
||||
|
||||
@@ -127,19 +127,19 @@ mod tests {
|
||||
use common::CompositeFile;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::FastFieldReader;
|
||||
use rand::Rng;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::SeedableRng;
|
||||
use rand::XorShiftRng;
|
||||
use schema::Document;
|
||||
use schema::Field;
|
||||
use schema::Schema;
|
||||
use schema::FAST;
|
||||
use schema::{Schema, SchemaBuilder};
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
lazy_static! {
|
||||
pub static ref SCHEMA: Schema = {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_u64_field("field", FAST);
|
||||
schema_builder.build()
|
||||
};
|
||||
@@ -298,7 +298,7 @@ mod tests {
|
||||
fn test_signed_intfastfield() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
@@ -342,7 +342,7 @@ mod tests {
|
||||
fn test_signed_intfastfield_default_val() {
|
||||
let path = Path::new("test");
|
||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
@@ -367,11 +367,10 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
// Warning: this generates the same permutation at each call
|
||||
pub fn generate_permutation() -> Vec<u64> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, 4];
|
||||
let mut rng = XorShiftRng::from_seed(*seed);
|
||||
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
|
||||
rng.shuffle(&mut permutation);
|
||||
let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
|
||||
permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
|
||||
permutation
|
||||
}
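
The permutation helper above reflects the rand 0.6-era API the diff migrates to: `shuffle` now lives on the `SliceRandom` extension trait and `StdRng` is seeded from a `[u8; 32]`. A standalone sketch of that usage:

```rust
extern crate rand;

use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::SeedableRng;

fn main() {
    let mut permutation: Vec<u64> = (0u64..10u64).collect();
    // Same seed, same order on every run.
    permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
    println!("{:?}", permutation);
}
```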
|
||||
|
||||
|
||||
@@ -9,12 +9,12 @@ mod tests {
|
||||
|
||||
use schema::Cardinality;
|
||||
use schema::IntOptions;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Schema;
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_u64() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field(
|
||||
"multifield",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
@@ -49,7 +49,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_i64() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_i64_field(
|
||||
"multifield",
|
||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||
|
||||
@@ -47,11 +47,11 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
mod tests {
|
||||
|
||||
use core::Index;
|
||||
use schema::{Document, Facet, SchemaBuilder};
|
||||
use schema::{Document, Facet, Schema};
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facets");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -82,27 +82,27 @@ mod tests {
|
||||
|
||||
let mut facet = Facet::root();
|
||||
{
|
||||
facet_reader.facet_from_ord(1, &mut facet);
|
||||
facet_reader.facet_from_ord(1, &mut facet).unwrap();
|
||||
assert_eq!(facet, Facet::from("/category"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(2, &mut facet);
|
||||
facet_reader.facet_from_ord(2, &mut facet).unwrap();
|
||||
assert_eq!(facet, Facet::from("/category/cat1"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(3, &mut facet);
|
||||
facet_reader.facet_from_ord(3, &mut facet).unwrap();
|
||||
assert_eq!(format!("{}", facet), "/category/cat2");
|
||||
assert_eq!(facet, Facet::from("/category/cat2"));
|
||||
}
|
||||
{
|
||||
facet_reader.facet_from_ord(4, &mut facet);
|
||||
facet_reader.facet_from_ord(4, &mut facet).unwrap();
|
||||
assert_eq!(facet, Facet::from("/category/cat3"));
|
||||
}
|
||||
|
||||
let mut vals = Vec::new();
|
||||
{
|
||||
facet_reader.facet_ords(0, &mut vals);
|
||||
assert_eq!(&vals[..], &[3, 2]);
|
||||
assert_eq!(&vals[..], &[2, 3]);
|
||||
}
|
||||
{
|
||||
facet_reader.facet_ords(1, &mut vals);
|
||||
|
||||
@@ -90,10 +90,10 @@ impl MultiValueIntFastFieldWriter {
|
||||
|
||||
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
|
||||
///
|
||||
/// HashMap makes it possible to remap them before serializing.
|
||||
/// Specifically, string terms are first stored in the writer as their
|
||||
/// position in the `IndexWriter`'s `HashMap`. This value is called
|
||||
/// an `UnorderedTermId`.
|
||||
/// If a mapping is given, the values are remapped *and sorted* before serialization.
|
||||
/// This is used when serializing `facets`. Specifically their terms are
|
||||
/// first stored in the writer as their position in the `IndexWriter`'s `HashMap`.
|
||||
/// This value is called an `UnorderedTermId`.
|
||||
///
|
||||
/// During the serialization of the segment, terms gets sorted and
|
||||
/// `tantivy` builds a mapping to convert this `UnorderedTermId` into
|
||||
@@ -125,9 +125,29 @@ impl MultiValueIntFastFieldWriter {
|
||||
mapping.len() as u64,
|
||||
1,
|
||||
)?;
|
||||
for val in &self.vals {
|
||||
let remapped_val = *mapping.get(val).expect("Missing term ordinal");
|
||||
value_serializer.add_val(remapped_val)?;
|
||||
|
||||
let last_interval = (
|
||||
self.doc_index.last().cloned().unwrap(),
|
||||
self.vals.len() as u64,
|
||||
);
|
||||
|
||||
let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
|
||||
for (start, stop) in self
|
||||
.doc_index
|
||||
.windows(2)
|
||||
.map(|interval| (interval[0], interval[1]))
|
||||
.chain(Some(last_interval).into_iter())
|
||||
.map(|(start, stop)| (start as usize, stop as usize))
|
||||
{
|
||||
doc_vals.clear();
|
||||
let remapped_vals = self.vals[start..stop]
|
||||
.iter()
|
||||
.map(|val| *mapping.get(val).expect("Missing term ordinal"));
|
||||
doc_vals.extend(remapped_vals);
|
||||
doc_vals.sort();
|
||||
for &val in &doc_vals {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
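
The new serialization path above walks `doc_index` with `windows(2)` to recover each document's slice of the flat `vals` array, remaps every `UnorderedTermId` through the ordinal mapping, and sorts the values per document before writing them. A small self-contained sketch of that interval walk, using plain vectors instead of tantivy's writer types:

```rust
use std::collections::HashMap;

fn main() {
    // Flat multi-value layout: all values back to back, plus the start offset of
    // each document; doc i owns vals[doc_index[i]..doc_index[i + 1]].
    let vals: Vec<u64> = vec![7, 3, 3, 9];
    let doc_index: Vec<u64> = vec![0, 2]; // doc 0 -> [7, 3], doc 1 -> [3, 9]
    let mapping: HashMap<u64, u64> = vec![(7, 1), (3, 0), (9, 2)].into_iter().collect();

    let last_interval = (*doc_index.last().unwrap(), vals.len() as u64);
    let mut per_doc: Vec<Vec<u64>> = Vec::new();
    for (start, stop) in doc_index
        .windows(2)
        .map(|w| (w[0], w[1]))
        .chain(Some(last_interval))
        .map(|(start, stop)| (start as usize, stop as usize))
    {
        // Remap each unordered id to its ordered term ordinal, then sort per doc.
        let mut doc_vals: Vec<u64> = vals[start..stop].iter().map(|v| mapping[v]).collect();
        doc_vals.sort();
        per_doc.push(doc_vals);
    }
    assert_eq!(per_doc, vec![vec![0, 1], vec![0, 2]]);
}
```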
|
||||
None => {
|
||||
|
||||
@@ -7,11 +7,10 @@ use directory::ReadOnlySource;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use owning_ref::OwningRef;
|
||||
use schema::SchemaBuilder;
|
||||
use schema::Schema;
|
||||
use schema::FAST;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::mem;
|
||||
use std::path::Path;
|
||||
use DocId;
|
||||
|
||||
@@ -80,7 +79,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
// TODO change start to `u64`.
|
||||
// For multifastfield, start is an index in a second fastfield, not a `DocId`
|
||||
pub fn get_range(&self, start: u32, output: &mut [Item]) {
|
||||
let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; // ok: Item is either `u64` or `i64`
|
||||
// ok: Item is either `u64` or `i64`
|
||||
let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) };
|
||||
self.bit_unpacker.get_range(start, output_u64);
|
||||
for out in output_u64.iter_mut() {
|
||||
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
|
||||
@@ -108,7 +108,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
|
||||
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||
fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let path = Path::new("__dummy__");
|
||||
|
||||
@@ -10,27 +10,28 @@ pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
.unwrap_or_else(|idx| idx - 1) as u8
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::unreadable_literal))]
|
||||
pub const FIELD_NORMS_TABLE: [u32; 256] = [
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
|
||||
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54, 56, 60,
|
||||
64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144, 152, 168, 184, 200, 216, 232,
|
||||
248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, 1048,
|
||||
1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
|
||||
4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, 14360, 15384,
|
||||
16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, 45080, 49176,
|
||||
53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
|
||||
163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472, 393240, 426008,
|
||||
458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064, 1048600,
|
||||
1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320, 2621464,
|
||||
2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192, 6291480,
|
||||
6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512, 14680088,
|
||||
15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152, 31457304,
|
||||
33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584, 67108888,
|
||||
75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968,
|
||||
167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912,
|
||||
335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800,
|
||||
671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576,
|
||||
1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944,
|
||||
248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984,
|
||||
1_048, 1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608,
|
||||
3864, 4120, 4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336,
|
||||
14360, 15384, 16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984,
|
||||
45080, 49176, 53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904,
|
||||
131096, 147480, 163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472,
|
||||
393240, 426008, 458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064,
|
||||
1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320,
|
||||
2621464, 2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192,
|
||||
6291480, 6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512,
|
||||
14680088, 15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152,
|
||||
31457304, 33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584,
|
||||
67108888, 75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752,
|
||||
150994968, 167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480,
|
||||
301989912, 335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936,
|
||||
603979800, 671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848,
|
||||
1207959576, 1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944,
|
||||
];
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -15,7 +15,7 @@
//! precompute computationally expensive functions of the fieldnorm
//! in a very short array.
//!
//! This trick is used by the [BM25 similarity]().
//! This trick is used by the BM25 similarity.
mod code;
mod reader;
mod serializer;
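
The `code` module compresses a fieldnorm into one byte by snapping it down to the nearest entry of a 256-value table, so per-fieldnorm factors (such as the BM25 length norm) can be precomputed in a 256-slot array. A toy version with a 4-entry table shows the roundtrip:

```rust
// Toy stand-in for tantivy's 256-entry FIELD_NORMS_TABLE.
const TABLE: [u32; 4] = [0, 1, 2, 4];

// Compress: map an exact fieldnorm to the id of the largest table entry <= fieldnorm.
fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
    TABLE
        .binary_search(&fieldnorm)
        .unwrap_or_else(|idx| idx - 1) as u8
}

// Decompress: a single array lookup, cheap enough to precompute per-id weights.
fn id_to_fieldnorm(id: u8) -> u32 {
    TABLE[id as usize]
}

fn main() {
    assert_eq!(fieldnorm_to_id(3), 2); // 3 is approximated down to 2
    assert_eq!(id_to_fieldnorm(fieldnorm_to_id(4)), 4);
}
```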
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use rand::thread_rng;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
use rand::Rng;
|
||||
use schema::*;
|
||||
use Index;
|
||||
use Searcher;
|
||||
@@ -15,7 +15,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
#[ignore]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
|
||||
@@ -23,7 +23,6 @@ fn test_indexing() {
|
||||
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
|
||||
let universe = Range::new(0u64, 20u64);
|
||||
let mut rng = thread_rng();
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
|
||||
@@ -32,7 +31,7 @@ fn test_indexing() {
|
||||
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
|
||||
|
||||
for _ in 0..200 {
|
||||
let random_val = universe.ind_sample(&mut rng);
|
||||
let random_val = rng.gen_range(0, 20);
|
||||
if random_val == 0 {
|
||||
index_writer.commit().expect("Commit failed");
|
||||
committed_docs.extend(&uncommitted_docs);
|
||||
|
||||
@@ -52,7 +52,8 @@ impl DeleteQueue {
|
||||
//
|
||||
// Past delete operations are not accessible.
|
||||
pub fn cursor(&self) -> DeleteCursor {
|
||||
let last_block = self.inner
|
||||
let last_block = self
|
||||
.inner
|
||||
.read()
|
||||
.expect("Read lock poisoned when opening delete queue cursor")
|
||||
.last_block
|
||||
@@ -92,7 +93,8 @@ impl DeleteQueue {
|
||||
// be some unflushed operations.
|
||||
//
|
||||
fn flush(&self) -> Option<Arc<Block>> {
|
||||
let mut self_wlock = self.inner
|
||||
let mut self_wlock = self
|
||||
.inner
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on delete queue writer");
|
||||
|
||||
@@ -132,7 +134,8 @@ impl From<DeleteQueue> for NextBlock {
|
||||
impl NextBlock {
|
||||
fn next_block(&self) -> Option<Arc<Block>> {
|
||||
{
|
||||
let next_read_lock = self.0
|
||||
let next_read_lock = self
|
||||
.0
|
||||
.read()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
if let InnerNextBlock::Closed(ref block) = *next_read_lock {
|
||||
@@ -141,7 +144,8 @@ impl NextBlock {
|
||||
}
|
||||
let next_block;
|
||||
{
|
||||
let mut next_write_lock = self.0
|
||||
let mut next_write_lock = self
|
||||
.0
|
||||
.write()
|
||||
.expect("Failed to acquire write lock in delete queue");
|
||||
match *next_write_lock {
|
||||
@@ -182,19 +186,18 @@ impl DeleteCursor {
|
||||
/// `opstamp >= target_opstamp`.
|
||||
pub fn skip_to(&mut self, target_opstamp: u64) {
|
||||
// TODO Can be optimize as we work with block.
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))]
|
||||
loop {
|
||||
if let Some(operation) = self.get() {
|
||||
if operation.opstamp >= target_opstamp {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
while self.is_behind_opstamp(target_opstamp) {
|
||||
self.advance();
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
|
||||
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
|
||||
self.get()
|
||||
.map(|operation| operation.opstamp < target_opstamp)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// If the current block has been entirely
|
||||
/// consumed, try to load the next one.
|
||||
///
|
||||
|
||||
@@ -1,26 +1,130 @@
use core::LOCKFILE_FILEPATH;
use directory::error::OpenWriteError;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;
use Directory;
use TantivyError;

/// The directory lock is a mechanism used to
/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
///
/// Only one lock can exist at a time for a given directory.
/// The lock is released automatically on `Drop`.
pub struct DirectoryLock {
    directory: Box<Directory>,
#[derive(Debug, Clone, Copy)]
pub enum LockType {
    /// Only one process should be able to write tantivy's index at a time.
    /// This lock file, when present, is in charge of preventing other processes from opening an IndexWriter.
    ///
    /// If the process is killed and this file remains, it is safe to remove it manually.
    ///
    /// Failing to acquire this lock usually means a misuse of tantivy's API
    /// (creating more than one instance of the `IndexWriter`), or a spurious
    /// lock file remaining after a crash. In the latter case, removing the file after
    /// checking that no process running tantivy is still alive is safe.
    IndexWriterLock,
    /// The meta lock file is here to protect the segment files being opened by
    /// `.load_searchers()` from being garbage collected.
    /// It makes it possible for another process to safely consume
    /// our index in-writing. Ideally, we may have preferred `RwLock` semantics
    /// here, but it is difficult to achieve on Windows.
    ///
    /// Opening segment readers is a very fast process.
    /// Right now if the lock cannot be acquired on the first attempt, the logic
    /// is very simplistic. We retry after `100ms` until we effectively
    /// acquire the lock.
    /// This lock should not have much contention in normal usage.
    MetaLock,
}
|
||||
|
||||
impl DirectoryLock {
|
||||
pub fn lock(mut directory: Box<Directory>) -> Result<DirectoryLock, OpenWriteError> {
|
||||
directory.open_write(&*LOCKFILE_FILEPATH)?;
|
||||
Ok(DirectoryLock { directory })
|
||||
/// Retry the logic of acquiring locks is pretty simple.
|
||||
/// We just retry `n` times after a given `duratio`, both
|
||||
/// depending on the type of lock.
|
||||
struct RetryPolicy {
|
||||
num_retries: usize,
|
||||
wait_in_ms: u64,
|
||||
}
|
||||
|
||||
impl RetryPolicy {
|
||||
fn no_retry() -> RetryPolicy {
|
||||
RetryPolicy {
|
||||
num_retries: 0,
|
||||
wait_in_ms: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn wait_and_retry(&mut self) -> bool {
|
||||
if self.num_retries == 0 {
|
||||
false
|
||||
} else {
|
||||
self.num_retries -= 1;
|
||||
let wait_duration = Duration::from_millis(self.wait_in_ms);
|
||||
thread::sleep(wait_duration);
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LockType {
|
||||
fn retry_policy(self) -> RetryPolicy {
|
||||
match self {
|
||||
LockType::IndexWriterLock => RetryPolicy::no_retry(),
|
||||
LockType::MetaLock => RetryPolicy {
|
||||
num_retries: 100,
|
||||
wait_in_ms: 100,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn try_acquire_lock(self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
|
||||
let path = self.filename();
|
||||
let mut write = directory.open_write(path).map_err(|e| match e {
|
||||
OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(self),
|
||||
OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
|
||||
})?;
|
||||
write.flush()?;
|
||||
Ok(DirectoryLock {
|
||||
directory: directory.box_clone(),
|
||||
path: path.to_owned(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Acquire a lock in the given directory.
|
||||
pub fn acquire_lock(self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
|
||||
let mut box_directory = directory.box_clone();
|
||||
let mut retry_policy = self.retry_policy();
|
||||
loop {
|
||||
let lock_result = self.try_acquire_lock(&mut *box_directory);
|
||||
match lock_result {
|
||||
Ok(result) => {
|
||||
return Ok(result);
|
||||
}
|
||||
Err(TantivyError::LockFailure(ref filepath)) => {
|
||||
if !retry_policy.wait_and_retry() {
|
||||
return Err(TantivyError::LockFailure(filepath.to_owned()));
|
||||
}
|
||||
}
|
||||
Err(_) => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn filename(&self) -> &Path {
|
||||
match *self {
|
||||
LockType::MetaLock => Path::new(".tantivy-meta.lock"),
|
||||
LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"),
|
||||
}
|
||||
}
|
||||
}
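
`acquire_lock` above spins on `try_acquire_lock`, consulting the lock type's retry policy between attempts: no retry for the index-writer lock, up to 100 retries spaced 100ms apart for the meta lock. A compact sketch of that loop, with a hypothetical `try_lock` closure in place of the real file creation:

```rust
use std::thread;
use std::time::Duration;

struct RetryPolicy {
    num_retries: usize,
    wait_in_ms: u64,
}

impl RetryPolicy {
    fn wait_and_retry(&mut self) -> bool {
        if self.num_retries == 0 {
            false
        } else {
            self.num_retries -= 1;
            thread::sleep(Duration::from_millis(self.wait_in_ms));
            true
        }
    }
}

// Keep trying `try_lock` until it succeeds or the policy gives up.
fn acquire<F: FnMut() -> bool>(mut try_lock: F, mut policy: RetryPolicy) -> bool {
    loop {
        if try_lock() {
            return true;
        }
        if !policy.wait_and_retry() {
            return false;
        }
    }
}

fn main() {
    let mut attempts = 0;
    let acquired = acquire(
        || {
            attempts += 1;
            attempts >= 3 // the stale lock file disappears on the third attempt
        },
        RetryPolicy { num_retries: 100, wait_in_ms: 1 },
    );
    assert!(acquired);
    assert_eq!(attempts, 3);
}
```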
|
||||
|
||||
/// The `DirectoryLock` is an object that represents a file lock.
|
||||
/// See [`LockType`](struct.LockType.html)
|
||||
///
|
||||
/// It is transparently associated to a lock file, that gets deleted
|
||||
/// on `Drop.` The lock is release automatically on `Drop`.
|
||||
pub struct DirectoryLock {
|
||||
directory: Box<Directory>,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl Drop for DirectoryLock {
|
||||
fn drop(&mut self) {
|
||||
if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) {
|
||||
if let Err(e) = self.directory.delete(&*self.path) {
|
||||
error!("Failed to remove the lock file. {:?}", e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,15 +2,15 @@ use super::operation::AddOperation;
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use bit_set::BitSet;
|
||||
use chan;
|
||||
use core::Index;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use crossbeam::channel;
|
||||
use docset::DocSet;
|
||||
use error::{Error, ErrorKind, Result, ResultExt};
|
||||
use error::TantivyError;
|
||||
use fastfield::write_delete_bitset;
|
||||
use futures::sync::oneshot::Receiver;
|
||||
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
@@ -29,6 +29,7 @@ use std::mem;
|
||||
use std::mem::swap;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use Result;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
// in the heap goes below MARGIN_IN_BYTES.
|
||||
@@ -42,8 +43,8 @@ pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
|
||||
type DocumentSender = chan::Sender<AddOperation>;
|
||||
type DocumentReceiver = chan::Receiver<AddOperation>;
|
||||
type DocumentSender = channel::Sender<AddOperation>;
|
||||
type DocumentReceiver = channel::Receiver<AddOperation>;
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
@@ -53,13 +54,14 @@ type DocumentReceiver = chan::Receiver<AddOperation>;
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
    let table_size_limit: usize = per_thread_memory_budget / 3;
    (1..)
        .into_iter()
        .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
        .last()
        .expect(&format!(
            "Per thread memory is too small: {}",
            per_thread_memory_budget
        ))
        .unwrap_or_else(|| {
            panic!(
                "Per thread memory is too small: {}",
                per_thread_memory_budget
            )
        })
        .min(19) // we cap it at 512K
}
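
`initial_table_size` gives the per-thread hash table a third of the thread's memory budget: it picks the largest number of bits whose table cost still fits under that third, and caps the result at 19 bits (512K slots). A worked sketch with an assumed cost of roughly 40 bytes per slot; `table_cost` only stands in for the real `compute_table_size`, which may differ:

```rust
// Assumed cost model: ~40 bytes per hash-table slot (illustrative only).
fn table_cost(num_bits: usize) -> usize {
    (1 << num_bits) * 40
}

fn initial_table_size(per_thread_memory_budget: usize) -> usize {
    let table_size_limit = per_thread_memory_budget / 3; // a third of the budget
    (1..)
        .take_while(|&num_bits| table_cost(num_bits) < table_size_limit)
        .last()
        .unwrap_or_else(|| panic!("Per thread memory is too small: {}", per_thread_memory_budget))
        .min(19) // never more than 2^19 (512K) slots
}

fn main() {
    // 60MB per thread -> 20MB for the table; 2^19 slots would cost ~21MB, so 18 bits.
    assert_eq!(initial_table_size(60_000_000), 18);
}
```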
|
||||
|
||||
@@ -122,14 +124,14 @@ pub fn open_index_writer(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_MIN
|
||||
);
|
||||
bail!(ErrorKind::InvalidArgument(err_msg));
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
|
||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||
bail!(ErrorKind::InvalidArgument(err_msg));
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
@@ -138,7 +140,7 @@ pub fn open_index_writer(
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
|
||||
let segment_updater =
|
||||
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||
|
||||
let mut index_writer = IndexWriter {
|
||||
_directory_lock: Some(directory_lock),
|
||||
@@ -176,7 +178,7 @@ pub fn compute_deleted_bitset(
|
||||
) -> Result<bool> {
|
||||
let mut might_have_changed = false;
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))]
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))]
|
||||
loop {
|
||||
if let Some(delete_op) = delete_cursor.get() {
|
||||
if delete_op.opstamp > target_opstamp {
|
||||
@@ -300,25 +302,29 @@ fn index_documents(
|
||||
|
||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
||||
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
|
||||
if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
let segment_entry: SegmentEntry = if delete_cursor.get().is_some() {
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
SegmentEntry::new(segment_meta, delete_cursor, {
|
||||
if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
} else {
|
||||
// if there are no delete operation in the queue, no need
|
||||
// to even open the segment.
|
||||
SegmentEntry::new(segment_meta, delete_cursor, None)
|
||||
};
|
||||
Ok(segment_updater.add_segment(generation, segment_entry))
|
||||
}
|
||||
|
||||
@@ -334,13 +340,16 @@ impl IndexWriter {
|
||||
join_handle
|
||||
.join()
|
||||
.expect("Indexing Worker thread panicked")
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?;
|
||||
.map_err(|_| {
|
||||
TantivyError::ErrorInThread("Error in indexing worker thread.".into())
|
||||
})?;
|
||||
}
|
||||
drop(self.workers_join_handle);
|
||||
|
||||
let result = self.segment_updater
|
||||
let result = self
|
||||
.segment_updater
|
||||
.wait_merging_thread()
|
||||
.chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into()));
|
||||
.map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));
|
||||
|
||||
if let Err(ref e) = result {
|
||||
error!("Some merging thread failed {:?}", e);
|
||||
@@ -380,7 +389,7 @@ impl IndexWriter {
|
||||
let mem_budget = self.heap_size_in_bytes_per_thread;
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!(
|
||||
"indexing thread {} for gen {}",
|
||||
"thrd-tantivy-index{}-gen{}",
|
||||
self.worker_id, generation
|
||||
))
|
||||
.spawn(move || {
|
||||
@@ -458,10 +467,8 @@ impl IndexWriter {
|
||||
///
|
||||
/// Returns the former segment_ready channel.
|
||||
fn recreate_document_channel(&mut self) -> DocumentReceiver {
|
||||
let (mut document_sender, mut document_receiver): (
|
||||
DocumentSender,
|
||||
DocumentReceiver,
|
||||
) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
swap(&mut self.document_sender, &mut document_sender);
|
||||
swap(&mut self.document_receiver, &mut document_receiver);
|
||||
document_receiver
|
||||
@@ -485,7 +492,8 @@ impl IndexWriter {
|
||||
let document_receiver = self.document_receiver.clone();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
let directory_lock = self._directory_lock
|
||||
let directory_lock = self
|
||||
._directory_lock
|
||||
.take()
|
||||
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
|
||||
|
||||
@@ -550,16 +558,13 @@ impl IndexWriter {
|
||||
// and recreate a new one channels.
|
||||
self.recreate_document_channel();
|
||||
|
||||
let mut former_workers_join_handle = Vec::new();
|
||||
swap(
|
||||
&mut former_workers_join_handle,
|
||||
&mut self.workers_join_handle,
|
||||
);
|
||||
let former_workers_join_handle =
|
||||
mem::replace(&mut self.workers_join_handle, Vec::new());
|
||||
|
||||
for worker_handle in former_workers_join_handle {
|
||||
let indexing_worker_result = worker_handle
|
||||
.join()
|
||||
.map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?;
|
||||
.map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
|
||||
|
||||
indexing_worker_result?;
|
||||
// add a new worker for the next generation.
|
||||
@@ -633,7 +638,10 @@ impl IndexWriter {
|
||||
pub fn add_document(&mut self, document: Document) -> u64 {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let add_operation = AddOperation { opstamp, document };
|
||||
self.document_sender.send(add_operation);
|
||||
let send_result = self.document_sender.send(add_operation);
|
||||
if let Err(e) = send_result {
|
||||
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
|
||||
}
|
||||
opstamp
|
||||
}
|
||||
}
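
The panic added to `add_document` exists because the crossbeam channel's `send` returns a `Result`: once every receiver (i.e. every indexing worker) is gone, sending fails instead of blocking forever. A minimal sketch of that behaviour using the crossbeam-channel crate directly:

```rust
extern crate crossbeam_channel;

use crossbeam_channel::bounded;

fn main() {
    let (sender, receiver) = bounded::<u64>(4);
    sender.send(1).expect("receiver still alive");
    assert_eq!(receiver.recv(), Ok(1));

    drop(receiver);
    // With no receiver left, send fails rather than blocking.
    assert!(sender.send(2).is_err());
}
```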
|
||||
@@ -642,7 +650,6 @@ impl IndexWriter {
|
||||
mod tests {
|
||||
|
||||
use super::initial_table_size;
|
||||
use env_logger;
|
||||
use error::*;
|
||||
use indexer::NoMergePolicy;
|
||||
use schema::{self, Document};
|
||||
@@ -651,18 +658,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_lockfile_stops_duplicates() {
|
||||
let schema_builder = schema::SchemaBuilder::default();
|
||||
let schema_builder = schema::Schema::builder();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let _index_writer = index.writer(40_000_000).unwrap();
|
||||
match index.writer(40_000_000) {
|
||||
Err(Error(ErrorKind::FileAlreadyExists(_), _)) => {}
|
||||
Err(TantivyError::LockFailure(_)) => {}
|
||||
_ => panic!("Expected FileAlreadyExists error"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lockfile_already_exists_error_msg() {
|
||||
let schema_builder = schema::Schema::builder();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
match index.writer_with_num_threads(1, 3_000_000) {
|
||||
Err(err) => {
|
||||
let err_msg = err.to_string();
|
||||
assert!(err_msg.contains("Lockfile"));
|
||||
assert!(err_msg.contains("Possible causes:"))
|
||||
}
|
||||
_ => panic!("Expected LockfileAlreadyExists error"),
|
||||
}
|
||||
}
|
||||
|
||||
    #[test]
    fn test_set_merge_policy() {
        let schema_builder = schema::SchemaBuilder::default();
        let schema_builder = schema::Schema::builder();
        let index = Index::create_in_ram(schema_builder.build());
        let index_writer = index.writer(40_000_000).unwrap();
        assert_eq!(
@@ -680,7 +702,7 @@ mod tests {

    #[test]
    fn test_lockfile_released_on_drop() {
        let schema_builder = schema::SchemaBuilder::default();
        let schema_builder = schema::Schema::builder();
        let index = Index::create_in_ram(schema_builder.build());
        {
            let _index_writer = index.writer(40_000_000).unwrap();
@@ -692,7 +714,7 @@ mod tests {

    #[test]
    fn test_commit_and_rollback() {
        let mut schema_builder = schema::SchemaBuilder::default();
        let mut schema_builder = schema::Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_in_ram(schema_builder.build());

@@ -714,7 +736,7 @@ mod tests {
            index_writer.add_document(doc!(text_field=>"b"));
            index_writer.add_document(doc!(text_field=>"c"));
        }
        assert_eq!(index_writer.commit().unwrap(), 2u64);
        assert_eq!(index_writer.commit().unwrap(), 3u64);
        index.load_searchers().unwrap();
        assert_eq!(num_docs_containing("a"), 0);
        assert_eq!(num_docs_containing("b"), 1);
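As the assertions suggest, commit returns the opstamp of the commit point, and documents added since the last commit can be discarded with rollback. A hedged sketch in the style of these tests (same imports as the test module; field name and sizes are illustrative):

let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();

index_writer.add_document(doc!(text_field => "kept"));
// The returned opstamp identifies this commit point.
let _commit_opstamp = index_writer.commit().unwrap();

index_writer.add_document(doc!(text_field => "discarded"));
// rollback drops everything added since the last successful commit.
index_writer.rollback().unwrap();

index.load_searchers().unwrap();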
@@ -726,8 +748,7 @@ mod tests {

    #[test]
    fn test_with_merges() {
        let _ = env_logger::init();
        let mut schema_builder = schema::SchemaBuilder::default();
        let mut schema_builder = schema::Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_in_ram(schema_builder.build());
        let num_docs_containing = |s: &str| {
@@ -764,8 +785,7 @@ mod tests {

    #[test]
    fn test_prepare_with_commit_message() {
        let _ = env_logger::init();
        let mut schema_builder = schema::SchemaBuilder::default();
        let mut schema_builder = schema::Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_in_ram(schema_builder.build());

@@ -779,7 +799,6 @@ mod tests {
        {
            let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
            prepared_commit.set_payload("first commit");
            assert_eq!(prepared_commit.opstamp(), 100);
            prepared_commit.commit().expect("commit failed");
        }
        {
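For context, the two-phase commit API exercised here: prepare_commit returns a prepared commit that can carry a payload and is then either committed or aborted. A rough sketch in the style of these tests (same imports as the test module; field name and payload are illustrative):

let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field => "a"));

{
    // Phase 1: flush the pending documents without publishing them yet.
    let mut prepared_commit = index_writer.prepare_commit().expect("prepare failed");
    prepared_commit.set_payload("my checkpoint");
    // Phase 2: publish ...
    prepared_commit.commit().expect("commit failed");
    // ... or call prepared_commit.abort() instead to drop the prepared state.
}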
@@ -799,8 +818,7 @@ mod tests {

    #[test]
    fn test_prepare_but_rollback() {
        let _ = env_logger::init();
        let mut schema_builder = schema::SchemaBuilder::default();
        let mut schema_builder = schema::Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_in_ram(schema_builder.build());

@@ -814,7 +832,6 @@ mod tests {
        {
            let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
            prepared_commit.set_payload("first commit");
            assert_eq!(prepared_commit.opstamp(), 100);
            prepared_commit.abort().expect("commit failed");
        }
        {
@@ -844,4 +861,32 @@ mod tests {
        assert_eq!(initial_table_size(1_000_000_000), 19);
    }

    #[cfg(not(feature = "no_fail"))]
    #[test]
    fn test_write_commit_fails() {
        use fail;
        let mut schema_builder = schema::Schema::builder();
        let text_field = schema_builder.add_text_field("text", schema::TEXT);
        let index = Index::create_in_ram(schema_builder.build());

        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        for _ in 0..100 {
            index_writer.add_document(doc!(text_field => "a"));
        }
        index_writer.commit().unwrap();
        fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
        for _ in 0..100 {
            index_writer.add_document(doc!(text_field => "b"));
        }
        assert!(index_writer.commit().is_err());
        index.load_searchers().unwrap();
        let num_docs_containing = |s: &str| {
            let searcher = index.searcher();
            let term_a = Term::from_field_text(text_field, s);
            searcher.doc_freq(&term_a)
        };
        assert_eq!(num_docs_containing("a"), 100);
        assert_eq!(num_docs_containing("b"), 0);
        fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
    }
}

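The test above flips a fail point named "RAMDirectory::atomic_write". With the fail crate, a fail point is a named hook in library code whose behaviour tests can override at runtime. A self-contained sketch of the mechanism only; the function and fail-point name below are illustrative, not tantivy's actual code:

#[macro_use]
extern crate fail;

use std::io;

fn atomic_write_stub() -> io::Result<()> {
    // Tests can force an early error return here via fail::cfg(...).
    fail_point!("example::atomic_write", |_| {
        Err(io::Error::new(io::ErrorKind::Other, "injected write failure"))
    });
    Ok(())
}

fn main() {
    assert!(atomic_write_stub().is_ok());
    fail::cfg("example::atomic_write", "return(boom)").unwrap();
    assert!(atomic_write_stub().is_err());
    fail::cfg("example::atomic_write", "off").unwrap();
    assert!(atomic_write_stub().is_ok());
}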
@@ -80,10 +80,6 @@ impl MergePolicy for LogMergePolicy {
            .map(|ind_vec| MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect()))
            .collect()
    }

    fn box_clone(&self) -> Box<MergePolicy> {
        Box::new(self.clone())
    }
}

impl Default for LogMergePolicy {

@@ -11,18 +11,31 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
///
/// Every time a the list of segments changes, the segment updater
/// asks the merge policy if some segments should be merged.
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug {
    /// Given the list of segment metas, returns the list of merge candidates.
    ///
    /// This call happens on the segment updater thread, and will block
    /// other segment updates, so all implementations should happen rapidly.
    fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
}

/// MergePolicyClone
pub trait MergePolicyClone {
    /// Returns a boxed clone of the MergePolicy.
    fn box_clone(&self) -> Box<MergePolicy>;
}

impl<T> MergePolicyClone for T
where
    T: 'static + MergePolicy + Clone,
{
    fn box_clone(&self) -> Box<MergePolicy> {
        Box::new(self.clone())
    }
}
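With the blanket MergePolicyClone impl above, a merge policy only needs to derive Clone (in addition to Debug, plus being Send + Sync) to be usable as a boxed trait object; it no longer has to write box_clone by hand. A hedged sketch of a custom policy inside this module (the policy itself is illustrative, not part of tantivy):

/// Illustrative policy: merge every segment into one candidate whenever
/// there are at least `min_segments` segments.
#[derive(Debug, Clone)]
struct MergeAllPolicy {
    min_segments: usize,
}

impl MergePolicy for MergeAllPolicy {
    fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
        if segments.len() >= self.min_segments {
            vec![MergeCandidate(segments.iter().map(|meta| meta.id()).collect())]
        } else {
            Vec::new()
        }
    }
    // box_clone is supplied by the blanket `impl<T> MergePolicyClone for T`.
}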

/// Never merge segments.
#[derive(Debug)]
#[derive(Debug, Clone)]
pub struct NoMergePolicy;

impl Default for NoMergePolicy {
@@ -35,10 +48,6 @@ impl MergePolicy for NoMergePolicy {
    fn compute_merge_candidates(&self, _segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
        Vec::new()
    }

    fn box_clone(&self) -> Box<MergePolicy> {
        Box::new(NoMergePolicy)
    }
}

#[cfg(test)]
@@ -52,7 +61,7 @@ pub mod tests {
    ///
    /// Everytime there is more than one segment,
    /// it will suggest to merge them.
    #[derive(Debug)]
    #[derive(Debug, Clone)]
    pub struct MergeWheneverPossible;

    impl MergePolicy for MergeWheneverPossible {
@@ -67,9 +76,5 @@ pub mod tests {
                vec![]
            }
        }

        fn box_clone(&self) -> Box<MergePolicy> {
            Box::new(MergeWheneverPossible)
        }
    }
}

Some files were not shown because too many files have changed in this diff.