mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 04:52:55 +00:00
Compare commits
79 Commits
0.9
...
python-bin
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a7c579f5c9 | ||
|
|
f2e546bdff | ||
|
|
efd1af1325 | ||
|
|
c91eb7fba7 | ||
|
|
6eb4e08636 | ||
|
|
c3231ca252 | ||
|
|
7211df6719 | ||
|
|
f27ce6412c | ||
|
|
8197a9921f | ||
|
|
b0e23b5715 | ||
|
|
0167151f5b | ||
|
|
0668949390 | ||
|
|
94d0e52786 | ||
|
|
818a0abbee | ||
|
|
4e6dcf3cbe | ||
|
|
af7ea1422a | ||
|
|
498057c5b7 | ||
|
|
5095e6b010 | ||
|
|
1aebc87ee3 | ||
|
|
9fb5058b29 | ||
|
|
158e0a28ba | ||
|
|
3576a006f7 | ||
|
|
80c25ae9f3 | ||
|
|
4867be3d3b | ||
|
|
697c7e721d | ||
|
|
3e368d92cb | ||
|
|
0bc2c64a53 | ||
|
|
35236c8634 | ||
|
|
462774b15c | ||
|
|
185a5b8d31 | ||
|
|
73d7791479 | ||
|
|
f52b1e68d1 | ||
|
|
3e0907fe05 | ||
|
|
ab4a8916d3 | ||
|
|
bcd7386fc5 | ||
|
|
c23a7c992b | ||
|
|
2a88094ec4 | ||
|
|
ca3cfddab4 | ||
|
|
7bd9f9773b | ||
|
|
e2da92fcb5 | ||
|
|
876e1451c4 | ||
|
|
a37d2f9777 | ||
|
|
4822940b19 | ||
|
|
d590f4c6b0 | ||
|
|
edfa619519 | ||
|
|
96f194635f | ||
|
|
444662485f | ||
|
|
943c25d0f8 | ||
|
|
5c0b2a4579 | ||
|
|
9870a9258d | ||
|
|
7102b363f5 | ||
|
|
66b4615e4e | ||
|
|
da46913839 | ||
|
|
3df037961f | ||
|
|
8ffae47854 | ||
|
|
1a90a1f3b0 | ||
|
|
dac50c6aeb | ||
|
|
31b22c5acc | ||
|
|
8e50921363 | ||
|
|
96a4f503ec | ||
|
|
9df288b0c9 | ||
|
|
b7c2d0de97 | ||
|
|
62445e0ec8 | ||
|
|
a228825462 | ||
|
|
d3eabd14bc | ||
|
|
c967031d21 | ||
|
|
d823163d52 | ||
|
|
c4f59f202d | ||
|
|
acd29b535d | ||
|
|
2cd31bcda2 | ||
|
|
99870de55c | ||
|
|
cad2d91845 | ||
|
|
79f3cd6cf4 | ||
|
|
e3abb4481b | ||
|
|
bfa61d2f2f | ||
|
|
6c0e621fdb | ||
|
|
a8cc5208f1 | ||
|
|
83eb0d0cb7 | ||
|
|
ee6e273365 |
32
.travis.yml
32
.travis.yml
@@ -10,7 +10,7 @@ env:
|
||||
global:
|
||||
- CRATE_NAME=tantivy
|
||||
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
|
||||
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
# - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
|
||||
addons:
|
||||
apt:
|
||||
@@ -29,7 +29,7 @@ addons:
|
||||
matrix:
|
||||
include:
|
||||
# Android
|
||||
- env: TARGET=aarch64-linux-android
|
||||
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
|
||||
#- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
|
||||
#- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
|
||||
#- env: TARGET=i686-linux-android DISABLE_TESTS=1
|
||||
@@ -38,12 +38,11 @@ matrix:
|
||||
# Linux
|
||||
#- env: TARGET=aarch64-unknown-linux-gnu
|
||||
#- env: TARGET=i686-unknown-linux-gnu
|
||||
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1
|
||||
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 #UPLOAD_DOCS=1
|
||||
# - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
|
||||
|
||||
# OSX
|
||||
- env: TARGET=x86_64-apple-darwin
|
||||
os: osx
|
||||
#- env: TARGET=x86_64-apple-darwin
|
||||
# os: osx
|
||||
|
||||
before_install:
|
||||
- set -e
|
||||
@@ -52,6 +51,7 @@ before_install:
|
||||
install:
|
||||
- sh ci/install.sh
|
||||
- source ~/.cargo/env || true
|
||||
- env | grep "TRAVIS"
|
||||
|
||||
before_script:
|
||||
- export PATH=$HOME/.cargo/bin:$PATH
|
||||
@@ -64,10 +64,20 @@ script:
|
||||
before_deploy:
|
||||
- sh ci/before_deploy.sh
|
||||
|
||||
cache: cargo
|
||||
before_cache:
|
||||
# Travis can't cache files that are not readable by "others"
|
||||
- chmod -R a+r $HOME/.cargo
|
||||
after_success:
|
||||
# Needs GH_TOKEN env var to be set in travis settings
|
||||
- if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
|
||||
- if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
|
||||
|
||||
#cache: cargo
|
||||
#before_cache:
|
||||
# # Travis can't cache files that are not readable by "others"
|
||||
# - chmod -R a+r $HOME/.cargo
|
||||
# - find ./target/debug -type f -maxdepth 1 -delete
|
||||
# - rm -f ./target/.rustc_info.json
|
||||
# - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
|
||||
# - rm -r target/debug/examples/
|
||||
# - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
|
||||
|
||||
#branches:
|
||||
# only:
|
||||
@@ -77,4 +87,4 @@ before_cache:
|
||||
|
||||
notifications:
|
||||
email:
|
||||
on_success: never
|
||||
on_success: never
|
||||
|
||||
87
CHANGELOG.md
87
CHANGELOG.md
@@ -1,3 +1,61 @@
|
||||
Tantivy 0.11.0
|
||||
=====================
|
||||
|
||||
- Added f64 field. Internally reuse u64 code the same way i64 does (@fdb-hiroshima)
|
||||
|
||||
Tantivy 0.10.1
|
||||
=====================
|
||||
|
||||
- Closes #544. A few users experienced problems with the directory watching system.
|
||||
Avoid watching the mmap directory until someone effectively creates a reader that uses
|
||||
this functionality.
|
||||
|
||||
|
||||
Tantivy 0.10.0
|
||||
=====================
|
||||
|
||||
*Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.*
|
||||
|
||||
- Added an API to easily tweak or entirely replace the
|
||||
default score. See `TopDocs::tweak_score`and `TopScore::custom_score` (@pmasurel)
|
||||
- Added an ASCII folding filter (@drusellers)
|
||||
- Bugfix in `query.count` in presence of deletes (@pmasurel)
|
||||
- Added `.explain(...)` in `Query` and `Weight` to (@pmasurel)
|
||||
- Added an efficient way to `delete_all_documents` in `IndexWriter` (@petr-tik).
|
||||
All segments are simply removed.
|
||||
|
||||
Minor
|
||||
---------
|
||||
- Switched to Rust 2018 (@uvd)
|
||||
- Small simplification of the code.
|
||||
Calling .freq() or .doc() when .advance() has never been called
|
||||
on segment postings should panic from now on.
|
||||
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
|
||||
- Fast fields are now preloaded when the `SegmentReader` is created.
|
||||
- `IndexMeta` is now public. (@hntd187)
|
||||
- `IndexWriter` `add_document`, `delete_term`. `IndexWriter` is `Sync`, making it possible to use it with a `
|
||||
Arc<RwLock<IndexWriter>>`. `add_document` and `delete_term` can
|
||||
only require a read lock. (@pmasurel)
|
||||
- Introducing `Opstamp` as an expressive type alias for `u64`. (@petr-tik)
|
||||
- Stamper now relies on `AtomicU64` on all platforms (@petr-tik)
|
||||
- Bugfix - Files get deleted slightly earlier
|
||||
- Compilation resources improved (@fdb-hiroshima)
|
||||
|
||||
## How to update?
|
||||
|
||||
Your program should be usable as is.
|
||||
|
||||
### Fast fields
|
||||
|
||||
Fast fields used to be accessed directly from the `SegmentReader`.
|
||||
The API changed, you are now required to acquire your fast field reader via the
|
||||
`segment_reader.fast_fields()`, and use one of the typed method:
|
||||
- `.u64()`, `.i64()` if your field is single-valued ;
|
||||
- `.u64s()`, `.i64s()` if your field is multi-valued ;
|
||||
- `.bytes()` if your field is bytes fast field.
|
||||
|
||||
|
||||
|
||||
Tantivy 0.9.0
|
||||
=====================
|
||||
*0.9.0 index format is not compatible with the
|
||||
@@ -15,6 +73,35 @@ previous index format.*
|
||||
for int fields. (@fulmicoton)
|
||||
- Added DateTime field (@barrotsteindev)
|
||||
- Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
|
||||
- SIMD linear search within blocks (@fulmicoton)
|
||||
|
||||
## How to update ?
|
||||
|
||||
tantivy 0.9 brought some API breaking change.
|
||||
To update from tantivy 0.8, you will need to go through the following steps.
|
||||
|
||||
- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::INT_STORED`.
|
||||
- The index now does not hold the pool of searcher anymore. You are required to create an intermediary object called
|
||||
`IndexReader` for this.
|
||||
|
||||
```rust
|
||||
// create the reader. You typically need to create 1 reader for the entire
|
||||
// lifetime of you program.
|
||||
let reader = index.reader()?;
|
||||
|
||||
// Acquire a searcher (previously `index.searcher()`) is now written:
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// With the default setting of the reader, you are not required to
|
||||
// call `index.load_searchers()` anymore.
|
||||
//
|
||||
// The IndexReader will pick up that change automatically, regardless
|
||||
// of whether the update was done in a different process or not.
|
||||
// If this behavior is not wanted, you can create your reader with
|
||||
// the `ReloadPolicy::Manual`, and manually decide when to reload the index
|
||||
// by calling `reader.reload()?`.
|
||||
|
||||
```
|
||||
|
||||
|
||||
Tantivy 0.8.2
|
||||
|
||||
37
Cargo.toml
37
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.9.0"
|
||||
version = "0.10.1"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -10,11 +10,12 @@ homepage = "https://github.com/tantivy-search/tantivy"
|
||||
repository = "https://github.com/tantivy-search/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.10.0"
|
||||
byteorder = "1.0"
|
||||
lazy_static = "1"
|
||||
once_cell = "0.2"
|
||||
regex = "1.0"
|
||||
tantivy-fst = "0.1"
|
||||
memmap = {version = "0.7", optional=true}
|
||||
@@ -23,7 +24,7 @@ snap = {version="0.2"}
|
||||
atomicwrites = {version="0.2.2", optional=true}
|
||||
tempfile = "3.0"
|
||||
log = "0.4"
|
||||
combine = "3"
|
||||
combine = ">=3.6.0,<4.0.0"
|
||||
tempdir = "0.3"
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
@@ -42,22 +43,23 @@ owning_ref = "0.4"
|
||||
stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "1.1"
|
||||
downcast-rs = { version="1.0" }
|
||||
bitpacking = "0.6"
|
||||
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
|
||||
census = "0.2"
|
||||
fnv = "1.0.6"
|
||||
owned-read = "0.4"
|
||||
failure = "0.1"
|
||||
htmlescape = "0.3.1"
|
||||
fail = "0.2"
|
||||
fail = "0.3"
|
||||
scoped-pool = "1.0"
|
||||
murmurhash32 = "0.2"
|
||||
chrono = "0.4"
|
||||
smallvec = "0.6"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
winapi = "0.3"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.6"
|
||||
rand = "0.7"
|
||||
maplit = "1"
|
||||
matches = "0.1.8"
|
||||
time = "0.1.42"
|
||||
@@ -72,13 +74,28 @@ debug-assertions = true
|
||||
overflow-checks = true
|
||||
|
||||
[features]
|
||||
# by default no-fail is disabled. We manually enable it when running test.
|
||||
default = ["mmap", "no_fail"]
|
||||
default = ["mmap"]
|
||||
mmap = ["atomicwrites", "fs2", "memmap", "notify"]
|
||||
lz4-compression = ["lz4"]
|
||||
no_fail = ["fail/no_fail"]
|
||||
failpoints = ["fail/failpoints"]
|
||||
unstable = [] # useful for benches.
|
||||
wasm-bindgen = ["uuid/wasm-bindgen"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
[dev-dependencies.fail]
|
||||
features = ["failpoints"]
|
||||
|
||||
|
||||
# Following the "fail" crate best practises, we isolate
|
||||
# tests that define specific behavior in fail check points
|
||||
# in a different binary.
|
||||
#
|
||||
# We do that because, fail rely on a global definition of
|
||||
# failpoints behavior and hence, it is incompatible with
|
||||
# multithreading.
|
||||
[[test]]
|
||||
name = "failpoints"
|
||||
path = "tests/failpoints/mod.rs"
|
||||
required-features = ["fail/failpoints"]
|
||||
72
README.md
72
README.md
@@ -4,6 +4,7 @@
|
||||
[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
|
||||
[](https://crates.io/crates/tantivy)
|
||||
[](https://saythanks.io/to/fulmicoton)
|
||||
|
||||

|
||||
@@ -17,6 +18,7 @@
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
|
||||
|
||||
[](https://www.patreon.com/fulmicoton)
|
||||
|
||||
|
||||
**Tantivy** is a **full text search engine library** written in rust.
|
||||
@@ -27,6 +29,14 @@ to build such a search engine.
|
||||
|
||||
Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
|
||||
# Benchmark
|
||||
|
||||
Tantivy is typically faster than Lucene, but the results will depend on
|
||||
the nature of the queries in your workload.
|
||||
|
||||
The following [benchmark](https://tantivy-search.github.io/bench/) break downs
|
||||
performance for different type of queries / collection.
|
||||
|
||||
# Features
|
||||
|
||||
- Full-text search
|
||||
@@ -40,9 +50,9 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
|
||||
- Mmap directory
|
||||
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
|
||||
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
|
||||
- Single valued and multivalued u64, i64 and f64 fast fields (equivalent of doc values in Lucene)
|
||||
- `&[u8]` fast fields
|
||||
- Text, i64, u64, dates and hierarchical facet fields
|
||||
- Text, i64, u64, f64, dates and hierarchical facet fields
|
||||
- LZ4 compressed document store
|
||||
- Range queries
|
||||
- Faceted search
|
||||
@@ -55,38 +65,74 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
library upon which one could build a distributed search. Serializable/mergeable collector state for instance,
|
||||
are within the scope of tantivy.
|
||||
|
||||
|
||||
# Supported OS and compiler
|
||||
|
||||
Tantivy works on stable rust (>= 1.27) and supports Linux, MacOS and Windows.
|
||||
|
||||
# Getting started
|
||||
|
||||
- [tantivy's simple search example](http://fulmicoton.com/tantivy-examples/simple_search.html)
|
||||
- [tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
|
||||
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
|
||||
`tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
|
||||
index documents and search via the CLI or a small server with a REST API.
|
||||
It will walk you through getting a wikipedia search engine up and running in a few minutes.
|
||||
- [reference doc]
|
||||
- [For the last released version](https://docs.rs/tantivy/)
|
||||
- [For the last master branch](https://tantivy-search.github.io/tantivy/tantivy/index.html)
|
||||
- [reference doc for the last released version](https://docs.rs/tantivy/)
|
||||
|
||||
# Compiling
|
||||
# How can I support this project?
|
||||
|
||||
## Development
|
||||
There are many ways to support this project.
|
||||
|
||||
- Use tantivy and tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
|
||||
- Report bugs
|
||||
- Write a blog post
|
||||
- Help with documentation by asking questions or submitting PRs
|
||||
- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
|
||||
- Talk about tantivy around you
|
||||
- Drop a word on on [](https://saythanks.io/to/fulmicoton) or even [](https://www.patreon.com/fulmicoton)
|
||||
|
||||
# Contributing code
|
||||
|
||||
We use the GitHub Pull Request workflow - reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
|
||||
|
||||
## Clone and build locally
|
||||
|
||||
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
|
||||
To check out and run tests, you can simply run :
|
||||
|
||||
```bash
|
||||
git clone https://github.com/tantivy-search/tantivy.git
|
||||
cd tantivy
|
||||
cargo build
|
||||
```
|
||||
|
||||
## Running tests
|
||||
## Run tests
|
||||
|
||||
Some tests will not run with just `cargo test` because of `fail-rs`.
|
||||
To run the tests exhaustively, run `./run-tests.sh`.
|
||||
To run the tests exhaustively, run `./run-tests.sh`
|
||||
|
||||
# Contribute
|
||||
## Debug
|
||||
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
You might find it useful to step through the programme with a debugger.
|
||||
|
||||
### A failing test
|
||||
|
||||
Make sure you haven't run `cargo clean` after the most recent `cargo test` or `cargo build` to guarantee that `target/` dir exists. Use this bash script to find the most name of the most recent debug build of tantivy and run it under rust-gdb.
|
||||
|
||||
```bash
|
||||
find target/debug/ -maxdepth 1 -executable -type f -name "tantivy*" -printf '%TY-%Tm-%Td %TT %p\n' | sort -r | cut -d " " -f 3 | xargs -I RECENT_DBG_TANTIVY rust-gdb RECENT_DBG_TANTIVY
|
||||
```
|
||||
|
||||
Now that you are in rust-gdb, you can set breakpoints on lines and methods that match your source-code and run the debug executable with flags that you normally pass to `cargo test` to like this
|
||||
|
||||
```bash
|
||||
$gdb run --test-threads 1 --test $NAME_OF_TEST
|
||||
```
|
||||
|
||||
### An example
|
||||
|
||||
By default, rustc compiles everything in the `examples/` dir in debug mode. This makes it easy for you to make examples to reproduce bugs.
|
||||
|
||||
```bash
|
||||
rust-gdb target/debug/examples/$EXAMPLE_NAME
|
||||
$ gdb run
|
||||
```
|
||||
|
||||
@@ -18,5 +18,5 @@ install:
|
||||
build: false
|
||||
|
||||
test_script:
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap
|
||||
- REM SET RUST_BACKTRACE=1 & cargo build --examples
|
||||
|
||||
@@ -10,8 +10,6 @@
|
||||
// - search for the best document matchings "sea whale"
|
||||
// - retrieve the best document original content.
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
|
||||
@@ -7,8 +7,6 @@
|
||||
// Of course, you can have a look at the tantivy's built-in collectors
|
||||
// such as the `CountCollector` for more examples.
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
@@ -18,8 +16,8 @@ use tantivy::fastfield::FastFieldReader;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
||||
use tantivy::Index;
|
||||
use tantivy::SegmentReader;
|
||||
use tantivy::{Index, TantivyError};
|
||||
|
||||
#[derive(Default)]
|
||||
struct Stats {
|
||||
@@ -75,9 +73,18 @@ impl Collector for StatsCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: u32,
|
||||
segment: &SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> tantivy::Result<StatsSegmentCollector> {
|
||||
let fast_field_reader = segment.fast_field_reader(self.field)?;
|
||||
let fast_field_reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(self.field)
|
||||
.ok_or_else(|| {
|
||||
let field_name = segment_reader.schema().get_field_name(self.field);
|
||||
TantivyError::SchemaError(format!(
|
||||
"Field {:?} is not a u64 fast field.",
|
||||
field_name
|
||||
))
|
||||
})?;
|
||||
Ok(StatsSegmentCollector {
|
||||
fast_field_reader,
|
||||
stats: Stats::default(),
|
||||
|
||||
@@ -10,8 +10,6 @@
|
||||
// - search for the best document matchings "sea whale"
|
||||
// - retrieve the best document original content.
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
|
||||
105
examples/multiple_producer.rs
Normal file
105
examples/multiple_producer.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
// # Indexing from different threads.
|
||||
//
|
||||
// It is fairly common to have to index from different threads.
|
||||
// Tantivy forbids to create more than one `IndexWriter` at a time.
|
||||
//
|
||||
// This `IndexWriter` itself has its own multithreaded layer, so managing your own
|
||||
// indexing threads will not help. However, it can still be useful for some applications.
|
||||
//
|
||||
// For instance, if preparing documents to send to tantivy before indexing is the bottleneck of
|
||||
// your application, it is reasonable to have multiple threads.
|
||||
//
|
||||
// Another very common reason to want to index from multiple threads, is implementing a webserver
|
||||
// with CRUD capabilities. The server framework will most likely handle request from
|
||||
// different threads.
|
||||
//
|
||||
// The recommended way to address both of these use case is to wrap your `IndexWriter` into a
|
||||
// `Arc<RwLock<IndexWriter>>`.
|
||||
//
|
||||
// While this is counterintuitive, adding and deleting documents do not require mutability
|
||||
// over the `IndexWriter`, so several threads will be able to do this operation concurrently.
|
||||
//
|
||||
// The example below does not represent an actual real-life use case (who would spawn thread to
|
||||
// index a single document?), but aims at demonstrating the mechanism that makes indexing
|
||||
// from several threads possible.
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tantivy::schema::{Schema, STORED, TEXT};
|
||||
use tantivy::Opstamp;
|
||||
use tantivy::{Index, IndexWriter};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||
let body = schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let index_writer: Arc<RwLock<IndexWriter>> = Arc::new(RwLock::new(index.writer(50_000_000)?));
|
||||
|
||||
// # First indexing thread.
|
||||
let index_writer_clone_1 = index_writer.clone();
|
||||
thread::spawn(move || {
|
||||
// we index 100 times the document... for the sake of the example.
|
||||
for i in 0..100 {
|
||||
let opstamp = {
|
||||
// A read lock is sufficient here.
|
||||
let index_writer_rlock = index_writer_clone_1.read().unwrap();
|
||||
index_writer_rlock.add_document(
|
||||
doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
))
|
||||
};
|
||||
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(20));
|
||||
}
|
||||
});
|
||||
|
||||
// # Second indexing thread.
|
||||
let index_writer_clone_2 = index_writer.clone();
|
||||
// For convenience, tantivy also comes with a macro to
|
||||
// reduce the boilerplate above.
|
||||
thread::spawn(move || {
|
||||
// we index 100 times the document... for the sake of the example.
|
||||
for i in 0..100 {
|
||||
// A read lock is sufficient here.
|
||||
let opstamp = {
|
||||
let index_writer_rlock = index_writer_clone_2.read().unwrap();
|
||||
index_writer_rlock.add_document(doc!(
|
||||
title => "Manufacturing consent",
|
||||
body => "Some great book description..."
|
||||
))
|
||||
};
|
||||
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(10));
|
||||
}
|
||||
});
|
||||
|
||||
// # In the main thread, we commit 10 times, once every 500ms.
|
||||
for _ in 0..10 {
|
||||
let opstamp: Opstamp = {
|
||||
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
|
||||
let mut index_writer_wlock = index_writer.write().unwrap();
|
||||
index_writer_wlock.commit().unwrap()
|
||||
};
|
||||
println!("committed with opstamp {}", opstamp);
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -4,7 +4,6 @@
|
||||
// your hit result.
|
||||
// Snippet are an extracted of a target document, and returned in HTML format.
|
||||
// The keyword searched by the user are highlighted with a `<b>` tag.
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
|
||||
@@ -9,8 +9,6 @@
|
||||
// - add a few stop words
|
||||
// - index few documents in our index
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
extern crate tantivy;
|
||||
use tantivy;
|
||||
use tantivy::schema::*;
|
||||
|
||||
// # Document from json
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
#!/bin/bash
|
||||
cargo test --no-default-features --features mmap -- --test-threads 1
|
||||
cargo test
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use super::Collector;
|
||||
use collector::SegmentCollector;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::DocId;
|
||||
use crate::Result;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentReader;
|
||||
|
||||
/// `CountCollector` collector only counts how many
|
||||
/// documents match the query.
|
||||
@@ -94,8 +94,8 @@ impl SegmentCollector for SegmentCountCollector {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{Count, SegmentCountCollector};
|
||||
use collector::Collector;
|
||||
use collector::SegmentCollector;
|
||||
use crate::collector::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
|
||||
#[test]
|
||||
fn test_count_collect_does_not_requires_scoring() {
|
||||
|
||||
126
src/collector/custom_score_top_collector.rs
Normal file
126
src/collector/custom_score_top_collector.rs
Normal file
@@ -0,0 +1,126 @@
|
||||
use crate::collector::top_collector::{TopCollector, TopSegmentCollector};
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::Result;
|
||||
use crate::{DocAddress, DocId, Score, SegmentReader};
|
||||
|
||||
pub(crate) struct CustomScoreTopCollector<TCustomScorer, TScore = Score> {
|
||||
custom_scorer: TCustomScorer,
|
||||
collector: TopCollector<TScore>,
|
||||
}
|
||||
|
||||
impl<TCustomScorer, TScore> CustomScoreTopCollector<TCustomScorer, TScore>
|
||||
where
|
||||
TScore: Clone + PartialOrd,
|
||||
{
|
||||
pub fn new(
|
||||
custom_scorer: TCustomScorer,
|
||||
limit: usize,
|
||||
) -> CustomScoreTopCollector<TCustomScorer, TScore> {
|
||||
CustomScoreTopCollector {
|
||||
custom_scorer,
|
||||
collector: TopCollector::with_limit(limit),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A custom segment scorer makes it possible to define any kind of score
|
||||
/// for a given document belonging to a specific segment.
|
||||
///
|
||||
/// It is the segment local version of the [`CustomScorer`](./trait.CustomScorer.html).
|
||||
pub trait CustomSegmentScorer<TScore>: 'static {
|
||||
/// Computes the score of a specific `doc`.
|
||||
fn score(&self, doc: DocId) -> TScore;
|
||||
}
|
||||
|
||||
/// `CustomScorer` makes it possible to define any kind of score.
|
||||
///
|
||||
/// The `CustomerScorer` itself does not make much of the computation itself.
|
||||
/// Instead, it helps constructing `Self::Child` instances that will compute
|
||||
/// the score at a segment scale.
|
||||
pub trait CustomScorer<TScore>: Sync {
|
||||
/// Type of the associated [`CustomSegmentScorer`](./trait.CustomSegmentScorer.html).
|
||||
type Child: CustomSegmentScorer<TScore>;
|
||||
/// Builds a child scorer for a specific segment. The child scorer is associated to
|
||||
/// a specific segment.
|
||||
fn segment_scorer(&self, segment_reader: &SegmentReader) -> Result<Self::Child>;
|
||||
}
|
||||
|
||||
impl<TCustomScorer, TScore> Collector for CustomScoreTopCollector<TCustomScorer, TScore>
|
||||
where
|
||||
TCustomScorer: CustomScorer<TScore>,
|
||||
TScore: 'static + PartialOrd + Clone + Send + Sync,
|
||||
{
|
||||
type Fruit = Vec<(TScore, DocAddress)>;
|
||||
|
||||
type Child = CustomScoreTopSegmentCollector<TCustomScorer::Child, TScore>;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Result<Self::Child> {
|
||||
let segment_scorer = self.custom_scorer.segment_scorer(segment_reader)?;
|
||||
let segment_collector = self
|
||||
.collector
|
||||
.for_segment(segment_local_id, segment_reader)?;
|
||||
Ok(CustomScoreTopSegmentCollector {
|
||||
segment_collector,
|
||||
segment_scorer,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
||||
self.collector.merge_fruits(segment_fruits)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CustomScoreTopSegmentCollector<T, TScore>
|
||||
where
|
||||
TScore: 'static + PartialOrd + Clone + Send + Sync + Sized,
|
||||
T: CustomSegmentScorer<TScore>,
|
||||
{
|
||||
segment_collector: TopSegmentCollector<TScore>,
|
||||
segment_scorer: T,
|
||||
}
|
||||
|
||||
impl<T, TScore> SegmentCollector for CustomScoreTopSegmentCollector<T, TScore>
|
||||
where
|
||||
TScore: 'static + PartialOrd + Clone + Send + Sync,
|
||||
T: 'static + CustomSegmentScorer<TScore>,
|
||||
{
|
||||
type Fruit = Vec<(TScore, DocAddress)>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
let score = self.segment_scorer.score(doc);
|
||||
self.segment_collector.collect(doc, score);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Vec<(TScore, DocAddress)> {
|
||||
self.segment_collector.harvest()
|
||||
}
|
||||
}
|
||||
|
||||
impl<F, TScore, T> CustomScorer<TScore> for F
|
||||
where
|
||||
F: 'static + Send + Sync + Fn(&SegmentReader) -> T,
|
||||
T: CustomSegmentScorer<TScore>,
|
||||
{
|
||||
type Child = T;
|
||||
|
||||
fn segment_scorer(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
|
||||
Ok((self)(segment_reader))
|
||||
}
|
||||
}
|
||||
|
||||
impl<F, TScore> CustomSegmentScorer<TScore> for F
|
||||
where
|
||||
F: 'static + Sync + Send + Fn(DocId) -> TScore,
|
||||
{
|
||||
fn score(&self, doc: DocId) -> TScore {
|
||||
(self)(doc)
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,15 @@
|
||||
use collector::Collector;
|
||||
use collector::SegmentCollector;
|
||||
use docset::SkipResult;
|
||||
use fastfield::FacetReader;
|
||||
use schema::Facet;
|
||||
use schema::Field;
|
||||
use crate::collector::Collector;
|
||||
use crate::collector::SegmentCollector;
|
||||
use crate::docset::SkipResult;
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::Field;
|
||||
use crate::DocId;
|
||||
use crate::Result;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentReader;
|
||||
use crate::TantivyError;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::btree_map;
|
||||
use std::collections::BTreeMap;
|
||||
@@ -12,11 +18,6 @@ use std::collections::BinaryHeap;
|
||||
use std::collections::Bound;
|
||||
use std::iter::Peekable;
|
||||
use std::{u64, usize};
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
@@ -26,13 +27,13 @@ struct Hit<'a> {
|
||||
impl<'a> Eq for Hit<'a> {}
|
||||
|
||||
impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
|
||||
fn eq(&self, other: &Hit) -> bool {
|
||||
fn eq(&self, other: &Hit<'_>) -> bool {
|
||||
self.count == other.count
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
|
||||
fn partial_cmp(&self, other: &Hit) -> Option<Ordering> {
|
||||
fn partial_cmp(&self, other: &Hit<'_>) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
@@ -264,7 +265,10 @@ impl Collector for FacetCollector {
|
||||
_: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<FacetSegmentCollector> {
|
||||
let facet_reader = reader.facet_reader(self.field)?;
|
||||
let field_name = reader.schema().get_field_name(self.field);
|
||||
let facet_reader = reader.facet_reader(self.field).ok_or_else(|| {
|
||||
TantivyError::SchemaError(format!("Field {:?} is not a facet field.", field_name))
|
||||
})?;
|
||||
|
||||
let mut collapse_mapping = Vec::new();
|
||||
let mut counts = Vec::new();
|
||||
@@ -394,7 +398,7 @@ impl<'a> Iterator for FacetChildIterator<'a> {
|
||||
}
|
||||
|
||||
impl FacetCounts {
|
||||
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator
|
||||
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_>
|
||||
where
|
||||
Facet: From<T>,
|
||||
{
|
||||
@@ -408,7 +412,8 @@ impl FacetCounts {
|
||||
let facet_after = Facet::from_encoded_string(facet_after_bytes);
|
||||
Bound::Excluded(facet_after)
|
||||
};
|
||||
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
|
||||
let underlying: btree_map::Range<'_, _, _> =
|
||||
self.facet_counts.range((left_bound, right_bound));
|
||||
FacetChildIterator { underlying }
|
||||
}
|
||||
|
||||
@@ -449,12 +454,12 @@ impl FacetCounts {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{FacetCollector, FacetCounts};
|
||||
use core::Index;
|
||||
use query::AllQuery;
|
||||
use crate::core::Index;
|
||||
use crate::query::AllQuery;
|
||||
use crate::schema::{Document, Facet, Field, Schema};
|
||||
use rand::distributions::Uniform;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::{thread_rng, Rng};
|
||||
use schema::{Document, Facet, Field, Schema};
|
||||
use std::iter;
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -82,6 +82,7 @@ mod tests {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
|
||||
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
|
||||
let num_field_f64 = schema_builder.add_f64_field("num_f64", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", STRING);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
@@ -94,6 +95,7 @@ mod tests {
|
||||
index_writer.add_document(doc!(
|
||||
num_field_i64 => ((i as i64) % 3i64) as i64,
|
||||
num_field_u64 => (i % 2u64) as u64,
|
||||
num_field_f64 => (i % 4u64) as f64,
|
||||
text_field => "text"
|
||||
));
|
||||
}
|
||||
@@ -104,10 +106,11 @@ mod tests {
|
||||
let searcher = index.reader().searcher();
|
||||
let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
|
||||
let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);
|
||||
let mut ffvf_f64: IntFacetCollector<F64FastFieldReader> = IntFacetCollector::new(num_field_f64);
|
||||
|
||||
{
|
||||
// perform the query
|
||||
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
|
||||
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64).push(&mut ffvf_f64);
|
||||
let mut query_parser = QueryParser::for_index(index, vec![text_field]);
|
||||
let query = query_parser.parse_query("text:text").unwrap();
|
||||
query.search(&searcher, &mut facet_collectors).unwrap();
|
||||
@@ -117,6 +120,8 @@ mod tests {
|
||||
assert_eq!(ffvf_u64.counters[&1], 5);
|
||||
assert_eq!(ffvf_i64.counters[&0], 4);
|
||||
assert_eq!(ffvf_i64.counters[&1], 3);
|
||||
assert_eq!(ffvf_f64.counters[&0.0], 3);
|
||||
assert_eq!(ffvf_f64.counters[&2.0], 2);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,7 +66,7 @@ let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
|
||||
|
||||
The `Collector` trait is implemented for up to 4 collectors.
|
||||
If you have more than 4 collectors, you can either group them into
|
||||
tuples of tuples `(a,(b,(c,d)))`, or rely on `MultiCollector`'s.
|
||||
tuples of tuples `(a,(b,(c,d)))`, or rely on [`MultiCollector`](./struct.MultiCollector.html).
|
||||
|
||||
# Combining several collectors dynamically
|
||||
|
||||
@@ -85,12 +85,12 @@ See the `custom_collector` example.
|
||||
|
||||
*/
|
||||
|
||||
use downcast_rs;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use crate::DocId;
|
||||
use crate::Result;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentReader;
|
||||
use downcast_rs::impl_downcast;
|
||||
|
||||
mod count_collector;
|
||||
pub use self::count_collector::Count;
|
||||
@@ -103,8 +103,11 @@ mod top_collector;
|
||||
mod top_score_collector;
|
||||
pub use self::top_score_collector::TopDocs;
|
||||
|
||||
mod top_field_collector;
|
||||
pub use self::top_field_collector::TopDocsByField;
|
||||
mod custom_score_top_collector;
|
||||
pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
|
||||
|
||||
mod tweak_score_top_collector;
|
||||
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
|
||||
|
||||
mod facet_collector;
|
||||
pub use self::facet_collector::FacetCollector;
|
||||
|
||||
@@ -1,29 +1,30 @@
|
||||
use super::Collector;
|
||||
use super::SegmentCollector;
|
||||
use collector::Fruit;
|
||||
use crate::collector::Fruit;
|
||||
use crate::DocId;
|
||||
use crate::Result;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentReader;
|
||||
use crate::TantivyError;
|
||||
use std::marker::PhantomData;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use TantivyError;
|
||||
use std::ops::Deref;
|
||||
|
||||
pub struct MultiFruit {
|
||||
sub_fruits: Vec<Option<Box<Fruit>>>,
|
||||
sub_fruits: Vec<Option<Box<dyn Fruit>>>,
|
||||
}
|
||||
|
||||
pub struct CollectorWrapper<TCollector: Collector>(TCollector);
|
||||
|
||||
impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
||||
type Fruit = Box<Fruit>;
|
||||
type Child = Box<BoxableSegmentCollector>;
|
||||
type Fruit = Box<dyn Fruit>;
|
||||
type Child = Box<dyn BoxableSegmentCollector>;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: u32,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Box<BoxableSegmentCollector>> {
|
||||
) -> Result<Box<dyn BoxableSegmentCollector>> {
|
||||
let child = self.0.for_segment(segment_local_id, reader)?;
|
||||
Ok(Box::new(SegmentCollectorWrapper(child)))
|
||||
}
|
||||
@@ -32,7 +33,7 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
||||
self.0.requires_scoring()
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<Fruit>> {
|
||||
fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<dyn Fruit>> {
|
||||
let typed_fruit: Vec<TCollector::Fruit> = children
|
||||
.into_iter()
|
||||
.map(|untyped_fruit| {
|
||||
@@ -49,21 +50,21 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for Box<BoxableSegmentCollector> {
|
||||
type Fruit = Box<Fruit>;
|
||||
impl SegmentCollector for Box<dyn BoxableSegmentCollector> {
|
||||
type Fruit = Box<dyn Fruit>;
|
||||
|
||||
fn collect(&mut self, doc: u32, score: f32) {
|
||||
self.as_mut().collect(doc, score);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Box<Fruit> {
|
||||
fn harvest(self) -> Box<dyn Fruit> {
|
||||
BoxableSegmentCollector::harvest_from_box(self)
|
||||
}
|
||||
}
|
||||
|
||||
pub trait BoxableSegmentCollector {
|
||||
fn collect(&mut self, doc: u32, score: f32);
|
||||
fn harvest_from_box(self: Box<Self>) -> Box<Fruit>;
|
||||
fn harvest_from_box(self: Box<Self>) -> Box<dyn Fruit>;
|
||||
}
|
||||
|
||||
pub struct SegmentCollectorWrapper<TSegmentCollector: SegmentCollector>(TSegmentCollector);
|
||||
@@ -75,7 +76,7 @@ impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
|
||||
self.0.collect(doc, score);
|
||||
}
|
||||
|
||||
fn harvest_from_box(self: Box<Self>) -> Box<Fruit> {
|
||||
fn harvest_from_box(self: Box<Self>) -> Box<dyn Fruit> {
|
||||
Box::new(self.0.harvest())
|
||||
}
|
||||
}
|
||||
@@ -156,8 +157,9 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
#[allow(clippy::type_complexity)]
|
||||
#[derive(Default)]
|
||||
pub struct MultiCollector<'a> {
|
||||
collector_wrappers:
|
||||
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
|
||||
collector_wrappers: Vec<
|
||||
Box<dyn Collector<Child = Box<dyn BoxableSegmentCollector>, Fruit = Box<dyn Fruit>> + 'a>,
|
||||
>,
|
||||
}
|
||||
|
||||
impl<'a> MultiCollector<'a> {
|
||||
@@ -199,11 +201,14 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collector_wrappers.iter().any(|c| c.requires_scoring())
|
||||
self.collector_wrappers
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.any(Collector::requires_scoring)
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
|
||||
let mut segment_fruits_list: Vec<Vec<Box<Fruit>>> = (0..self.collector_wrappers.len())
|
||||
let mut segment_fruits_list: Vec<Vec<Box<dyn Fruit>>> = (0..self.collector_wrappers.len())
|
||||
.map(|_| Vec::with_capacity(segments_multifruits.len()))
|
||||
.collect::<Vec<_>>();
|
||||
for segment_multifruit in segments_multifruits {
|
||||
@@ -226,7 +231,7 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
}
|
||||
|
||||
pub struct MultiCollectorChild {
|
||||
children: Vec<Box<BoxableSegmentCollector>>,
|
||||
children: Vec<Box<dyn BoxableSegmentCollector>>,
|
||||
}
|
||||
|
||||
impl SegmentCollector for MultiCollectorChild {
|
||||
@@ -253,12 +258,12 @@ impl SegmentCollector for MultiCollectorChild {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::{Count, TopDocs};
|
||||
use query::TermQuery;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Schema, TEXT};
|
||||
use Index;
|
||||
use Term;
|
||||
use crate::collector::{Count, TopDocs};
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
|
||||
#[test]
|
||||
fn test_multi_collector() {
|
||||
|
||||
@@ -1,12 +1,20 @@
|
||||
use super::*;
|
||||
use core::SegmentReader;
|
||||
use fastfield::BytesFastFieldReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use schema::Field;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fastfield::BytesFastFieldReader;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Field;
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
|
||||
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
|
||||
compute_score: true,
|
||||
};
|
||||
|
||||
pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
|
||||
compute_score: true,
|
||||
};
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
@@ -14,7 +22,9 @@ use SegmentLocalId;
|
||||
///
|
||||
/// actise, as it does not store
|
||||
/// the segment ordinals
|
||||
pub struct TestCollector;
|
||||
pub struct TestCollector {
|
||||
pub compute_score: bool,
|
||||
}
|
||||
|
||||
pub struct TestSegmentCollector {
|
||||
segment_id: SegmentLocalId,
|
||||
@@ -32,7 +42,6 @@ impl TestFruit {
|
||||
pub fn docs(&self) -> &[DocAddress] {
|
||||
&self.docs[..]
|
||||
}
|
||||
|
||||
pub fn scores(&self) -> &[Score] {
|
||||
&self.scores[..]
|
||||
}
|
||||
@@ -54,7 +63,7 @@ impl Collector for TestCollector {
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
self.compute_score
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, mut children: Vec<TestFruit>) -> Result<TestFruit> {
|
||||
@@ -114,11 +123,15 @@ impl Collector for FastFieldTestCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Result<FastFieldSegmentCollector> {
|
||||
let reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(self.field)
|
||||
.expect("Requested field is not a fast field.");
|
||||
Ok(FastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader: reader.fast_field_reader(self.field)?,
|
||||
reader,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -170,11 +183,14 @@ impl Collector for BytesFastFieldTestCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: u32,
|
||||
segment: &SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Result<BytesFastFieldSegmentCollector> {
|
||||
Ok(BytesFastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader: segment.bytes_fast_field_reader(self.field)?,
|
||||
reader: segment_reader
|
||||
.fast_fields()
|
||||
.bytes(self.field)
|
||||
.expect("Field is not a bytes fast field."),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -191,7 +207,7 @@ impl SegmentCollector for BytesFastFieldSegmentCollector {
|
||||
type Fruit = Vec<u8>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let data = self.reader.get_val(doc);
|
||||
let data = self.reader.get_bytes(doc);
|
||||
self.vals.extend(data);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
use crate::Result;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentReader;
|
||||
use serde::export::PhantomData;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
use DocAddress;
|
||||
use DocId;
|
||||
use Result;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
/// Contains a feature (field, score, etc.) of a document along with the document address.
|
||||
///
|
||||
@@ -98,11 +98,11 @@ where
|
||||
.collect())
|
||||
}
|
||||
|
||||
pub(crate) fn for_segment(
|
||||
pub(crate) fn for_segment<F: PartialOrd>(
|
||||
&self,
|
||||
segment_id: SegmentLocalId,
|
||||
_: &SegmentReader,
|
||||
) -> Result<TopSegmentCollector<T>> {
|
||||
) -> Result<TopSegmentCollector<F>> {
|
||||
Ok(TopSegmentCollector::new(segment_id, self.limit))
|
||||
}
|
||||
}
|
||||
@@ -177,9 +177,8 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{TopCollector, TopSegmentCollector};
|
||||
use DocAddress;
|
||||
use Score;
|
||||
use super::TopSegmentCollector;
|
||||
use crate::DocAddress;
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
@@ -215,10 +214,4 @@ mod tests {
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_top_0() {
|
||||
let _collector: TopCollector<Score> = TopCollector::with_limit(0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,257 +0,0 @@
|
||||
use super::Collector;
|
||||
use collector::top_collector::TopCollector;
|
||||
use collector::top_collector::TopSegmentCollector;
|
||||
use collector::SegmentCollector;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::FastValue;
|
||||
use schema::Field;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
|
||||
/// The Top Field Collector keeps track of the K documents
|
||||
/// sorted by a fast field in the index
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// # use tantivy::schema::{Schema, Field, FAST, TEXT};
|
||||
/// # use tantivy::{Index, Result, DocAddress};
|
||||
/// # use tantivy::query::{Query, QueryParser};
|
||||
/// use tantivy::Searcher;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
/// # let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// # let rating = schema_builder.add_u64_field("rating", FAST);
|
||||
/// # let schema = schema_builder.build();
|
||||
/// # let index = Index::create_in_ram(schema);
|
||||
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// # index_writer.add_document(doc!(
|
||||
/// # title => "The Name of the Wind",
|
||||
/// # rating => 92u64,
|
||||
/// # ));
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
|
||||
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
|
||||
/// # index_writer.commit()?;
|
||||
/// # let reader = index.reader()?;
|
||||
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
|
||||
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
|
||||
/// # assert_eq!(top_docs,
|
||||
/// # vec![(97u64, DocAddress(0u32, 1)),
|
||||
/// # (80u64, DocAddress(0u32, 3))]);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// #
|
||||
/// /// Searches the document matching the given query, and
|
||||
/// /// collects the top 10 documents, order by the `field`
|
||||
/// /// given in argument.
|
||||
/// ///
|
||||
/// /// `field` is required to be a FAST field.
|
||||
/// fn docs_sorted_by_rating(searcher: &Searcher,
|
||||
/// query: &Query,
|
||||
/// sort_by_field: Field)
|
||||
/// -> Result<Vec<(u64, DocAddress)>> {
|
||||
///
|
||||
/// // This is where we build our collector!
|
||||
/// let top_docs_by_rating = TopDocs::with_limit(2).order_by_field(sort_by_field);
|
||||
///
|
||||
/// // ... and here is our documents. Not this is a simple vec.
|
||||
/// // The `u64` in the pair is the value of our fast field for each documents.
|
||||
/// searcher.search(query, &top_docs_by_rating)
|
||||
/// }
|
||||
/// ```
|
||||
pub struct TopDocsByField<T> {
|
||||
collector: TopCollector<T>,
|
||||
field: Field,
|
||||
}
|
||||
|
||||
impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
|
||||
/// Creates a top field collector, with a number of documents equal to "limit".
|
||||
///
|
||||
/// The given field name must be a fast field, otherwise the collector have an error while
|
||||
/// collecting results.
|
||||
///
|
||||
/// This constructor is crate-private. Client are supposed to call
|
||||
/// build `TopDocsByField` object using the `TopDocs` API.
|
||||
///
|
||||
/// e.g.:
|
||||
/// `TopDocs::with_limit(2).order_by_field(sort_by_field)`
|
||||
///
|
||||
/// # Panics
|
||||
/// The method panics if limit is 0
|
||||
pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
|
||||
TopDocsByField {
|
||||
collector: TopCollector::with_limit(limit),
|
||||
field,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
|
||||
type Fruit = Vec<(T, DocAddress)>;
|
||||
|
||||
type Child = TopFieldSegmentCollector<T>;
|
||||
|
||||
fn for_segment(
|
||||
&self,
|
||||
segment_local_id: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<TopFieldSegmentCollector<T>> {
|
||||
let collector = self.collector.for_segment(segment_local_id, reader)?;
|
||||
let reader = reader.fast_field_reader(self.field)?;
|
||||
Ok(TopFieldSegmentCollector { collector, reader })
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
&self,
|
||||
segment_fruits: Vec<Vec<(T, DocAddress)>>,
|
||||
) -> Result<Vec<(T, DocAddress)>> {
|
||||
self.collector.merge_fruits(segment_fruits)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TopFieldSegmentCollector<T: FastValue + PartialOrd> {
|
||||
collector: TopSegmentCollector<T>,
|
||||
reader: FastFieldReader<T>,
|
||||
}
|
||||
|
||||
impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
|
||||
for TopFieldSegmentCollector<T>
|
||||
{
|
||||
type Fruit = Vec<(T, DocAddress)>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let field_value = self.reader.get(doc);
|
||||
self.collector.collect(doc, field_value);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Vec<(T, DocAddress)> {
|
||||
self.collector.harvest()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::TopDocsByField;
|
||||
use collector::Collector;
|
||||
use collector::TopDocs;
|
||||
use query::Query;
|
||||
use query::QueryParser;
|
||||
use schema::Field;
|
||||
use schema::IntOptions;
|
||||
use schema::{Schema, FAST, TEXT};
|
||||
use DocAddress;
|
||||
use Index;
|
||||
use IndexWriter;
|
||||
use TantivyError;
|
||||
|
||||
const TITLE: &str = "title";
|
||||
const SIZE: &str = "size";
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_not_at_capacity() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||
let schema = schema_builder.build();
|
||||
let (index, query) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "growler of beer",
|
||||
size => 64u64,
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "pint of beer",
|
||||
size => 16u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
|
||||
let top_collector = TopDocs::with_limit(4).order_by_field(size);
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
|
||||
assert_eq!(
|
||||
top_docs,
|
||||
vec![
|
||||
(64, DocAddress(0, 1)),
|
||||
(16, DocAddress(0, 2)),
|
||||
(12, DocAddress(0, 0))
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_field_does_not_exist() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||
let schema = schema_builder.build();
|
||||
let (index, _) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
top_collector
|
||||
.for_segment(0, segment_reader)
|
||||
.expect("should panic");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_field_not_fast_field() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, IntOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let (index, _) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
|
||||
assert_matches!(
|
||||
top_collector
|
||||
.for_segment(0, segment)
|
||||
.map(|_| ())
|
||||
.unwrap_err(),
|
||||
TantivyError::FastFieldError(_)
|
||||
);
|
||||
}
|
||||
|
||||
fn index(
|
||||
query: &str,
|
||||
query_field: Field,
|
||||
schema: Schema,
|
||||
mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
|
||||
) -> (Index, Box<Query>) {
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
doc_adder(&mut index_writer);
|
||||
index_writer.commit().unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![query_field]);
|
||||
let query = query_parser.parse_query(query).unwrap();
|
||||
(index, query)
|
||||
}
|
||||
}
|
||||
@@ -1,16 +1,19 @@
use super::Collector;
use collector::top_collector::TopCollector;
use collector::top_collector::TopSegmentCollector;
use collector::SegmentCollector;
use collector::TopDocsByField;
use fastfield::FastValue;
use schema::Field;
use DocAddress;
use DocId;
use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
use crate::collector::top_collector::TopCollector;
use crate::collector::top_collector::TopSegmentCollector;
use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{
    CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
};
use crate::schema::Field;
use crate::DocAddress;
use crate::DocId;
use crate::Result;
use crate::Score;
use crate::SegmentLocalId;
use crate::SegmentReader;
use std::fmt;

/// The Top Score Collector keeps track of the K documents
/// sorted by their score.
@@ -66,6 +69,12 @@ use SegmentReader;
/// ```
pub struct TopDocs(TopCollector<Score>);

impl fmt::Debug for TopDocs {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "TopDocs({})", self.0.limit())
    }
}

impl TopDocs {
    /// Creates a top score collector, with a number of documents equal to "limit".
    ///
@@ -77,13 +86,312 @@ impl TopDocs {
|
||||
|
||||
/// Set top-K to rank documents by a given fast field.
|
||||
///
|
||||
/// (By default, `TopDocs` collects the top-K documents sorted by
|
||||
/// the similarity score.)
|
||||
pub fn order_by_field<T: PartialOrd + FastValue + Clone>(
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// # use tantivy::schema::{Schema, FAST, TEXT};
|
||||
/// # use tantivy::{Index, Result, DocAddress};
|
||||
/// # use tantivy::query::{Query, QueryParser};
|
||||
/// use tantivy::Searcher;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::schema::Field;
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
/// # let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// # let rating = schema_builder.add_u64_field("rating", FAST);
|
||||
/// # let schema = schema_builder.build();
|
||||
/// #
|
||||
/// # let index = Index::create_in_ram(schema);
|
||||
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// # index_writer.add_document(doc!(
|
||||
/// # title => "The Name of the Wind",
|
||||
/// # rating => 92u64,
|
||||
/// # ));
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
|
||||
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
|
||||
/// # index_writer.commit()?;
|
||||
/// # let reader = index.reader()?;
|
||||
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
|
||||
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
|
||||
/// # assert_eq!(top_docs,
|
||||
/// # vec![(97u64, DocAddress(0u32, 1)),
|
||||
/// # (80u64, DocAddress(0u32, 3))]);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
///
|
||||
///
|
||||
/// /// Searches the documents matching the given query, and
/// /// collects the top 10 documents, ordered by the u64-`field`
/// /// given in argument.
|
||||
/// ///
|
||||
/// /// `field` is required to be a FAST field.
|
||||
/// fn docs_sorted_by_rating(searcher: &Searcher,
|
||||
/// query: &Query,
|
||||
/// sort_by_field: Field)
|
||||
/// -> Result<Vec<(u64, DocAddress)>> {
|
||||
///
|
||||
/// // This is where we build our topdocs collector
|
||||
/// //
|
||||
/// // Note the generics parameter that needs to match the
|
||||
/// // type `sort_by_field`.
|
||||
/// let top_docs_by_rating = TopDocs
|
||||
/// ::with_limit(10)
|
||||
/// .order_by_u64_field(sort_by_field);
|
||||
///
|
||||
/// // ... and here are our documents. Note this is a simple vec.
|
||||
/// // The `u64` in the pair is the value of our fast field for
|
||||
/// // each document.
|
||||
/// //
|
||||
/// // The vec is sorted decreasingly by `sort_by_field`, and has a
|
||||
/// // length of 10, or less if not enough documents matched the
|
||||
/// // query.
|
||||
/// let resulting_docs: Vec<(u64, DocAddress)> =
|
||||
/// searcher.search(query, &top_docs_by_rating)?;
|
||||
///
|
||||
/// Ok(resulting_docs)
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if the field requested is not a fast field.
|
||||
///
|
||||
pub fn order_by_u64_field(
|
||||
self,
|
||||
field: Field,
|
||||
) -> TopDocsByField<T> {
|
||||
TopDocsByField::new(field, self.0.limit())
|
||||
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
|
||||
self.custom_score(move |segment_reader: &SegmentReader| {
|
||||
let ff_reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(field)
|
||||
.expect("Field requested is not a i64/u64 fast field.");
|
||||
// TODO: error message mismatches actual behavior for i64
|
||||
move |doc: DocId| ff_reader.get(doc)
|
||||
})
|
||||
}
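// A usage sketch for the method above (hypothetical `rating` FAST u64 field): it is
// equivalent to going through `custom_score` with a fast-field closure.
//
//     let top_docs_by_rating = TopDocs::with_limit(10).order_by_u64_field(rating);
//     let hits: Vec<(u64, DocAddress)> = searcher.search(&query, &top_docs_by_rating)?;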
|
||||
|
||||
/// Ranks the documents using a custom score.
|
||||
///
|
||||
/// This method offers a convenient way to tweak or replace
|
||||
/// the documents score. As suggested by the prototype you can
|
||||
/// manually define your own [`ScoreTweaker`](./trait.ScoreTweaker.html)
|
||||
/// and pass it as an argument, but there is a much simpler way to
|
||||
/// tweak your score: you can use a closure as in the following
|
||||
/// example.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// Typically, you will want to rely on one or more fast fields,
|
||||
/// to alter the original relevance `Score`.
|
||||
///
|
||||
/// For instance, in the following, we assume that we are implementing
|
||||
/// an e-commerce website that has a fast field called `popularity`
|
||||
/// that reflects how often a product is bought by users.
|
||||
///
|
||||
/// In the following example we will tweak our ranking a bit by
|
||||
/// boosting popular products a notch.
|
||||
///
|
||||
/// In a more serious application, this tweaking could involve running a
/// learning-to-rank model over various features.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// # use tantivy::schema::{Schema, FAST, TEXT};
|
||||
/// # use tantivy::{Index, DocAddress, DocId, Score};
|
||||
/// # use tantivy::query::QueryParser;
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::schema::Field;
|
||||
///
|
||||
/// # fn create_schema() -> Schema {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
/// # schema_builder.add_text_field("product_name", TEXT);
|
||||
/// # schema_builder.add_u64_field("popularity", FAST);
|
||||
/// # schema_builder.build()
|
||||
/// # }
|
||||
/// #
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// # let schema = create_schema();
|
||||
/// # let index = Index::create_in_ram(schema);
|
||||
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// # let product_name = index.schema().get_field("product_name").unwrap();
|
||||
/// #
|
||||
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
|
||||
/// # index_writer.add_document(doc!(product_name => "The Diary of Muadib", popularity => 1u64));
|
||||
/// # index_writer.add_document(doc!(product_name => "A Dairy Cow", popularity => 10u64));
|
||||
/// # index_writer.add_document(doc!(product_name => "The Diary of a Young Girl", popularity => 15u64));
|
||||
/// # index_writer.commit()?;
|
||||
/// // ...
|
||||
/// # let user_query = "diary";
|
||||
/// # let query = QueryParser::for_index(&index, vec![product_name]).parse_query(user_query)?;
|
||||
///
|
||||
/// // This is where we build our collector with our custom score.
|
||||
/// let top_docs_by_custom_score = TopDocs
|
||||
/// ::with_limit(10)
|
||||
/// .tweak_score(move |segment_reader: &SegmentReader| {
|
||||
/// // The argument is a function that returns our scoring
|
||||
/// // function.
|
||||
/// //
|
||||
/// // The point of this "mother" function is to gather all
|
||||
/// // of the segment level information we need for scoring.
|
||||
/// // Typically, fast_fields.
|
||||
/// //
|
||||
/// // In our case, we will get a reader for the popularity
|
||||
/// // fast field.
|
||||
/// let popularity_reader =
|
||||
/// segment_reader.fast_fields().u64(popularity).unwrap();
|
||||
///
|
||||
/// // We can now define our actual scoring function
|
||||
/// move |doc: DocId, original_score: Score| {
|
||||
/// let popularity: u64 = popularity_reader.get(doc);
|
||||
/// // Well.. For the sake of the example we use a simple logarithm
|
||||
/// // function.
|
||||
/// let popularity_boost_score = ((2u64 + popularity) as f32).log2();
|
||||
/// popularity_boost_score * original_score
|
||||
/// }
|
||||
/// });
|
||||
/// # let reader = index.reader()?;
|
||||
/// # let searcher = reader.searcher();
|
||||
/// // ... and here are our documents. Note this is a simple vec.
|
||||
/// // The `Score` in the pair is our tweaked score.
|
||||
/// let resulting_docs: Vec<(Score, DocAddress)> =
|
||||
/// searcher.search(&*query, &top_docs_by_custom_score)?;
|
||||
///
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// # See also
|
||||
/// [custom_score(...)](#method.custom_score).
|
||||
pub fn tweak_score<TScore, TScoreSegmentTweaker, TScoreTweaker>(
|
||||
self,
|
||||
score_tweaker: TScoreTweaker,
|
||||
) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>>
|
||||
where
|
||||
TScore: 'static + Send + Sync + Clone + PartialOrd,
|
||||
TScoreSegmentTweaker: ScoreSegmentTweaker<TScore> + 'static,
|
||||
TScoreTweaker: ScoreTweaker<TScore, Child = TScoreSegmentTweaker>,
|
||||
{
|
||||
TweakedScoreTopCollector::new(score_tweaker, self.0.limit())
|
||||
}
|
||||
|
||||
/// Ranks the documents using a custom score.
|
||||
///
|
||||
/// This method offers a convenient way to use a different score.
|
||||
///
|
||||
/// As suggested by the prototype you can manually define your
|
||||
/// own [`CustomScorer`](./trait.CustomScorer.html)
|
||||
/// and pass it as an argument, but there is a much simpler way to
|
||||
/// tweak your score: you can use a closure as in the following
|
||||
/// example.
|
||||
///
|
||||
/// # Limitation
|
||||
///
|
||||
/// This method only makes it possible to compute the score from a given
|
||||
/// `DocId`, fastfield values for the doc and any information you could
|
||||
/// have precomputed beforehand. It does not make it possible, for instance,
|
||||
/// to compute something like TfIdf as it does not have access to the list of query
|
||||
/// terms present in the document, nor the term frequencies for the different terms.
|
||||
///
|
||||
/// It can be used if your search engine relies on a learning-to-rank model for instance,
|
||||
/// which does not rely on the term frequencies or positions as features.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// # #[macro_use]
|
||||
/// # extern crate tantivy;
|
||||
/// # use tantivy::schema::{Schema, FAST, TEXT};
|
||||
/// # use tantivy::{Index, DocAddress, DocId};
|
||||
/// # use tantivy::query::QueryParser;
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::schema::Field;
|
||||
///
|
||||
/// # fn create_schema() -> Schema {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
/// # schema_builder.add_text_field("product_name", TEXT);
|
||||
/// # schema_builder.add_u64_field("popularity", FAST);
|
||||
/// # schema_builder.add_u64_field("boosted", FAST);
|
||||
/// # schema_builder.build()
|
||||
/// # }
|
||||
/// #
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// # let schema = create_schema();
|
||||
/// # let index = Index::create_in_ram(schema);
|
||||
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// # let product_name = index.schema().get_field("product_name").unwrap();
|
||||
/// #
|
||||
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
|
||||
/// let boosted: Field = index.schema().get_field("boosted").unwrap();
|
||||
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
|
||||
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "A Dairy Cow", popularity => 10u64));
|
||||
/// # index_writer.add_document(doc!(boosted=>0u64, product_name => "The Diary of a Young Girl", popularity => 15u64));
|
||||
/// # index_writer.commit()?;
|
||||
/// // ...
|
||||
/// # let user_query = "diary";
|
||||
/// # let query = QueryParser::for_index(&index, vec![product_name]).parse_query(user_query)?;
|
||||
///
|
||||
/// // This is where we build our collector with our custom score.
|
||||
/// let top_docs_by_custom_score = TopDocs
|
||||
/// ::with_limit(10)
|
||||
/// .custom_score(move |segment_reader: &SegmentReader| {
|
||||
/// // The argument is a function that returns our scoring
|
||||
/// // function.
|
||||
/// //
|
||||
/// // The point of this "mother" function is to gather all
|
||||
/// // of the segment level information we need for scoring.
|
||||
/// // Typically, fast_fields.
|
||||
/// //
|
||||
/// // In our case, we will get a reader for the popularity
|
||||
/// // fast field and a boosted field.
|
||||
/// //
|
||||
/// // We want boosted items to score higher, and when there is
|
||||
/// // a tie, return the item with the highest popularity.
|
||||
/// //
|
||||
/// // Note that this is implemented by using a `(u64, u64)`
|
||||
/// // as a score.
|
||||
/// let popularity_reader =
|
||||
/// segment_reader.fast_fields().u64(popularity).unwrap();
|
||||
/// let boosted_reader =
|
||||
/// segment_reader.fast_fields().u64(boosted).unwrap();
|
||||
///
|
||||
/// // We can now define our actual scoring function
|
||||
/// move |doc: DocId| {
|
||||
/// let popularity: u64 = popularity_reader.get(doc);
|
||||
/// let boosted: u64 = boosted_reader.get(doc);
|
||||
/// // Scores do not have to be `f64` in tantivy.
|
||||
/// // Here we return a couple to get lexicographical order
|
||||
/// // for free.
|
||||
/// (boosted, popularity)
|
||||
/// }
|
||||
/// });
|
||||
/// # let reader = index.reader()?;
|
||||
/// # let searcher = reader.searcher();
|
||||
/// // ... and here are our documents. Note this is a simple vec.
|
||||
/// // The `Score` in the pair is our tweaked score.
|
||||
/// let resulting_docs: Vec<((u64, u64), DocAddress)> =
|
||||
/// searcher.search(&*query, &top_docs_by_custom_score)?;
|
||||
///
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
/// # See also
|
||||
/// [tweak_score(...)](#method.tweak_score).
|
||||
pub fn custom_score<TScore, TCustomSegmentScorer, TCustomScorer>(
|
||||
self,
|
||||
custom_score: TCustomScorer,
|
||||
) -> impl Collector<Fruit = Vec<(TScore, DocAddress)>>
|
||||
where
|
||||
TScore: 'static + Send + Sync + Clone + PartialOrd,
|
||||
TCustomSegmentScorer: CustomSegmentScorer<TScore> + 'static,
|
||||
TCustomScorer: CustomScorer<TScore, Child = TCustomSegmentScorer>,
|
||||
{
|
||||
CustomScoreTopCollector::new(custom_score, self.0.limit())
|
||||
}
|
||||
}
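// Quick contrast between the two methods above (a sketch only): `tweak_score` still
// receives the original relevance `Score` per document, while `custom_score` only
// receives the `DocId`. Assumes `popularity` is a FAST u64 field in scope.
//
//     let by_tweak = TopDocs::with_limit(10).tweak_score(move |sr: &SegmentReader| {
//         let pop = sr.fast_fields().u64(popularity).unwrap();
//         move |doc: DocId, score: Score| score * ((1u64 + pop.get(doc)) as f32)
//     });
//     let by_custom = TopDocs::with_limit(10).custom_score(move |sr: &SegmentReader| {
//         let pop = sr.fast_fields().u64(popularity).unwrap();
//         move |doc: DocId| pop.get(doc)
//     });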
|
||||
|
||||
@@ -128,12 +436,13 @@ impl SegmentCollector for TopScoreSegmentCollector {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::TopDocs;
|
||||
use query::QueryParser;
|
||||
use schema::Schema;
|
||||
use schema::TEXT;
|
||||
use DocAddress;
|
||||
use Index;
|
||||
use Score;
|
||||
use crate::collector::Collector;
|
||||
use crate::query::{Query, QueryParser};
|
||||
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
||||
use crate::DocAddress;
|
||||
use crate::Index;
|
||||
use crate::IndexWriter;
|
||||
use crate::Score;
|
||||
|
||||
fn make_index() -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -200,4 +509,97 @@ mod tests {
|
||||
TopDocs::with_limit(0);
|
||||
}
|
||||
|
||||
const TITLE: &str = "title";
|
||||
const SIZE: &str = "size";
|
||||
|
||||
#[test]
|
||||
fn test_top_field_collector_not_at_capacity() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||
let schema = schema_builder.build();
|
||||
let (index, query) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "growler of beer",
|
||||
size => 64u64,
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
title => "pint of beer",
|
||||
size => 16u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
|
||||
assert_eq!(
|
||||
top_docs,
|
||||
vec![
|
||||
(64, DocAddress(0, 1)),
|
||||
(16, DocAddress(0, 2)),
|
||||
(12, DocAddress(0, 0))
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_field_does_not_exist() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||
let schema = schema_builder.build();
|
||||
let (index, _) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field(2));
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
top_collector
|
||||
.for_segment(0, segment_reader)
|
||||
.expect("should panic");
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Field requested is not a i64/u64 fast field")]
|
||||
fn test_field_not_fast_field() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||
let size = schema_builder.add_u64_field(SIZE, STORED);
|
||||
let schema = schema_builder.build();
|
||||
let (index, _) = index("beer", title, schema, |index_writer| {
|
||||
index_writer.add_document(doc!(
|
||||
title => "bottle of beer",
|
||||
size => 12u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
|
||||
assert!(top_collector.for_segment(0, segment).is_ok());
|
||||
}
|
||||
|
||||
fn index(
|
||||
query: &str,
|
||||
query_field: Field,
|
||||
schema: Schema,
|
||||
mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
|
||||
) -> (Index, Box<Query>) {
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
doc_adder(&mut index_writer);
|
||||
index_writer.commit().unwrap();
|
||||
let query_parser = QueryParser::for_index(&index, vec![query_field]);
|
||||
let query = query_parser.parse_query(query).unwrap();
|
||||
(index, query)
|
||||
}
|
||||
|
||||
}
129
src/collector/tweak_score_top_collector.rs
Normal file
@@ -0,0 +1,129 @@
use crate::collector::top_collector::{TopCollector, TopSegmentCollector};
use crate::collector::{Collector, SegmentCollector};
use crate::DocAddress;
use crate::{DocId, Result, Score, SegmentReader};

pub(crate) struct TweakedScoreTopCollector<TScoreTweaker, TScore = Score> {
    score_tweaker: TScoreTweaker,
    collector: TopCollector<TScore>,
}

impl<TScoreTweaker, TScore> TweakedScoreTopCollector<TScoreTweaker, TScore>
where
    TScore: Clone + PartialOrd,
{
    pub fn new(
        score_tweaker: TScoreTweaker,
        limit: usize,
    ) -> TweakedScoreTopCollector<TScoreTweaker, TScore> {
        TweakedScoreTopCollector {
            score_tweaker,
            collector: TopCollector::with_limit(limit),
        }
    }
}

/// A `ScoreSegmentTweaker` makes it possible to modify the default score
/// for a given document belonging to a specific segment.
///
/// It is the segment local version of the [`ScoreTweaker`](./trait.ScoreTweaker.html).
pub trait ScoreSegmentTweaker<TScore>: 'static {
    /// Tweaks the given `score` for the document `doc`.
    fn score(&self, doc: DocId, score: Score) -> TScore;
}

/// `ScoreTweaker` makes it possible to tweak the score
/// emitted by the scorer into another one.
///
/// The `ScoreTweaker` itself does not do much of the computation.
/// Instead, it helps construct `Self::Child` instances that will compute
/// the score at a segment scale.
pub trait ScoreTweaker<TScore>: Sync {
    /// Type of the associated [`ScoreSegmentTweaker`](./trait.ScoreSegmentTweaker.html).
    type Child: ScoreSegmentTweaker<TScore>;

    /// Builds a child tweaker for a specific segment. The child scorer is associated
    /// with a specific segment.
    fn segment_tweaker(&self, segment_reader: &SegmentReader) -> Result<Self::Child>;
}
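// A hand-written `ScoreTweaker`, as an alternative to the closure form accepted by
// `TopDocs::tweak_score`. This is only a sketch: it additionally assumes
// `use crate::schema::Field;`, `use crate::fastfield::FastFieldReader;`, and a FAST u64
// "popularity" field.
//
//     struct PopularityBoost {
//         popularity: Field,
//     }
//
//     struct PopularityBoostSegment {
//         popularity_reader: FastFieldReader<u64>,
//     }
//
//     impl ScoreSegmentTweaker<Score> for PopularityBoostSegment {
//         fn score(&self, doc: DocId, score: Score) -> Score {
//             // Same boost as the doc example: log2(2 + popularity) * original score.
//             ((2u64 + self.popularity_reader.get(doc)) as f32).log2() * score
//         }
//     }
//
//     impl ScoreTweaker<Score> for PopularityBoost {
//         type Child = PopularityBoostSegment;
//
//         fn segment_tweaker(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
//             let popularity_reader = segment_reader
//                 .fast_fields()
//                 .u64(self.popularity)
//                 .expect("popularity must be declared as a FAST u64 field");
//             Ok(PopularityBoostSegment { popularity_reader })
//         }
//     }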

impl<TScoreTweaker, TScore> Collector for TweakedScoreTopCollector<TScoreTweaker, TScore>
where
    TScoreTweaker: ScoreTweaker<TScore>,
    TScore: 'static + PartialOrd + Clone + Send + Sync,
{
    type Fruit = Vec<(TScore, DocAddress)>;

    type Child = TopTweakedScoreSegmentCollector<TScoreTweaker::Child, TScore>;

    fn for_segment(
        &self,
        segment_local_id: u32,
        segment_reader: &SegmentReader,
    ) -> Result<Self::Child> {
        let segment_scorer = self.score_tweaker.segment_tweaker(segment_reader)?;
        let segment_collector = self
            .collector
            .for_segment(segment_local_id, segment_reader)?;
        Ok(TopTweakedScoreSegmentCollector {
            segment_collector,
            segment_scorer,
        })
    }

    fn requires_scoring(&self) -> bool {
        true
    }

    fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit> {
        self.collector.merge_fruits(segment_fruits)
    }
}

pub struct TopTweakedScoreSegmentCollector<TSegmentScoreTweaker, TScore>
where
    TScore: 'static + PartialOrd + Clone + Send + Sync + Sized,
    TSegmentScoreTweaker: ScoreSegmentTweaker<TScore>,
{
    segment_collector: TopSegmentCollector<TScore>,
    segment_scorer: TSegmentScoreTweaker,
}

impl<TSegmentScoreTweaker, TScore> SegmentCollector
    for TopTweakedScoreSegmentCollector<TSegmentScoreTweaker, TScore>
where
    TScore: 'static + PartialOrd + Clone + Send + Sync,
    TSegmentScoreTweaker: 'static + ScoreSegmentTweaker<TScore>,
{
    type Fruit = Vec<(TScore, DocAddress)>;

    fn collect(&mut self, doc: DocId, score: Score) {
        let score = self.segment_scorer.score(doc, score);
        self.segment_collector.collect(doc, score);
    }

    fn harvest(self) -> Vec<(TScore, DocAddress)> {
        self.segment_collector.harvest()
    }
}

impl<F, TScore, TSegmentScoreTweaker> ScoreTweaker<TScore> for F
where
    F: 'static + Send + Sync + Fn(&SegmentReader) -> TSegmentScoreTweaker,
    TSegmentScoreTweaker: ScoreSegmentTweaker<TScore>,
{
    type Child = TSegmentScoreTweaker;

    fn segment_tweaker(&self, segment_reader: &SegmentReader) -> Result<Self::Child> {
        Ok((self)(segment_reader))
    }
}

impl<F, TScore> ScoreSegmentTweaker<TScore> for F
where
    F: 'static + Sync + Send + Fn(DocId, Score) -> TScore,
{
    fn score(&self, doc: DocId, score: Score) -> TScore {
        (self)(doc, score)
    }
}
||||
@@ -5,7 +5,7 @@ use std::u64;
pub(crate) struct TinySet(u64);

impl fmt::Debug for TinySet {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.into_iter().collect::<Vec<u32>>().fmt(f)
    }
}
@@ -204,12 +204,12 @@ mod tests {

    use super::BitSet;
    use super::TinySet;
    use docset::DocSet;
    use query::BitSetDocSet;
    use crate::docset::DocSet;
    use crate::query::BitSetDocSet;
    use crate::tests;
    use crate::tests::generate_nonunique_unsorted;
    use std::collections::BTreeSet;
    use std::collections::HashSet;
    use tests;
    use tests::generate_nonunique_unsorted;

    #[test]
    fn test_tiny_set() {
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use common::VInt;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use space_usage::FieldUsage;
|
||||
use space_usage::PerFieldSpaceUsage;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::common::VInt;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use crate::space_usage::FieldUsage;
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::io::{self, Read};
|
||||
@@ -185,10 +185,10 @@ impl CompositeFile {
|
||||
mod test {
|
||||
|
||||
use super::{CompositeFile, CompositeWrite};
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use schema::Field;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::VInt;
|
||||
use crate::directory::{Directory, RAMDirectory};
|
||||
use crate::schema::Field;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
|
||||
|
||||
@@ -13,7 +13,10 @@ pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
pub use byteorder::LittleEndian as Endianness;

use std::io;
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than `MAX_DOC_LIMIT` documents.
pub const MAX_DOC_LIMIT: u32 = 1 << 31;

/// Computes the number of bits that will be used for bitpacking.
///
@@ -52,11 +55,6 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
    (n > 0) && (n & (n - 1) == 0)
}

/// Create a default io error given a string.
pub(crate) fn make_io_err(msg: String) -> io::Error {
    io::Error::new(io::ErrorKind::Other, msg)
}

/// Has length trait
pub trait HasLen {
    /// Return length
@@ -101,15 +99,54 @@ pub fn u64_to_i64(val: u64) -> i64 {
    (val ^ HIGHEST_BIT) as i64
}

/// Maps a `f64` to `u64`.
///
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` so that lexical order is preserved.
///
/// This is more suited than simply casting (`val as u64`),
/// which would truncate the result.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline(always)]
pub fn f64_to_u64(val: f64) -> u64 {
    let bits = val.to_bits();
    if val.is_sign_positive() {
        bits ^ HIGHEST_BIT
    } else {
        !bits
    }
}

/// Reverse the mapping given by [`f64_to_u64`](./fn.f64_to_u64.html).
#[inline(always)]
pub fn u64_to_f64(val: u64) -> f64 {
    f64::from_bits(
        if val & HIGHEST_BIT != 0 {
            val ^ HIGHEST_BIT
        } else {
            !val
        }
    )
}
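// Quick intuition for the mapping above (not part of the original source): for
// IEEE-754 doubles of equal sign, the raw bit pattern already orders like the value,
// so flipping the sign bit of positives lifts them above all negatives, and negating
// every bit of negatives reverses their order so "more negative" maps to a smaller u64.
//
//     assert!(f64_to_u64(-2.0) < f64_to_u64(-1.0));
//     assert!(f64_to_u64(-1.0) < f64_to_u64(0.0));
//     assert!(f64_to_u64(0.0) < f64_to_u64(1.5));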

#[cfg(test)]
pub(crate) mod test {

    pub use super::serialize::test::fixed_size_test;
    use super::{compute_num_bits, i64_to_u64, u64_to_i64};
    use super::{compute_num_bits, i64_to_u64, u64_to_i64, f64_to_u64, u64_to_f64};
    use std::f64;

    fn test_i64_converter_helper(val: i64) {
        assert_eq!(u64_to_i64(i64_to_u64(val)), val);
    }

    fn test_f64_converter_helper(val: f64) {
        assert_eq!(u64_to_f64(f64_to_u64(val)), val);
    }

    #[test]
    fn test_i64_converter() {
@@ -123,6 +160,28 @@ pub(crate) mod test {
        }
    }

    #[test]
    fn test_f64_converter() {
        test_f64_converter_helper(f64::INFINITY);
        test_f64_converter_helper(f64::NEG_INFINITY);
        test_f64_converter_helper(0.0);
        test_f64_converter_helper(-0.0);
        test_f64_converter_helper(1.0);
        test_f64_converter_helper(-1.0);
    }

    #[test]
    fn test_f64_order() {
        assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY)).contains(&f64_to_u64(f64::NAN))); // nan is not a number
        assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); // same exponent, different mantissa
        assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); // same mantissa, different exponent
        assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); // different exponent and mantissa
        assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
        assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
        assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
        assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
    }

    #[test]
    fn test_compute_num_bits() {
        assert_eq!(compute_num_bits(1), 1u8);
@@ -134,4 +193,11 @@ pub(crate) mod test {
        assert_eq!(compute_num_bits(256), 9u8);
        assert_eq!(compute_num_bits(5_000_000_000), 33u8);
    }

    #[test]
    fn test_max_doc() {
        // this is the first time I write a unit test for a constant.
        assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
        assert!((super::MAX_DOC_LIMIT as i32) < 0);
    }
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::common::Endianness;
|
||||
use crate::common::VInt;
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use common::Endianness;
|
||||
use common::VInt;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
@@ -102,6 +102,19 @@ impl FixedSize for i64 {
|
||||
const SIZE_IN_BYTES: usize = 8;
|
||||
}
|
||||
|
||||
impl BinarySerializable for f64 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_f64::<Endianness>(*self)
|
||||
}
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
reader.read_f64::<Endianness>()
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for f64 {
|
||||
const SIZE_IN_BYTES: usize = 8;
|
||||
}
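// Round-trip sketch for the new `f64` support (assumes only the `BinarySerializable`
// and `FixedSize` traits of this module):
//
//     let mut buffer: Vec<u8> = Vec::new();
//     3.25f64.serialize(&mut buffer).unwrap();
//     assert_eq!(buffer.len(), <f64 as FixedSize>::SIZE_IN_BYTES);
//     assert_eq!(f64::deserialize(&mut &buffer[..]).unwrap(), 3.25f64);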
|
||||
|
||||
impl BinarySerializable for u8 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_u8(*self)
|
||||
@@ -136,7 +149,7 @@ impl BinarySerializable for String {
|
||||
pub mod test {
|
||||
|
||||
use super::*;
|
||||
use common::VInt;
|
||||
use crate::common::VInt;
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
@@ -172,6 +185,11 @@ pub mod test {
|
||||
fixed_size_test::<i64>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_f64() {
|
||||
fixed_size_test::<f64>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_u64() {
|
||||
fixed_size_test::<u64>();
|
||||
|
||||
@@ -30,16 +30,16 @@ pub fn serialize_vint_u32(val: u32) -> (u64, usize) {
|
||||
let val = u64::from(val);
|
||||
const STOP_BIT: u64 = 128u64;
|
||||
match val {
|
||||
0...STOP_1 => (val | STOP_BIT, 1),
|
||||
START_2...STOP_2 => (
|
||||
0..=STOP_1 => (val | STOP_BIT, 1),
|
||||
START_2..=STOP_2 => (
|
||||
(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
|
||||
2,
|
||||
),
|
||||
START_3...STOP_3 => (
|
||||
START_3..=STOP_3 => (
|
||||
(val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
|
||||
3,
|
||||
),
|
||||
START_4...STOP_4 => (
|
||||
START_4..=STOP_4 => (
|
||||
(val & MASK_1)
|
||||
| ((val & MASK_2) << 1)
|
||||
| ((val & MASK_3) << 2)
|
||||
@@ -171,8 +171,8 @@ mod tests {
|
||||
|
||||
use super::serialize_vint_u32;
|
||||
use super::VInt;
|
||||
use crate::common::BinarySerializable;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use common::BinarySerializable;
|
||||
|
||||
fn aux_test_vint(val: u64) {
|
||||
let mut v = [14u8; 10];
|
||||
|
||||
@@ -1,6 +1,6 @@
use crate::Result;
use crossbeam::channel;
use scoped_pool::{Pool, ThreadConfig};
use Result;

/// Search executor, whether search requests are single-threaded or multithreaded.
///
|
||||
|
||||
@@ -1,43 +1,43 @@
|
||||
use super::segment::create_segment;
|
||||
use super::segment::Segment;
|
||||
use core::Executor;
|
||||
use core::IndexMeta;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::META_FILEPATH;
|
||||
use directory::ManagedDirectory;
|
||||
use crate::core::Executor;
|
||||
use crate::core::IndexMeta;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::SegmentMetaInventory;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use directory::INDEX_WRITER_LOCK;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use error::DataCorruption;
|
||||
use error::TantivyError;
|
||||
use indexer::index_writer::open_index_writer;
|
||||
use indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use crate::directory::MmapDirectory;
|
||||
use crate::directory::INDEX_WRITER_LOCK;
|
||||
use crate::directory::{Directory, RAMDirectory};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use crate::indexer::segment_updater::save_new_metas;
|
||||
use crate::reader::IndexReader;
|
||||
use crate::reader::IndexReaderBuilder;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::Schema;
|
||||
use crate::tokenizer::BoxedTokenizer;
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use crate::IndexWriter;
|
||||
use crate::Result;
|
||||
use num_cpus;
|
||||
use reader::IndexReader;
|
||||
use reader::IndexReaderBuilder;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
#[cfg(feature = "mmap")]
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::TokenizerManager;
|
||||
use IndexWriter;
|
||||
use Result;
|
||||
|
||||
fn load_metas(directory: &Directory) -> Result<IndexMeta> {
|
||||
fn load_metas(directory: &dyn Directory, inventory: &SegmentMetaInventory) -> Result<IndexMeta> {
|
||||
let meta_data = directory.atomic_read(&META_FILEPATH)?;
|
||||
let meta_string = String::from_utf8_lossy(&meta_data);
|
||||
serde_json::from_str(&meta_string)
|
||||
IndexMeta::deserialize(&meta_string, &inventory)
|
||||
.map_err(|e| {
|
||||
DataCorruption::new(
|
||||
META_FILEPATH.clone(),
|
||||
META_FILEPATH.to_path_buf(),
|
||||
format!("Meta file cannot be deserialized. {:?}.", e),
|
||||
)
|
||||
})
|
||||
@@ -51,6 +51,7 @@ pub struct Index {
|
||||
schema: Schema,
|
||||
executor: Arc<Executor>,
|
||||
tokenizers: TokenizerManager,
|
||||
inventory: SegmentMetaInventory,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
@@ -147,19 +148,23 @@ impl Index {
|
||||
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
Index::create_from_metas(directory, &metas, SegmentMetaInventory::default())
|
||||
}
|
||||
|
||||
/// Creates a new index given a directory and an `IndexMeta`.
|
||||
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
|
||||
fn create_from_metas(
|
||||
directory: ManagedDirectory,
|
||||
metas: &IndexMeta,
|
||||
inventory: SegmentMetaInventory,
|
||||
) -> Result<Index> {
|
||||
let schema = metas.schema.clone();
|
||||
let index = Index {
|
||||
Ok(Index {
|
||||
directory,
|
||||
schema,
|
||||
tokenizers: TokenizerManager::default(),
|
||||
executor: Arc::new(Executor::single_thread()),
|
||||
};
|
||||
Ok(index)
|
||||
inventory,
|
||||
})
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
@@ -168,11 +173,11 @@ impl Index {
|
||||
}
|
||||
|
||||
/// Helper to access the tokenizer associated to a specific field.
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<dyn BoxedTokenizer>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let tokenizer_manager: &TokenizerManager = self.tokenizers();
|
||||
let tokenizer_name_opt: Option<Box<BoxedTokenizer>> = match field_type {
|
||||
let tokenizer_name_opt: Option<Box<dyn BoxedTokenizer>> = match field_type {
|
||||
FieldType::Str(text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
|
||||
@@ -211,16 +216,21 @@ impl Index {
|
||||
Index::open(mmap_directory)
|
||||
}
|
||||
|
||||
pub(crate) fn inventory(&self) -> &SegmentMetaInventory {
|
||||
&self.inventory
|
||||
}
|
||||
|
||||
/// Open the index using the provided directory
|
||||
pub fn open<D: Directory>(directory: D) -> Result<Index> {
|
||||
let directory = ManagedDirectory::wrap(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
let inventory = SegmentMetaInventory::default();
|
||||
let metas = load_metas(&directory, &inventory)?;
|
||||
Index::create_from_metas(directory, &metas, inventory)
|
||||
}
|
||||
|
||||
/// Reads the index meta file from the directory.
|
||||
pub fn load_metas(&self) -> Result<IndexMeta> {
|
||||
load_metas(self.directory())
|
||||
load_metas(self.directory(), &self.inventory)
|
||||
}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
@@ -264,7 +274,7 @@ impl Index {
|
||||
)
|
||||
})?;
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
open_index_writer(
|
||||
IndexWriter::new(
|
||||
self,
|
||||
num_threads,
|
||||
heap_size_in_bytes_per_thread,
|
||||
@@ -314,7 +324,9 @@ impl Index {
|
||||
|
||||
/// Creates a new segment.
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
let segment_meta = SegmentMeta::new(SegmentId::generate_random(), 0);
|
||||
let segment_meta = self
|
||||
.inventory
|
||||
.new_segment_meta(SegmentId::generate_random(), 0);
|
||||
self.segment(segment_meta)
|
||||
}
|
||||
|
||||
@@ -339,30 +351,28 @@ impl Index {
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.map(SegmentMeta::id)
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Index {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "Index({:?})", self.directory)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use directory::RAMDirectory;
|
||||
use schema::Field;
|
||||
use schema::{Schema, INDEXED, TEXT};
|
||||
use std::path::PathBuf;
|
||||
use crate::directory::RAMDirectory;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::{Schema, INDEXED, TEXT};
|
||||
use crate::Index;
|
||||
use crate::IndexReader;
|
||||
use crate::IndexWriter;
|
||||
use crate::ReloadPolicy;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tempdir::TempDir;
|
||||
use Index;
|
||||
use IndexReader;
|
||||
use IndexWriter;
|
||||
use ReloadPolicy;
|
||||
|
||||
#[test]
|
||||
fn test_indexer_for_field() {
|
||||
@@ -444,61 +454,69 @@ mod tests {
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
#[cfg(feature = "mmap")]
|
||||
mod mmap_specific {
|
||||
|
||||
#[test]
|
||||
fn test_index_manual_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
writer.add_document(doc!(field=>1u64));
|
||||
writer.commit().unwrap();
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
reader.reload().unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 1);
|
||||
}
|
||||
use super::*;
|
||||
use std::path::PathBuf;
|
||||
use tempdir::TempDir;
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_different_directories() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
|
||||
let reader = read_index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_manual_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
writer.add_document(doc!(field=>1u64));
|
||||
writer.commit().unwrap();
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
reader.reload().unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_different_directories() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
|
||||
let reader = read_index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
}
|
||||
|
||||
fn test_index_on_commit_reload_policy_aux(
|
||||
@@ -530,4 +548,38 @@ mod tests {
|
||||
}
|
||||
assert_eq!(count, 2);
|
||||
}
|
||||
|
||||
// This test will not pass on windows, because windows
|
||||
// prevent deleting files that are MMapped.
|
||||
#[cfg(not(target_os = "windows"))]
|
||||
#[test]
|
||||
fn garbage_collect_works_as_intended() {
|
||||
let directory = RAMDirectory::create();
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create(directory.clone(), schema).unwrap();
|
||||
|
||||
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
|
||||
for i in 0u64..8_000u64 {
|
||||
writer.add_document(doc!(field => i));
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
let mem_right_after_commit = directory.total_mem_usage();
|
||||
thread::sleep(Duration::from_millis(1_000));
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(reader.searcher().num_docs(), 8_000);
|
||||
writer.wait_merging_threads().unwrap();
|
||||
let mem_right_after_merge_finished = directory.total_mem_usage();
|
||||
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 8_000);
|
||||
assert!(mem_right_after_merge_finished < mem_right_after_commit);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,7 +1,185 @@
|
||||
use core::SegmentMeta;
|
||||
use schema::Schema;
|
||||
use super::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use census::{Inventory, TrackedObject};
|
||||
use serde;
|
||||
use serde_json;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
num_deleted_docs: u32,
|
||||
opstamp: Opstamp,
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct SegmentMetaInventory {
|
||||
inventory: Inventory<InnerSegmentMeta>,
|
||||
}
|
||||
|
||||
impl SegmentMetaInventory {
|
||||
/// Lists all living `SegmentMeta` object at the time of the call.
|
||||
pub fn all(&self) -> Vec<SegmentMeta> {
|
||||
self.inventory
|
||||
.list()
|
||||
.into_iter()
|
||||
.map(SegmentMeta::from)
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn new_segment_meta(&self, segment_id: SegmentId, max_doc: u32) -> SegmentMeta {
|
||||
let inner = InnerSegmentMeta {
|
||||
segment_id,
|
||||
max_doc,
|
||||
deletes: None,
|
||||
};
|
||||
SegmentMeta::from(self.inventory.track(inner))
|
||||
}
|
||||
}
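// Usage sketch for the inventory above (not part of the diff): metas created through
// the inventory stay visible in `all()` for as long as they are referenced somewhere.
//
//     let inventory = SegmentMetaInventory::default();
//     let meta = inventory.new_segment_meta(SegmentId::generate_random(), 0u32);
//     assert_eq!(inventory.all().len(), 1);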
|
||||
|
||||
/// `SegmentMeta` contains simple meta information about a segment.
|
||||
///
|
||||
/// For instance the number of docs it contains,
|
||||
/// how many are deleted, etc.
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentMeta {
|
||||
tracked: TrackedObject<InnerSegmentMeta>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentMeta {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
self.tracked.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::Serialize for SegmentMeta {
|
||||
fn serialize<S>(
|
||||
&self,
|
||||
serializer: S,
|
||||
) -> Result<<S as serde::Serializer>::Ok, <S as serde::Serializer>::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
self.tracked.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TrackedObject<InnerSegmentMeta>> for SegmentMeta {
|
||||
fn from(tracked: TrackedObject<InnerSegmentMeta>) -> SegmentMeta {
|
||||
SegmentMeta { tracked }
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentMeta {
|
||||
// Creates a new `SegmentMeta` object.
|
||||
|
||||
/// Returns the segment id.
|
||||
pub fn id(&self) -> SegmentId {
|
||||
self.tracked.segment_id
|
||||
}
|
||||
|
||||
/// Returns the number of deleted documents.
|
||||
pub fn num_deleted_docs(&self) -> u32 {
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.num_deleted_docs)
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
/// Returns the list of files that
|
||||
/// are required for the segment meta.
|
||||
///
|
||||
/// This is useful as the way tantivy removes files
|
||||
/// is by removing all files that have been created by tantivy
|
||||
/// and are not used by any segment anymore.
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
SegmentComponent::iterator()
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
}
|
||||
|
||||
/// Returns the relative path of a component of our segment.
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
/// associated to a segment component.
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
path.push_str(&*match component {
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSITIONSSKIP => ".posidx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
});
|
||||
PathBuf::from(path)
|
||||
}
|
||||
|
||||
/// Return the highest doc id + 1
|
||||
///
|
||||
/// If there are no deletes, then num_docs = max_docs
|
||||
/// and all the doc ids contains in this segment
|
||||
/// are exactly (0..max_doc).
|
||||
pub fn max_doc(&self) -> u32 {
|
||||
self.tracked.max_doc
|
||||
}
|
||||
|
||||
/// Return the number of documents in the segment.
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
self.max_doc() - self.num_deleted_docs()
|
||||
}
|
||||
|
||||
/// Returns the `Opstamp` of the last delete operation
|
||||
/// taken in account in this segment.
|
||||
pub fn delete_opstamp(&self) -> Option<Opstamp> {
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.opstamp)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment meta contains
|
||||
/// delete information.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.num_deleted_docs() > 0
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
|
||||
let delete_meta = DeleteMeta {
|
||||
num_deleted_docs,
|
||||
opstamp,
|
||||
};
|
||||
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
||||
segment_id: inner_meta.segment_id,
|
||||
max_doc: inner_meta.max_doc,
|
||||
deletes: Some(delete_meta),
|
||||
});
|
||||
SegmentMeta { tracked }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct InnerSegmentMeta {
|
||||
segment_id: SegmentId,
|
||||
max_doc: u32,
|
||||
deletes: Option<DeleteMeta>,
|
||||
}
|
||||
|
||||
impl InnerSegmentMeta {
|
||||
pub fn track(self, inventory: &SegmentMetaInventory) -> SegmentMeta {
|
||||
SegmentMeta {
|
||||
tracked: inventory.inventory.track(self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
@@ -11,16 +189,53 @@ use std::fmt;
|
||||
/// * the index `docstamp`
|
||||
/// * the schema
|
||||
///
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize)]
|
||||
pub struct IndexMeta {
|
||||
/// List of `SegmentMeta` information associated with each finalized segment of the index.
|
||||
pub segments: Vec<SegmentMeta>,
|
||||
/// Index `Schema`
|
||||
pub schema: Schema,
|
||||
pub opstamp: u64,
|
||||
/// Opstamp associated to the last `commit` operation.
|
||||
pub opstamp: Opstamp,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// Payload associated to the last commit.
|
||||
///
|
||||
/// Upon commit, clients can optionally add a small `String` payload to their commit
|
||||
/// to help identify this commit.
|
||||
/// This payload is entirely unused by tantivy.
|
||||
pub payload: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct UntrackedIndexMeta {
|
||||
pub segments: Vec<InnerSegmentMeta>,
|
||||
pub schema: Schema,
|
||||
pub opstamp: Opstamp,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub payload: Option<String>,
|
||||
}
|
||||
|
||||
impl UntrackedIndexMeta {
|
||||
pub fn track(self, inventory: &SegmentMetaInventory) -> IndexMeta {
|
||||
IndexMeta {
|
||||
segments: self
|
||||
.segments
|
||||
.into_iter()
|
||||
.map(|inner_seg_meta| inner_seg_meta.track(inventory))
|
||||
.collect::<Vec<SegmentMeta>>(),
|
||||
schema: self.schema,
|
||||
opstamp: self.opstamp,
|
||||
payload: self.payload,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexMeta {
|
||||
/// Create an `IndexMeta` object representing a brand new `Index`
/// with the given schema.
///
/// This new index does not contain any segments.
/// Its opstamp will be the value `0u64`.
|
||||
pub fn with_schema(schema: Schema) -> IndexMeta {
|
||||
IndexMeta {
|
||||
segments: vec![],
|
||||
@@ -29,10 +244,18 @@ impl IndexMeta {
|
||||
payload: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn deserialize(
|
||||
meta_json: &str,
|
||||
inventory: &SegmentMetaInventory,
|
||||
) -> serde_json::Result<IndexMeta> {
|
||||
let untracked_meta_json: UntrackedIndexMeta = serde_json::from_str(meta_json)?;
|
||||
Ok(untracked_meta_json.track(inventory))
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for IndexMeta {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{}",
|
||||
@@ -46,7 +269,7 @@ impl fmt::Debug for IndexMeta {
|
||||
mod tests {
|
||||
|
||||
use super::IndexMeta;
|
||||
use schema::{Schema, TEXT};
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use serde_json;
|
||||
|
||||
#[test]
|
||||
|
||||
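Aside: the `UntrackedIndexMeta::track` / `SegmentMetaInventory` code above follows a "deserialize untracked, then register with an inventory" pattern, so that every live `SegmentMeta` can later be listed (for instance when garbage-collecting segment files). A minimal self-contained sketch of that idea, with hypothetical stand-in types rather than tantivy's actual API (the real `census`-backed inventory also un-registers an object once its last handle is dropped, which is omitted here):

```rust
use std::sync::{Arc, Mutex};

// Hypothetical stand-ins: `InnerMeta` plays the role of InnerSegmentMeta,
// `Inventory` the role of the census-backed SegmentMetaInventory.
#[derive(Clone, Debug)]
struct InnerMeta {
    max_doc: u32,
}

#[derive(Default)]
struct Inventory {
    live: Mutex<Vec<Arc<InnerMeta>>>,
}

impl Inventory {
    // Registering returns a handle; the inventory keeps a reference so the
    // object stays listed as "alive" while it is tracked here.
    fn track(&self, inner: InnerMeta) -> Arc<InnerMeta> {
        let tracked = Arc::new(inner);
        self.live.lock().unwrap().push(Arc::clone(&tracked));
        tracked
    }

    // Lists all objects registered so far.
    fn list(&self) -> Vec<Arc<InnerMeta>> {
        self.live.lock().unwrap().clone()
    }
}

fn main() {
    let inventory = Inventory::default();
    let tracked = inventory.track(InnerMeta { max_doc: 42 });
    assert_eq!(tracked.max_doc, 42);
    assert_eq!(inventory.list().len(), 1);
}
```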
@@ -1,13 +1,13 @@
use common::BinarySerializable;
use directory::ReadOnlySource;
use crate::common::BinarySerializable;
use crate::directory::ReadOnlySource;
use crate::positions::PositionReader;
use crate::postings::TermInfo;
use crate::postings::{BlockSegmentPostings, SegmentPostings};
use crate::schema::FieldType;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::termdict::TermDictionary;
use owned_read::OwnedRead;
use positions::PositionReader;
use postings::TermInfo;
use postings::{BlockSegmentPostings, SegmentPostings};
use schema::FieldType;
use schema::IndexRecordOption;
use schema::Term;
use termdict::TermDictionary;

/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.
@@ -32,7 +32,7 @@ pub struct InvertedIndexReader {
}

impl InvertedIndexReader {
    #[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symetry
    #[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symmetry
    pub(crate) fn new(
        termdict: TermDictionary,
        postings_source: ReadOnlySource,

@@ -6,33 +6,29 @@ pub mod searcher;
mod segment;
mod segment_component;
mod segment_id;
mod segment_meta;
mod segment_reader;

pub use self::executor::Executor;
pub use self::index::Index;
pub use self::index_meta::IndexMeta;
pub use self::index_meta::{IndexMeta, SegmentMeta, SegmentMetaInventory};
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::segment::Segment;
pub use self::segment::SerializableSegment;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_meta::SegmentMeta;
pub use self::segment_reader::SegmentReader;

use std::path::PathBuf;
use once_cell::sync::Lazy;
use std::path::Path;

lazy_static! {
/// The meta file contains all the information about the list of segments and the schema
/// of the index.
pub static META_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new("meta.json"));

    /// The meta file contains all the information about the list of segments and the schema
    /// of the index.
    pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");

    /// The managed file contains a list of files that were created by the tantivy
    /// and will therefore be garbage collected when they are deemed useless by tantivy.
    ///
    /// Removing this file is safe, but will prevent the garbage collection of all of the files that
    /// are currently in the directory
    pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
}
/// The managed file contains a list of files that were created by the tantivy
/// and will therefore be garbage collected when they are deemed useless by tantivy.
///
/// Removing this file is safe, but will prevent the garbage collection of all of the files that
/// are currently in the directory
pub static MANAGED_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new(".managed.json"));

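The hunk above replaces the `lazy_static!` block with `once_cell::sync::Lazy` statics. A minimal, self-contained example of the new pattern, mirroring the `META_FILEPATH` constant shown in the diff (requires the `once_cell` crate):

```rust
use once_cell::sync::Lazy;
use std::path::Path;

// A lazily initialised static: the closure runs at most once, on first access.
static META_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new("meta.json"));

fn main() {
    // Dereferencing the Lazy triggers the initialisation closure if needed.
    assert_eq!(*META_FILEPATH, Path::new("meta.json"));
}
```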
@@ -1,26 +1,26 @@
use collector::Collector;
use collector::SegmentCollector;
use core::Executor;
use core::InvertedIndexReader;
use core::SegmentReader;
use query::Query;
use query::Scorer;
use query::Weight;
use schema::Document;
use schema::Schema;
use schema::{Field, Term};
use space_usage::SearcherSpaceUsage;
use crate::collector::Collector;
use crate::collector::SegmentCollector;
use crate::core::Executor;
use crate::core::InvertedIndexReader;
use crate::core::SegmentReader;
use crate::query::Query;
use crate::query::Scorer;
use crate::query::Weight;
use crate::schema::Document;
use crate::schema::Schema;
use crate::schema::{Field, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::StoreReader;
use crate::termdict::TermMerger;
use crate::DocAddress;
use crate::Index;
use crate::Result;
use std::fmt;
use std::sync::Arc;
use store::StoreReader;
use termdict::TermMerger;
use DocAddress;
use Index;
use Result;

fn collect_segment<C: Collector>(
    collector: &C,
    weight: &Weight,
    weight: &dyn Weight,
    segment_ord: u32,
    segment_reader: &SegmentReader,
) -> Result<C::Fruit> {
@@ -28,7 +28,7 @@ fn collect_segment<C: Collector>(
    let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
    if let Some(delete_bitset) = segment_reader.delete_bitset() {
        scorer.for_each(&mut |doc, score| {
            if !delete_bitset.is_deleted(doc) {
            if delete_bitset.is_alive(doc) {
                segment_collector.collect(doc, score);
            }
        });
@@ -59,7 +59,7 @@ impl Searcher {
    ) -> Searcher {
        let store_readers = segment_readers
            .iter()
            .map(|segment_reader| segment_reader.get_store_reader())
            .map(SegmentReader::get_store_reader)
            .collect();
        Searcher {
            schema,
@@ -132,7 +132,7 @@ impl Searcher {
    ///
    /// Finally, the Collector merges each of the child collectors into itself for result usability
    /// by the caller.
    pub fn search<C: Collector>(&self, query: &Query, collector: &C) -> Result<C::Fruit> {
    pub fn search<C: Collector>(&self, query: &dyn Query, collector: &C) -> Result<C::Fruit> {
        let executor = self.index.search_executor();
        self.search_with_executor(query, collector, executor)
    }
@@ -151,7 +151,7 @@ impl Searcher {
    /// hurt it. It will however, decrease the average response time.
    pub fn search_with_executor<C: Collector>(
        &self,
        query: &Query,
        query: &dyn Query,
        collector: &C,
        executor: &Executor,
    ) -> Result<C::Fruit> {
@@ -203,7 +203,7 @@ impl FieldSearcher {

    /// Returns a Stream over all of the sorted unique terms
    /// for the given field.
    pub fn terms(&self) -> TermMerger {
    pub fn terms(&self) -> TermMerger<'_> {
        let term_streamers: Vec<_> = self
            .inv_index_readers
            .iter()
@@ -214,11 +214,11 @@ impl FieldSearcher {
}

impl fmt::Debug for Searcher {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let segment_ids = self
            .segment_readers
            .iter()
            .map(|segment_reader| segment_reader.segment_id())
            .map(SegmentReader::segment_id)
            .collect::<Vec<_>>();
        write!(f, "Searcher({:?})", segment_ids)
    }

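Many hunks in this compare only switch bare trait objects to the explicit Rust 2018 `dyn` syntax (`&Query` becomes `&dyn Query`, `Box<Directory>` becomes `Box<dyn Directory>`), which silences the bare-trait-object deprecation warning without changing behaviour. An illustrative, self-contained example of the same change with a toy trait (not tantivy's API):

```rust
// Bare `&Scorer` / `Box<Scorer>` still compile on the 2015 edition but warn;
// spelling out `dyn` makes the dynamic dispatch explicit.
trait Scorer {
    fn score(&self) -> f32;
}

struct Constant(f32);

impl Scorer for Constant {
    fn score(&self) -> f32 {
        self.0
    }
}

// Takes trait objects behind boxes and returns the best score seen.
fn best_score(scorers: &[Box<dyn Scorer>]) -> f32 {
    scorers.iter().map(|s| s.score()).fold(f32::MIN, f32::max)
}

fn main() {
    let scorers: Vec<Box<dyn Scorer>> = vec![Box::new(Constant(1.0)), Box::new(Constant(2.5))];
    assert_eq!(best_score(&scorers), 2.5);
}
```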
@@ -1,16 +1,17 @@
use super::SegmentComponent;
use core::Index;
use core::SegmentId;
use core::SegmentMeta;
use directory::error::{OpenReadError, OpenWriteError};
use directory::Directory;
use directory::{ReadOnlySource, WritePtr};
use indexer::segment_serializer::SegmentSerializer;
use schema::Schema;
use crate::core::Index;
use crate::core::SegmentId;
use crate::core::SegmentMeta;
use crate::directory::error::{OpenReadError, OpenWriteError};
use crate::directory::Directory;
use crate::directory::{ReadOnlySource, WritePtr};
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::schema::Schema;
use crate::Opstamp;
use crate::Result;
use std::fmt;
use std::path::PathBuf;
use std::result;
use Result;

/// A segment is a piece of the index.
#[derive(Clone)]
@@ -20,7 +21,7 @@ pub struct Segment {
}

impl fmt::Debug for Segment {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "Segment({:?})", self.id().uuid_string())
    }
}
@@ -50,7 +51,7 @@ impl Segment {
    }

    #[doc(hidden)]
    pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> Segment {
    pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
        Segment {
            index: self.index,
            meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),

@@ -2,6 +2,8 @@ use std::cmp::{Ord, Ordering};
|
||||
use std::fmt;
|
||||
use uuid::Uuid;
|
||||
|
||||
#[cfg(test)]
|
||||
use once_cell::sync::Lazy;
|
||||
#[cfg(test)]
|
||||
use std::sync::atomic;
|
||||
|
||||
@@ -17,10 +19,10 @@ use std::sync::atomic;
|
||||
pub struct SegmentId(Uuid);
|
||||
|
||||
#[cfg(test)]
|
||||
lazy_static! {
|
||||
static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default();
|
||||
static ref ZERO_ARRAY: [u8; 8] = [0u8; 8];
|
||||
}
|
||||
static AUTO_INC_COUNTER: Lazy<atomic::AtomicUsize> = Lazy::new(|| atomic::AtomicUsize::default());
|
||||
|
||||
#[cfg(test)]
|
||||
const ZERO_ARRAY: [u8; 8] = [0u8; 8];
|
||||
|
||||
// During tests, we generate the segment id in a autoincrement manner
|
||||
// for consistency of segment id between run.
|
||||
@@ -30,7 +32,7 @@ lazy_static! {
|
||||
#[cfg(test)]
|
||||
fn create_uuid() -> Uuid {
|
||||
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
|
||||
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*ZERO_ARRAY).unwrap()
|
||||
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY).unwrap()
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
@@ -62,7 +64,7 @@ impl SegmentId {
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "Seg({:?})", self.short_uuid_string())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,174 +0,0 @@
|
||||
use super::SegmentComponent;
|
||||
use census::{Inventory, TrackedObject};
|
||||
use core::SegmentId;
|
||||
use serde;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
|
||||
lazy_static! {
|
||||
static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() };
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
num_deleted_docs: u32,
|
||||
opstamp: u64,
|
||||
}
|
||||
|
||||
/// `SegmentMeta` contains simple meta information about a segment.
|
||||
///
|
||||
/// For instance the number of docs it contains,
|
||||
/// how many are deleted, etc.
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentMeta {
|
||||
tracked: TrackedObject<InnerSegmentMeta>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentMeta {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
self.tracked.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::Serialize for SegmentMeta {
|
||||
fn serialize<S>(
|
||||
&self,
|
||||
serializer: S,
|
||||
) -> Result<<S as serde::Serializer>::Ok, <S as serde::Serializer>::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
self.tracked.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> serde::Deserialize<'a> for SegmentMeta {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, <D as serde::Deserializer<'a>>::Error>
|
||||
where
|
||||
D: serde::Deserializer<'a>,
|
||||
{
|
||||
let inner = InnerSegmentMeta::deserialize(deserializer)?;
|
||||
let tracked = INVENTORY.track(inner);
|
||||
Ok(SegmentMeta { tracked })
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentMeta {
|
||||
/// Lists all living `SegmentMeta` object at the time of the call.
|
||||
pub fn all() -> Vec<SegmentMeta> {
|
||||
INVENTORY
|
||||
.list()
|
||||
.into_iter()
|
||||
.map(|inner| SegmentMeta { tracked: inner })
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
/// Creates a new `SegmentMeta` object.
|
||||
#[doc(hidden)]
|
||||
pub fn new(segment_id: SegmentId, max_doc: u32) -> SegmentMeta {
|
||||
let inner = InnerSegmentMeta {
|
||||
segment_id,
|
||||
max_doc,
|
||||
deletes: None,
|
||||
};
|
||||
SegmentMeta {
|
||||
tracked: INVENTORY.track(inner),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the segment id.
|
||||
pub fn id(&self) -> SegmentId {
|
||||
self.tracked.segment_id
|
||||
}
|
||||
|
||||
/// Returns the number of deleted documents.
|
||||
pub fn num_deleted_docs(&self) -> u32 {
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.num_deleted_docs)
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
/// Returns the list of files that
|
||||
/// are required for the segment meta.
|
||||
///
|
||||
/// This is useful as the way tantivy removes files
|
||||
/// is by removing all files that have been created by tantivy
|
||||
/// and are not used by any segment anymore.
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
SegmentComponent::iterator()
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
}
|
||||
|
||||
/// Returns the relative path of a component of our segment.
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
/// associated to a segment component.
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
path.push_str(&*match component {
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSITIONSSKIP => ".posidx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
});
|
||||
PathBuf::from(path)
|
||||
}
|
||||
|
||||
/// Return the highest doc id + 1
|
||||
///
|
||||
/// If there are no deletes, then num_docs = max_docs
|
||||
/// and all the doc ids contains in this segment
|
||||
/// are exactly (0..max_doc).
|
||||
pub fn max_doc(&self) -> u32 {
|
||||
self.tracked.max_doc
|
||||
}
|
||||
|
||||
/// Return the number of documents in the segment.
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
self.max_doc() - self.num_deleted_docs()
|
||||
}
|
||||
|
||||
/// Returns the opstamp of the last delete operation
|
||||
/// taken in account in this segment.
|
||||
pub fn delete_opstamp(&self) -> Option<u64> {
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.opstamp)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment meta contains
|
||||
/// delete information.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.num_deleted_docs() > 0
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> SegmentMeta {
|
||||
let delete_meta = DeleteMeta {
|
||||
num_deleted_docs,
|
||||
opstamp,
|
||||
};
|
||||
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
||||
segment_id: inner_meta.segment_id,
|
||||
max_doc: inner_meta.max_doc,
|
||||
deletes: Some(delete_meta),
|
||||
});
|
||||
SegmentMeta { tracked }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct InnerSegmentMeta {
|
||||
segment_id: SegmentId,
|
||||
max_doc: u32,
|
||||
deletes: Option<DeleteMeta>,
|
||||
}
|
||||
@@ -1,30 +1,27 @@
|
||||
use common::CompositeFile;
|
||||
use common::HasLen;
|
||||
use core::InvertedIndexReader;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use directory::ReadOnlySource;
|
||||
use error::TantivyError;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use schema::Cardinality;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use space_usage::SegmentSpaceUsage;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::common::HasLen;
|
||||
use crate::core::InvertedIndexReader;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::FacetReader;
|
||||
use crate::fastfield::FastFieldReaders;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::Schema;
|
||||
use crate::space_usage::SegmentSpaceUsage;
|
||||
use crate::store::StoreReader;
|
||||
use crate::termdict::TermDictionary;
|
||||
use crate::DocId;
|
||||
use crate::Result;
|
||||
use fail::fail_point;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use store::StoreReader;
|
||||
use termdict::TermDictionary;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
/// Entry point to access all of the datastructures of the `Segment`
|
||||
///
|
||||
@@ -51,7 +48,7 @@ pub struct SegmentReader {
|
||||
postings_composite: CompositeFile,
|
||||
positions_composite: CompositeFile,
|
||||
positions_idx_composite: CompositeFile,
|
||||
fast_fields_composite: CompositeFile,
|
||||
fast_fields_readers: Arc<FastFieldReaders>,
|
||||
fieldnorms_composite: CompositeFile,
|
||||
|
||||
store_source: ReadOnlySource,
|
||||
@@ -105,93 +102,21 @@ impl SegmentReader {
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn fast_field_reader<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
|
||||
{
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
|
||||
Ok(FastFieldReader::open(ff_source))
|
||||
} else {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
|
||||
/// May panick if the field is not a multivalued fastfield of the type `Item`.
|
||||
pub fn multi_fast_field_reader<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
|
||||
{
|
||||
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
|
||||
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
|
||||
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Bytes => {}
|
||||
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
|
||||
}
|
||||
let idx_reader = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let values = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
Ok(BytesFastFieldReader::open(idx_reader, values))
|
||||
pub fn fast_fields(&self) -> &FastFieldReaders {
|
||||
&self.fast_fields_readers
|
||||
}
|
||||
|
||||
/// Accessor to the `FacetReader` associated to a given `Field`.
|
||||
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
|
||||
pub fn facet_reader(&self, field: Field) -> Option<FacetReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if field_entry.field_type() != &FieldType::HierarchicalFacet {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"The field {:?} is not a \
|
||||
hierarchical facet.",
|
||||
field_entry
|
||||
)));
|
||||
return None;
|
||||
}
|
||||
let term_ords_reader = self.multi_fast_field_reader(field)?;
|
||||
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"The field \"{}\" is a hierarchical \
|
||||
but this segment does not seem to have the field term \
|
||||
dictionary.",
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
let term_ords_reader = self.fast_fields().u64s(field)?;
|
||||
let termdict_source = self.termdict_composite.open_read(field)?;
|
||||
let termdict = TermDictionary::from_source(&termdict_source);
|
||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||
Ok(facet_reader)
|
||||
Some(facet_reader)
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
@@ -247,8 +172,12 @@ impl SegmentReader {
|
||||
}
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_field_readers =
|
||||
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
|
||||
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
@@ -260,14 +189,13 @@ impl SegmentReader {
|
||||
None
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
max_doc: segment.meta().max_doc(),
|
||||
num_docs: segment.meta().num_docs(),
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_composite,
|
||||
fast_fields_readers: fast_field_readers,
|
||||
fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_source,
|
||||
@@ -316,10 +244,9 @@ impl SegmentReader {
|
||||
|
||||
let postings_source = postings_source_opt.unwrap();
|
||||
|
||||
let termdict_source = self
|
||||
.termdict_composite
|
||||
.open_read(field)
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
let termdict_source = self.termdict_composite.open_read(field).expect(
|
||||
"Failed to open field term dictionary in composite file. Is the field indexed?",
|
||||
);
|
||||
|
||||
let positions_source = self
|
||||
.positions_composite
|
||||
@@ -369,7 +296,7 @@ impl SegmentReader {
|
||||
}
|
||||
|
||||
/// Returns an iterator that will iterate over the alive document ids
|
||||
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
|
||||
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator<'_> {
|
||||
SegmentReaderAliveDocsIterator::new(&self)
|
||||
}
|
||||
|
||||
@@ -381,19 +308,19 @@ impl SegmentReader {
|
||||
self.postings_composite.space_usage(),
|
||||
self.positions_composite.space_usage(),
|
||||
self.positions_idx_composite.space_usage(),
|
||||
self.fast_fields_composite.space_usage(),
|
||||
self.fast_fields_readers.space_usage(),
|
||||
self.fieldnorms_composite.space_usage(),
|
||||
self.get_store_reader().space_usage(),
|
||||
self.delete_bitset_opt
|
||||
.as_ref()
|
||||
.map(|x| x.space_usage())
|
||||
.map(DeleteBitSet::space_usage)
|
||||
.unwrap_or(0),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentReader {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "SegmentReader({:?})", self.segment_id)
|
||||
}
|
||||
}
|
||||
@@ -446,9 +373,9 @@ impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use core::Index;
|
||||
use schema::{Schema, Term, STORED, TEXT};
|
||||
use DocId;
|
||||
use crate::core::Index;
|
||||
use crate::schema::{Schema, Term, STORED, TEXT};
|
||||
use crate::DocId;
|
||||
|
||||
#[test]
|
||||
fn test_alive_docs_iterator() {
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use directory::directory_lock::Lock;
|
||||
use directory::error::LockError;
|
||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use directory::WatchCallback;
|
||||
use directory::WatchHandle;
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use crate::directory::directory_lock::Lock;
|
||||
use crate::directory::error::LockError;
|
||||
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::WatchCallback;
|
||||
use crate::directory::WatchHandle;
|
||||
use crate::directory::{ReadOnlySource, WritePtr};
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
@@ -48,14 +48,14 @@ impl RetryPolicy {
|
||||
///
|
||||
/// It is transparently associated to a lock file, that gets deleted
|
||||
/// on `Drop.` The lock is released automatically on `Drop`.
|
||||
pub struct DirectoryLock(Box<Drop + Send + 'static>);
|
||||
pub struct DirectoryLock(Box<dyn Drop + Send + Sync + 'static>);
|
||||
|
||||
struct DirectoryLockGuard {
|
||||
directory: Box<Directory>,
|
||||
directory: Box<dyn Directory>,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl<T: Drop + Send + 'static> From<Box<T>> for DirectoryLock {
|
||||
impl<T: Drop + Send + Sync + 'static> From<Box<T>> for DirectoryLock {
|
||||
fn from(underlying: Box<T>) -> Self {
|
||||
DirectoryLock(underlying)
|
||||
}
|
||||
@@ -76,7 +76,7 @@ enum TryAcquireLockError {
|
||||
|
||||
fn try_acquire_lock(
|
||||
filepath: &Path,
|
||||
directory: &mut Directory,
|
||||
directory: &mut dyn Directory,
|
||||
) -> Result<DirectoryLock, TryAcquireLockError> {
|
||||
let mut write = directory.open_write(filepath).map_err(|e| match e {
|
||||
OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
|
||||
@@ -204,20 +204,20 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Internally, tantivy only uses this API to detect new commits to implement the
|
||||
/// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
|
||||
/// `OnCommit` `ReloadPolicy` to work properly.
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle;
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle>;
|
||||
}
|
||||
|
||||
/// DirectoryClone
|
||||
pub trait DirectoryClone {
|
||||
/// Clones the directory and boxes the clone
|
||||
fn box_clone(&self) -> Box<Directory>;
|
||||
fn box_clone(&self) -> Box<dyn Directory>;
|
||||
}
|
||||
|
||||
impl<T> DirectoryClone for T
|
||||
where
|
||||
T: 'static + Directory + Clone,
|
||||
{
|
||||
fn box_clone(&self) -> Box<Directory> {
|
||||
fn box_clone(&self) -> Box<dyn Directory> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use once_cell::sync::Lazy;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// A directory lock.
|
||||
@@ -28,29 +29,27 @@ pub struct Lock {
|
||||
pub is_blocking: bool,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
/// Only one process should be able to write tantivy's index at a time.
|
||||
/// This lock file, when present, is in charge of preventing other processes to open an IndexWriter.
|
||||
///
|
||||
/// If the process is killed and this file remains, it is safe to remove it manually.
|
||||
///
|
||||
/// Failing to acquire this lock usually means a misuse of tantivy's API,
|
||||
/// (creating more than one instance of the `IndexWriter`), are a spurious
|
||||
/// lock file remaining after a crash. In the latter case, removing the file after
|
||||
/// checking no process running tantivy is running is safe.
|
||||
pub static ref INDEX_WRITER_LOCK: Lock = Lock {
|
||||
filepath: PathBuf::from(".tantivy-writer.lock"),
|
||||
is_blocking: false
|
||||
};
|
||||
/// The meta lock file is here to protect the segment files being opened by
|
||||
/// `IndexReader::reload()` from being garbage collected.
|
||||
/// It makes it possible for another process to safely consume
|
||||
/// our index in-writing. Ideally, we may have prefered `RWLock` semantics
|
||||
/// here, but it is difficult to achieve on Windows.
|
||||
///
|
||||
/// Opening segment readers is a very fast process.
|
||||
pub static ref META_LOCK: Lock = Lock {
|
||||
filepath: PathBuf::from(".tantivy-meta.lock"),
|
||||
is_blocking: true
|
||||
};
|
||||
}
|
||||
/// Only one process should be able to write tantivy's index at a time.
|
||||
/// This lock file, when present, is in charge of preventing other processes to open an IndexWriter.
|
||||
///
|
||||
/// If the process is killed and this file remains, it is safe to remove it manually.
|
||||
///
|
||||
/// Failing to acquire this lock usually means a misuse of tantivy's API,
|
||||
/// (creating more than one instance of the `IndexWriter`), are a spurious
|
||||
/// lock file remaining after a crash. In the latter case, removing the file after
|
||||
/// checking no process running tantivy is running is safe.
|
||||
pub static INDEX_WRITER_LOCK: Lazy<Lock> = Lazy::new(|| Lock {
|
||||
filepath: PathBuf::from(".tantivy-writer.lock"),
|
||||
is_blocking: false,
|
||||
});
|
||||
/// The meta lock file is here to protect the segment files being opened by
|
||||
/// `IndexReader::reload()` from being garbage collected.
|
||||
/// It makes it possible for another process to safely consume
|
||||
/// our index in-writing. Ideally, we may have prefered `RWLock` semantics
|
||||
/// here, but it is difficult to achieve on Windows.
|
||||
///
|
||||
/// Opening segment readers is a very fast process.
|
||||
pub static META_LOCK: Lazy<Lock> = Lazy::new(|| Lock {
|
||||
filepath: PathBuf::from(".tantivy-meta.lock"),
|
||||
is_blocking: true,
|
||||
});
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::path::PathBuf;
|
||||
/// Error while trying to acquire a directory lock.
|
||||
#[derive(Debug, Fail)]
|
||||
pub enum LockError {
|
||||
/// Failed to acquired a lock as it is already hold by another
|
||||
/// Failed to acquired a lock as it is already held by another
|
||||
/// client.
|
||||
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
|
||||
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
|
||||
@@ -33,7 +33,7 @@ impl Into<io::Error> for IOError {
|
||||
}
|
||||
|
||||
impl fmt::Display for IOError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self.path {
|
||||
Some(ref path) => write!(f, "io error occurred on path '{:?}': '{}'", path, self.err),
|
||||
None => write!(f, "io error occurred: '{}'", self.err),
|
||||
@@ -46,7 +46,7 @@ impl StdError for IOError {
|
||||
"io error occurred"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
fn cause(&self) -> Option<&dyn StdError> {
|
||||
Some(&self.err)
|
||||
}
|
||||
}
|
||||
@@ -84,7 +84,7 @@ impl From<io::Error> for OpenDirectoryError {
|
||||
}
|
||||
|
||||
impl fmt::Display for OpenDirectoryError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match *self {
|
||||
OpenDirectoryError::DoesNotExist(ref path) => {
|
||||
write!(f, "the underlying directory '{:?}' does not exist", path)
|
||||
@@ -106,7 +106,7 @@ impl StdError for OpenDirectoryError {
|
||||
"error occurred while opening a directory"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
fn cause(&self) -> Option<&dyn StdError> {
|
||||
None
|
||||
}
|
||||
}
|
||||
@@ -129,7 +129,7 @@ impl From<IOError> for OpenWriteError {
|
||||
}
|
||||
|
||||
impl fmt::Display for OpenWriteError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match *self {
|
||||
OpenWriteError::FileAlreadyExists(ref path) => {
|
||||
write!(f, "the file '{:?}' already exists", path)
|
||||
@@ -148,7 +148,7 @@ impl StdError for OpenWriteError {
|
||||
"error occurred while opening a file for writing"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
fn cause(&self) -> Option<&dyn StdError> {
|
||||
match *self {
|
||||
OpenWriteError::FileAlreadyExists(_) => None,
|
||||
OpenWriteError::IOError(ref err) => Some(err),
|
||||
@@ -173,7 +173,7 @@ impl From<IOError> for OpenReadError {
|
||||
}
|
||||
|
||||
impl fmt::Display for OpenReadError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match *self {
|
||||
OpenReadError::FileDoesNotExist(ref path) => {
|
||||
write!(f, "the file '{:?}' does not exist", path)
|
||||
@@ -192,7 +192,7 @@ impl StdError for OpenReadError {
|
||||
"error occurred while opening a file for reading"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
fn cause(&self) -> Option<&dyn StdError> {
|
||||
match *self {
|
||||
OpenReadError::FileDoesNotExist(_) => None,
|
||||
OpenReadError::IOError(ref err) => Some(err),
|
||||
@@ -217,7 +217,7 @@ impl From<IOError> for DeleteError {
|
||||
}
|
||||
|
||||
impl fmt::Display for DeleteError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match *self {
|
||||
DeleteError::FileDoesNotExist(ref path) => {
|
||||
write!(f, "the file '{:?}' does not exist", path)
|
||||
@@ -234,7 +234,7 @@ impl StdError for DeleteError {
|
||||
"error occurred while deleting a file"
|
||||
}
|
||||
|
||||
fn cause(&self) -> Option<&StdError> {
|
||||
fn cause(&self) -> Option<&dyn StdError> {
|
||||
match *self {
|
||||
DeleteError::FileDoesNotExist(_) => None,
|
||||
DeleteError::IOError(ref err) => Some(err),
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
use core::MANAGED_FILEPATH;
|
||||
use directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
|
||||
use directory::DirectoryLock;
|
||||
use directory::Lock;
|
||||
use directory::META_LOCK;
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use directory::{WatchCallback, WatchHandle};
|
||||
use error::DataCorruption;
|
||||
use crate::core::MANAGED_FILEPATH;
|
||||
use crate::directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::DirectoryLock;
|
||||
use crate::directory::Lock;
|
||||
use crate::directory::META_LOCK;
|
||||
use crate::directory::{ReadOnlySource, WritePtr};
|
||||
use crate::directory::{WatchCallback, WatchHandle};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::Directory;
|
||||
use crate::Result;
|
||||
use serde_json;
|
||||
use std::collections::HashSet;
|
||||
use std::io;
|
||||
@@ -14,8 +16,6 @@ use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::RwLockWriteGuard;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use Directory;
|
||||
use Result;
|
||||
|
||||
/// Returns true iff the file is "managed".
|
||||
/// Non-managed file are not subject to garbage collection.
|
||||
@@ -39,7 +39,7 @@ fn is_managed(path: &Path) -> bool {
|
||||
/// useful anymore.
|
||||
#[derive(Debug)]
|
||||
pub struct ManagedDirectory {
|
||||
directory: Box<Directory>,
|
||||
directory: Box<dyn Directory>,
|
||||
meta_informations: Arc<RwLock<MetaInformation>>,
|
||||
}
|
||||
|
||||
@@ -51,8 +51,8 @@ struct MetaInformation {
|
||||
/// Saves the file containing the list of existing files
|
||||
/// that were created by tantivy.
|
||||
fn save_managed_paths(
|
||||
directory: &mut Directory,
|
||||
wlock: &RwLockWriteGuard<MetaInformation>,
|
||||
directory: &mut dyn Directory,
|
||||
wlock: &RwLockWriteGuard<'_, MetaInformation>,
|
||||
) -> io::Result<()> {
|
||||
let mut w = serde_json::to_vec(&wlock.managed_paths)?;
|
||||
writeln!(&mut w)?;
|
||||
@@ -69,7 +69,7 @@ impl ManagedDirectory {
|
||||
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
|
||||
.map_err(|e| {
|
||||
DataCorruption::new(
|
||||
MANAGED_FILEPATH.clone(),
|
||||
MANAGED_FILEPATH.to_path_buf(),
|
||||
format!("Managed file cannot be deserialized: {:?}. ", e),
|
||||
)
|
||||
})?;
|
||||
@@ -135,28 +135,28 @@ impl ManagedDirectory {
|
||||
files_to_delete.push(managed_path.clone());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
error!("Failed to acquire lock for GC");
|
||||
}
|
||||
}
|
||||
|
||||
let mut deleted_files = vec![];
|
||||
{
|
||||
for file_to_delete in files_to_delete {
|
||||
match self.delete(&file_to_delete) {
|
||||
Ok(_) => {
|
||||
info!("Deleted {:?}", file_to_delete);
|
||||
deleted_files.push(file_to_delete);
|
||||
}
|
||||
Err(file_error) => {
|
||||
match file_error {
|
||||
DeleteError::FileDoesNotExist(_) => {
|
||||
deleted_files.push(file_to_delete);
|
||||
}
|
||||
DeleteError::IOError(_) => {
|
||||
if !cfg!(target_os = "windows") {
|
||||
// On windows, delete is expected to fail if the file
|
||||
// is mmapped.
|
||||
error!("Failed to delete {:?}", file_to_delete);
|
||||
}
|
||||
for file_to_delete in files_to_delete {
|
||||
match self.delete(&file_to_delete) {
|
||||
Ok(_) => {
|
||||
info!("Deleted {:?}", file_to_delete);
|
||||
deleted_files.push(file_to_delete);
|
||||
}
|
||||
Err(file_error) => {
|
||||
match file_error {
|
||||
DeleteError::FileDoesNotExist(_) => {
|
||||
deleted_files.push(file_to_delete);
|
||||
}
|
||||
DeleteError::IOError(_) => {
|
||||
if !cfg!(target_os = "windows") {
|
||||
// On windows, delete is expected to fail if the file
|
||||
// is mmapped.
|
||||
error!("Failed to delete {:?}", file_to_delete);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -171,11 +171,9 @@ impl ManagedDirectory {
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed directory wlock poisoned (2).");
|
||||
{
|
||||
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
|
||||
for delete_file in &deleted_files {
|
||||
managed_paths_write.remove(delete_file);
|
||||
}
|
||||
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
|
||||
for delete_file in &deleted_files {
|
||||
managed_paths_write.remove(delete_file);
|
||||
}
|
||||
if save_managed_paths(self.directory.as_mut(), &meta_informations_wlock).is_err() {
|
||||
error!("Failed to save the list of managed files.");
|
||||
@@ -243,7 +241,7 @@ impl Directory for ManagedDirectory {
|
||||
self.directory.acquire_lock(lock)
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
self.directory.watch(watch_callback)
|
||||
}
|
||||
}
|
||||
@@ -257,73 +255,55 @@ impl Clone for ManagedDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
mod tests_mmap_specific {
|
||||
|
||||
use super::*;
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use crate::directory::{Directory, ManagedDirectory, MmapDirectory};
|
||||
use std::collections::HashSet;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tempdir::TempDir;
|
||||
|
||||
lazy_static! {
|
||||
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
|
||||
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_managed_directory() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir = TempDir::new("tantivy-test").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
|
||||
let test_path1: &'static Path = Path::new("some_path_for_test");
|
||||
let test_path2: &'static Path = Path::new("some_path_for_test_2");
|
||||
{
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
{
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
let mut write_file = managed_directory.open_write(test_path1).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
managed_directory
|
||||
.atomic_write(test_path2, &[0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
assert!(managed_directory.exists(test_path2));
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[test_path1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
assert!(!managed_directory.exists(test_path2));
|
||||
}
|
||||
{
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
assert!(!managed_directory.exists(test_path2));
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(test_path1));
|
||||
assert!(!managed_directory.exists(test_path2));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "mmap ")]
|
||||
fn test_managed_directory_gc_while_mmapped() {
|
||||
let test_path1: &'static Path = Path::new("some_path_for_test");
|
||||
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let living_files = HashSet::new();
|
||||
@@ -331,23 +311,23 @@ mod tests {
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.atomic_write(test_path1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
let _mmap_read = managed_directory.open_read(test_path1).unwrap();
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, gc should try and fail the file as it is mmapped.
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
// unmap should happen here.
|
||||
drop(_mmap_read);
|
||||
// The file should still be in the list of managed file and
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(test_path1));
|
||||
} else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(test_path1));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,24 +1,25 @@
|
||||
extern crate fs2;
|
||||
extern crate notify;
|
||||
use fs2;
|
||||
use notify;
|
||||
|
||||
use self::fs2::FileExt;
|
||||
use self::notify::RawEvent;
|
||||
use self::notify::RecursiveMode;
|
||||
use self::notify::Watcher;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::error::LockError;
|
||||
use crate::directory::error::{
|
||||
DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError,
|
||||
};
|
||||
use crate::directory::read_only_source::BoxedData;
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::DirectoryLock;
|
||||
use crate::directory::Lock;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::directory::WatchCallback;
|
||||
use crate::directory::WatchCallbackList;
|
||||
use crate::directory::WatchHandle;
|
||||
use crate::directory::WritePtr;
|
||||
use atomicwrites;
|
||||
use common::make_io_err;
|
||||
use core::META_FILEPATH;
|
||||
use directory::error::LockError;
|
||||
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use directory::read_only_source::BoxedData;
|
||||
use directory::Directory;
|
||||
use directory::DirectoryLock;
|
||||
use directory::Lock;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WatchCallback;
|
||||
use directory::WatchCallbackList;
|
||||
use directory::WatchHandle;
|
||||
use directory::WritePtr;
|
||||
use memmap::Mmap;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::From;
|
||||
@@ -37,6 +38,11 @@ use std::sync::Weak;
|
||||
use std::thread;
|
||||
use tempdir::TempDir;
|
||||
|
||||
/// Create a default io error given a string.
|
||||
pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
}
|
||||
|
||||
/// Returns None iff the file exists, can be read, but is empty (and hence
|
||||
/// cannot be mmapped)
|
||||
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
|
||||
@@ -155,7 +161,7 @@ impl InnerWatcherWrapper {
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct WatcherWrapper {
|
||||
struct WatcherWrapper {
|
||||
inner: Arc<InnerWatcherWrapper>,
|
||||
}
|
||||
|
||||
@@ -225,7 +231,7 @@ struct MmapDirectoryInner {
|
||||
root_path: PathBuf,
|
||||
mmap_cache: RwLock<MmapCache>,
|
||||
_temp_directory: Option<TempDir>,
|
||||
watcher: RwLock<WatcherWrapper>,
|
||||
watcher: RwLock<Option<WatcherWrapper>>,
|
||||
}
|
||||
|
||||
impl MmapDirectoryInner {
|
||||
@@ -233,24 +239,41 @@ impl MmapDirectoryInner {
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
) -> Result<MmapDirectoryInner, OpenDirectoryError> {
|
||||
let watch_wrapper = WatcherWrapper::new(&root_path)?;
|
||||
let mmap_directory_inner = MmapDirectoryInner {
|
||||
root_path,
|
||||
mmap_cache: Default::default(),
|
||||
_temp_directory: temp_directory,
|
||||
watcher: RwLock::new(watch_wrapper),
|
||||
watcher: RwLock::new(None),
|
||||
};
|
||||
Ok(mmap_directory_inner)
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
let mut wlock = self.watcher.write().unwrap();
|
||||
wlock.watch(watch_callback)
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
// a lot of juggling here, to ensure we don't do anything that panics
|
||||
// while the rwlock is held. That way we ensure that the rwlock cannot
|
||||
// be poisoned.
|
||||
//
|
||||
// The downside is that we might create a watch wrapper that is not useful.
|
||||
let need_initialization = self.watcher.read().unwrap().is_none();
|
||||
if need_initialization {
|
||||
let watch_wrapper = WatcherWrapper::new(&self.root_path)?;
|
||||
let mut watch_wlock = self.watcher.write().unwrap();
|
||||
// the watcher could have been initialized when we released the lock, and
|
||||
// we do not want to lose the watched files that were set.
|
||||
if watch_wlock.is_none() {
|
||||
*watch_wlock = Some(watch_wrapper);
|
||||
}
|
||||
}
|
||||
if let Some(watch_wrapper) = self.watcher.write().unwrap().as_mut() {
|
||||
return Ok(watch_wrapper.watch(watch_callback));
|
||||
} else {
|
||||
unreachable!("At this point, watch wrapper is supposed to be initialized");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for MmapDirectory {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "MmapDirectory({:?})", self.inner.root_path)
|
||||
}
|
||||
}
|
||||
@@ -316,7 +339,7 @@ impl MmapDirectory {
|
||||
#[cfg(windows)]
|
||||
{
|
||||
use std::os::windows::fs::OpenOptionsExt;
|
||||
use winapi::winbase;
|
||||
use winapi::um::winbase;
|
||||
|
||||
open_opts
|
||||
.write(true)
|
||||
@@ -411,7 +434,6 @@ impl Directory for MmapDirectory {
|
||||
/// Any entry associated to the path in the mmap will be
|
||||
/// removed before the file is deleted.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
debug!("Deleting file {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self
|
||||
@@ -509,7 +531,7 @@ impl Directory for MmapDirectory {
|
||||
})))
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
self.inner.watch(watch_callback)
|
||||
}
|
||||
}
|
||||
@@ -521,13 +543,13 @@ mod tests {
|
||||
// The following tests are specific to the MmapDirectory
|
||||
|
||||
use super::*;
|
||||
use schema::{Schema, SchemaBuilder, TEXT};
|
||||
use crate::schema::{Schema, SchemaBuilder, TEXT};
|
||||
use crate::Index;
|
||||
use crate::ReloadPolicy;
|
||||
use std::fs;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use Index;
|
||||
use ReloadPolicy;
|
||||
|
||||
#[test]
|
||||
fn test_open_non_existant_path() {
|
||||
|
||||
@@ -24,22 +24,18 @@ pub use self::ram_directory::RAMDirectory;
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
pub(crate) use self::watch_event_router::WatchCallbackList;
|
||||
pub use self::watch_event_router::{WatchCallback, WatchHandle};
|
||||
use std::io::{BufWriter, Seek, Write};
|
||||
use std::io::{BufWriter, Write};
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
pub use self::mmap_directory::MmapDirectory;
|
||||
|
||||
pub(crate) use self::managed_directory::ManagedDirectory;
|
||||
|
||||
/// Synonym of Seek + Write
|
||||
pub trait SeekableWrite: Seek + Write {}
|
||||
impl<T: Seek + Write> SeekableWrite for T {}
|
||||
pub use self::managed_directory::ManagedDirectory;
|
||||
|
||||
/// Write object for Directory.
|
||||
///
|
||||
/// `WritePtr` are required to implement both Write
|
||||
/// and Seek.
|
||||
pub type WritePtr = BufWriter<Box<SeekableWrite>>;
|
||||
pub type WritePtr = BufWriter<Box<dyn Write>>;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use core::META_FILEPATH;
|
||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use directory::WatchCallbackList;
|
||||
use directory::WritePtr;
|
||||
use directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use crate::directory::WatchCallbackList;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
|
||||
use fail::fail_point;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
|
||||
@@ -86,7 +87,7 @@ impl InnerDirectory {
|
||||
self.fs
|
||||
.get(path)
|
||||
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
||||
.map(|el| el.clone())
|
||||
.map(Clone::clone)
|
||||
}
|
||||
|
||||
fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
@@ -103,10 +104,14 @@ impl InnerDirectory {
|
||||
fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
|
||||
self.watch_router.subscribe(watch_handle)
|
||||
}
|
||||
|
||||
fn total_mem_usage(&self) -> usize {
|
||||
self.fs.values().map(|f| f.len()).sum()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for RAMDirectory {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "RAMDirectory")
|
||||
}
|
||||
}
|
||||
@@ -126,6 +131,12 @@ impl RAMDirectory {
|
||||
pub fn create() -> RAMDirectory {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Returns the sum of the size of the different files
|
||||
/// in the RAMDirectory.
|
||||
pub fn total_mem_usage(&self) -> usize {
|
||||
self.fs.read().unwrap().total_mem_usage()
|
||||
}
|
||||
}
|
||||
|
||||
impl Directory for RAMDirectory {
|
||||
@@ -134,6 +145,11 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
fail_point!("RAMDirectory::delete", |_| {
|
||||
use crate::directory::error::IOError;
|
||||
let io_error = IOError::from(io::Error::from(io::ErrorKind::Other));
|
||||
Err(DeleteError::from(io_error))
|
||||
});
|
||||
self.fs.write().unwrap().delete(path)
|
||||
}
|
||||
|
||||
@@ -177,7 +193,7 @@ impl Directory for RAMDirectory {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
self.fs.write().unwrap().watch(watch_callback)
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
Ok(self.fs.write().unwrap().watch(watch_callback))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use common::HasLen;
|
||||
use crate::common::HasLen;
|
||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub type BoxedData = Box<Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
pub type BoxedData = Box<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use super::*;
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
@@ -9,10 +9,6 @@ use std::thread;
|
||||
use std::time;
|
||||
use std::time::Duration;
|
||||
|
||||
lazy_static! {
|
||||
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ram_directory() {
|
||||
let mut ram_directory = RAMDirectory::create();
|
||||
@@ -29,98 +25,82 @@ fn test_mmap_directory() {
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn ram_directory_panics_if_flush_forgotten() {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
let mut ram_directory = RAMDirectory::create();
|
||||
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
|
||||
let mut write_file = ram_directory.open_write(test_path).unwrap();
|
||||
assert!(write_file.write_all(&[4]).is_ok());
|
||||
}
|
||||
|
||||
fn test_simple(directory: &mut Directory) {
|
||||
fn test_simple(directory: &mut dyn Directory) {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
{
|
||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
let mut write_file = directory.open_write(test_path).unwrap();
|
||||
assert!(directory.exists(test_path));
|
||||
write_file.write_all(&[4]).unwrap();
|
||||
write_file.write_all(&[3]).unwrap();
|
||||
write_file.write_all(&[7, 3, 5]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
{
|
||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||
let read_file = directory.open_read(test_path).unwrap();
|
||||
let data: &[u8] = &*read_file;
|
||||
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
|
||||
}
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
assert!(!directory.exists(*TEST_PATH));
|
||||
assert!(directory.delete(test_path).is_ok());
|
||||
assert!(!directory.exists(test_path));
|
||||
}
|
||||
|
||||
fn test_seek(directory: &mut Directory) {
|
||||
fn test_rewrite_forbidden(directory: &mut dyn Directory) {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
{
|
||||
{
|
||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
|
||||
write_file.seek(SeekFrom::Start(0)).unwrap();
|
||||
write_file.write_all(&[3, 1]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||
let data: &[u8] = &*read_file;
|
||||
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
|
||||
directory.open_write(test_path).unwrap();
|
||||
assert!(directory.exists(test_path));
|
||||
}
|
||||
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
{
|
||||
assert!(directory.open_write(test_path).is_err());
|
||||
}
|
||||
assert!(directory.delete(test_path).is_ok());
|
||||
}
|
||||
|
||||
fn test_rewrite_forbidden(directory: &mut Directory) {
|
||||
fn test_write_create_the_file(directory: &mut dyn Directory) {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
{
|
||||
directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
}
|
||||
{
|
||||
assert!(directory.open_write(*TEST_PATH).is_err());
|
||||
}
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
}
|
||||
|
||||
fn test_write_create_the_file(directory: &mut Directory) {
|
||||
{
|
||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||
let _w = directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
assert!(directory.open_read(*TEST_PATH).is_ok());
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
assert!(directory.open_read(test_path).is_err());
|
||||
let _w = directory.open_write(test_path).unwrap();
|
||||
assert!(directory.exists(test_path));
|
||||
assert!(directory.open_read(test_path).is_ok());
|
||||
assert!(directory.delete(test_path).is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
fn test_directory_delete(directory: &mut Directory) {
|
||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||
fn test_directory_delete(directory: &mut dyn Directory) {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
assert!(directory.open_read(test_path).is_err());
|
||||
let mut write_file = directory.open_write(&test_path).unwrap();
|
||||
write_file.write_all(&[1, 2, 3, 4]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
{
|
||||
let read_handle = directory.open_read(*TEST_PATH).unwrap();
|
||||
{
|
||||
let read_handle = directory.open_read(&test_path).unwrap();
|
||||
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
||||
// Mapped files can't be deleted on Windows
|
||||
if !cfg!(windows) {
|
||||
assert!(directory.delete(&test_path).is_ok());
|
||||
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
||||
|
||||
// Mapped files can't be deleted on Windows
|
||||
if !cfg!(windows) {
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
||||
}
|
||||
|
||||
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
|
||||
}
|
||||
|
||||
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
|
||||
}
|
||||
|
||||
if cfg!(windows) {
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
assert!(directory.delete(&test_path).is_ok());
|
||||
}
|
||||
|
||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||
assert!(directory.delete(*TEST_PATH).is_err());
|
||||
assert!(directory.open_read(&test_path).is_err());
|
||||
assert!(directory.delete(&test_path).is_err());
|
||||
}
|
||||
|
||||
fn test_directory(directory: &mut Directory) {
|
||||
fn test_directory(directory: &mut dyn Directory) {
|
||||
test_simple(directory);
|
||||
test_seek(directory);
|
||||
test_rewrite_forbidden(directory);
|
||||
test_write_create_the_file(directory);
|
||||
test_directory_delete(directory);
|
||||
@@ -129,7 +109,7 @@ fn test_directory(directory: &mut Directory) {
|
||||
test_watch(directory);
|
||||
}
|
||||
|
||||
fn test_watch(directory: &mut Directory) {
|
||||
fn test_watch(directory: &mut dyn Directory) {
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let counter_clone = counter.clone();
|
||||
let watch_callback = Box::new(move || {
|
||||
@@ -141,7 +121,7 @@ fn test_watch(directory: &mut Directory) {
|
||||
thread::sleep(Duration::new(0, 10_000));
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
|
||||
let watch_handle = directory.watch(watch_callback);
|
||||
let watch_handle = directory.watch(watch_callback).unwrap();
|
||||
for i in 0..10 {
|
||||
assert_eq!(i, counter.load(Ordering::SeqCst));
|
||||
assert!(directory
|
||||
@@ -163,7 +143,7 @@ fn test_watch(directory: &mut Directory) {
|
||||
assert_eq!(10, counter.load(Ordering::SeqCst));
|
||||
}
|
||||
|
||||
fn test_lock_non_blocking(directory: &mut Directory) {
|
||||
fn test_lock_non_blocking(directory: &mut dyn Directory) {
|
||||
{
|
||||
let lock_a_res = directory.acquire_lock(&Lock {
|
||||
filepath: PathBuf::from("a.lock"),
|
||||
@@ -188,7 +168,7 @@ fn test_lock_non_blocking(directory: &mut Directory) {
|
||||
assert!(lock_a_res.is_ok());
|
||||
}
|
||||
|
||||
fn test_lock_blocking(directory: &mut Directory) {
|
||||
fn test_lock_blocking(directory: &mut dyn Directory) {
|
||||
let lock_a_res = directory.acquire_lock(&Lock {
|
||||
filepath: PathBuf::from("a.lock"),
|
||||
is_blocking: true,
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::sync::RwLock;
|
||||
use std::sync::Weak;
|
||||
|
||||
/// Type alias for callbacks registered when watching files of a `Directory`.
|
||||
pub type WatchCallback = Box<Fn() -> () + Sync + Send>;
|
||||
pub type WatchCallback = Box<dyn Fn() -> () + Sync + Send>;
|
||||
|
||||
/// Helper struct to implement the watch method in `Directory` implementations.
|
||||
///
|
||||
@@ -67,7 +67,7 @@ impl WatchCallbackList {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use directory::WatchCallbackList;
|
||||
use crate::directory::WatchCallbackList;
|
||||
use std::mem;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
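For illustration of the `WatchCallback` / `WatchHandle` pattern touched above (a hedged sketch with toy types, not tantivy's actual implementation): the list stores only `Weak` references to the registered callbacks, so a subscriber stays registered exactly as long as it keeps its handle alive.

use std::sync::{Arc, Weak};

type WatchCallback = Box<dyn Fn() + Sync + Send>;

#[derive(Default)]
struct ToyWatchCallbackList {
    callbacks: Vec<Weak<WatchCallback>>,
}

// Dropping the handle drops the Arc, which effectively unsubscribes the callback.
struct ToyWatchHandle(Arc<WatchCallback>);

impl ToyWatchCallbackList {
    fn subscribe(&mut self, callback: WatchCallback) -> ToyWatchHandle {
        let callback = Arc::new(callback);
        self.callbacks.push(Arc::downgrade(&callback));
        ToyWatchHandle(callback)
    }

    fn broadcast(&self) {
        for weak_callback in &self.callbacks {
            if let Some(callback) = weak_callback.upgrade() {
                (*callback)();
            }
        }
    }
}

fn main() {
    let mut list = ToyWatchCallbackList::default();
    let handle = list.subscribe(Box::new(|| println!("meta.json changed")));
    list.broadcast(); // prints once
    drop(handle);
    list.broadcast(); // the weak reference is dead: nothing is printed
}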
|
||||
@@ -1,8 +1,9 @@
|
||||
use common::BitSet;
|
||||
use crate::common::BitSet;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::DocId;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::cmp::Ordering;
|
||||
use DocId;
|
||||
|
||||
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
|
||||
#[derive(PartialEq, Eq, Debug)]
|
||||
@@ -95,9 +96,23 @@ pub trait DocSet {
|
||||
}
|
||||
|
||||
/// Returns the number of documents matching.
|
||||
///
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
fn count(&mut self) -> u32 {
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
let mut count = 0u32;
|
||||
while self.advance() {
|
||||
if !delete_bitset.is_deleted(self.doc()) {
|
||||
count += 1u32;
|
||||
}
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
/// Returns the count of documents, deleted or not.
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
///
|
||||
/// Of course, the result is an upper bound of the result
|
||||
/// given by `count()`.
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
let mut count = 0u32;
|
||||
while self.advance() {
|
||||
count += 1u32;
|
||||
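For illustration (not part of the diff): a minimal, self-contained sketch of the new counting contract above, where `count` takes a delete bitset and only tallies documents that are still alive. The toy types below are hypothetical stand-ins, not tantivy's `DocSet` and `DeleteBitSet`.

struct ToyDeleteBitSet {
    deleted: Vec<bool>,
}

impl ToyDeleteBitSet {
    fn is_deleted(&self, doc: u32) -> bool {
        self.deleted.get(doc as usize).copied().unwrap_or(false)
    }
}

struct ToyDocSet {
    docs: Vec<u32>,
    cursor: usize,
}

impl ToyDocSet {
    fn advance(&mut self) -> bool {
        self.cursor += 1;
        self.cursor <= self.docs.len()
    }

    fn doc(&self) -> u32 {
        self.docs[self.cursor - 1]
    }

    // Mirrors the new signature: counting consumes the doc set and skips deleted docs.
    fn count(&mut self, delete_bitset: &ToyDeleteBitSet) -> u32 {
        let mut count = 0u32;
        while self.advance() {
            if !delete_bitset.is_deleted(self.doc()) {
                count += 1;
            }
        }
        count
    }
}

fn main() {
    let bitset = ToyDeleteBitSet { deleted: vec![false, true, false] };
    let mut docset = ToyDocSet { docs: vec![0, 1, 2], cursor: 0 };
    assert_eq!(docset.count(&bitset), 2); // doc 1 is deleted
}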
@@ -127,9 +142,14 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.size_hint()
|
||||
}
|
||||
|
||||
fn count(&mut self) -> u32 {
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count()
|
||||
unboxed.count(delete_bitset)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count_including_deleted()
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
|
||||
17  src/error.rs
@@ -2,11 +2,11 @@

use std::io;

use directory::error::LockError;
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use fastfield::FastFieldNotAvailableError;
use query;
use schema;
use crate::directory::error::LockError;
use crate::directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use crate::fastfield::FastFieldNotAvailableError;
use crate::query;
use crate::schema;
use serde_json;
use std::fmt;
use std::path::PathBuf;
@@ -34,7 +34,7 @@ impl DataCorruption {
|
||||
}
|
||||
|
||||
impl fmt::Debug for DataCorruption {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
write!(f, "Data corruption: ")?;
|
||||
if let Some(ref filepath) = &self.filepath {
|
||||
write!(f, "(in file `{:?}`)", filepath)?;
|
||||
@@ -77,9 +77,6 @@ pub enum TantivyError {
|
||||
/// An Error appeared related to the schema.
|
||||
#[fail(display = "Schema error: '{}'", _0)]
|
||||
SchemaError(String),
|
||||
/// Tried to access a fastfield reader for a field not configured accordingly.
|
||||
#[fail(display = "Fast field not available: '{:?}'", _0)]
|
||||
FastFieldError(#[cause] FastFieldNotAvailableError),
|
||||
/// System error. (e.g.: We failed spawning a new thread)
|
||||
#[fail(display = "System error.'{}'", _0)]
|
||||
SystemError(String),
|
||||
@@ -93,7 +90,7 @@ impl From<DataCorruption> for TantivyError {
|
||||
|
||||
impl From<FastFieldNotAvailableError> for TantivyError {
|
||||
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
|
||||
TantivyError::FastFieldError(fastfield_error)
|
||||
TantivyError::SchemaError(format!("{}", fastfield_error))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,8 +6,8 @@ pub use self::writer::BytesFastFieldWriter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use schema::Schema;
|
||||
use Index;
|
||||
use crate::schema::Schema;
|
||||
use crate::Index;
|
||||
|
||||
#[test]
|
||||
fn test_bytes() {
|
||||
@@ -23,14 +23,14 @@ mod tests {
|
||||
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let bytes_reader = segment_reader.fast_fields().bytes(field).unwrap();
|
||||
|
||||
assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
|
||||
assert!(bytes_reader.get_val(1).is_empty());
|
||||
assert_eq!(bytes_reader.get_val(2), &[255u8]);
|
||||
assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
|
||||
assert_eq!(bytes_reader.get_bytes(0), &[0u8, 1, 2, 3]);
|
||||
assert!(bytes_reader.get_bytes(1).is_empty());
|
||||
assert_eq!(bytes_reader.get_bytes(2), &[255u8]);
|
||||
assert_eq!(bytes_reader.get_bytes(3), &[1u8, 3, 5, 7, 9]);
|
||||
let long = vec![0u8; 1000];
|
||||
assert_eq!(bytes_reader.get_val(4), long.as_slice());
|
||||
assert_eq!(bytes_reader.get_bytes(4), long.as_slice());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use owning_ref::OwningRef;
|
||||
|
||||
use directory::ReadOnlySource;
|
||||
use fastfield::FastFieldReader;
|
||||
use DocId;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for byte array fast fields
|
||||
///
|
||||
@@ -14,6 +14,7 @@ use DocId;
|
||||
///
|
||||
/// Reading the value for a document is done by reading the start index for it,
|
||||
/// and the start index for the next document, and keeping the bytes in between.
|
||||
#[derive(Clone)]
|
||||
pub struct BytesFastFieldReader {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
values: OwningRef<ReadOnlySource, [u8]>,
|
||||
@@ -28,10 +29,20 @@ impl BytesFastFieldReader {
|
||||
BytesFastFieldReader { idx_reader, values }
|
||||
}
|
||||
|
||||
/// Returns the bytes associated to the given `doc`
|
||||
pub fn get_val(&self, doc: DocId) -> &[u8] {
|
||||
fn range(&self, doc: DocId) -> (usize, usize) {
|
||||
let start = self.idx_reader.get(doc) as usize;
|
||||
let stop = self.idx_reader.get(doc + 1) as usize;
|
||||
(start, stop)
|
||||
}
|
||||
|
||||
/// Returns the bytes associated to the given `doc`
|
||||
pub fn get_bytes(&self, doc: DocId) -> &[u8] {
|
||||
let (start, stop) = self.range(doc);
|
||||
&self.values[start..stop]
|
||||
}
|
||||
|
||||
/// Returns the overall number of bytes in this bytes fast field.
|
||||
pub fn total_num_bytes(&self) -> usize {
|
||||
self.values.len()
|
||||
}
|
||||
}
|
||||
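To make the start/stop layout above concrete, a hedged toy version (hypothetical names, not the real reader): an index of start offsets with one trailing end offset, plus a flat byte buffer; the bytes of doc `d` are `values[idx[d]..idx[d + 1]]`.

struct ToyBytesFastField {
    idx: Vec<usize>,  // one start offset per doc, plus a final end offset
    values: Vec<u8>,  // all documents' bytes, concatenated
}

impl ToyBytesFastField {
    fn get_bytes(&self, doc: usize) -> &[u8] {
        let start = self.idx[doc];
        let stop = self.idx[doc + 1];
        &self.values[start..stop]
    }
}

fn main() {
    let ff = ToyBytesFastField {
        idx: vec![0, 4, 4, 5],
        values: vec![0, 1, 2, 3, 255],
    };
    assert_eq!(ff.get_bytes(0), &[0, 1, 2, 3]);
    assert!(ff.get_bytes(1).is_empty());
    assert_eq!(ff.get_bytes(2), &[255]);
}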
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::io;
|
||||
|
||||
use fastfield::serializer::FastFieldSerializer;
|
||||
use schema::{Document, Field, Value};
|
||||
use DocId;
|
||||
use crate::fastfield::serializer::FastFieldSerializer;
|
||||
use crate::schema::{Document, Field, Value};
|
||||
use crate::DocId;
|
||||
|
||||
/// Writer for byte array (as in, any number of bytes per document) fast fields
|
||||
///
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use crate::common::HasLen;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::space_usage::ByteCount;
|
||||
use crate::DocId;
|
||||
use bit_set::BitSet;
|
||||
use common::HasLen;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WritePtr;
|
||||
use space_usage::ByteCount;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use DocId;
|
||||
|
||||
/// Write a delete `BitSet`
|
||||
///
|
||||
@@ -53,16 +53,18 @@ impl DeleteBitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns whether the document has been marked as deleted.
|
||||
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
|
||||
pub fn is_alive(&self, doc: DocId) -> bool {
|
||||
!self.is_deleted(doc)
|
||||
}
|
||||
|
||||
/// Returns true iff the document has been marked as deleted.
|
||||
#[inline(always)]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
if self.len == 0 {
|
||||
false
|
||||
} else {
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = (*self.data)[byte_offset as usize];
|
||||
let shift = (doc & 7u32) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = (*self.data)[byte_offset as usize];
|
||||
let shift = (doc & 7u32) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
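A tiny sketch of the bit arithmetic used by `is_deleted` above: delete flags are packed eight per byte, so a doc's flag lives at bit `doc % 8` of byte `doc / 8` (toy function over a plain byte slice, not the real `DeleteBitSet`).

fn is_deleted(data: &[u8], doc: u32) -> bool {
    let byte_offset = (doc / 8) as usize;
    let shift = (doc & 7) as u8;
    // Out-of-range docs are simply considered alive in this sketch.
    data.get(byte_offset).map_or(false, |b| b & (1u8 << shift) != 0)
}

fn main() {
    // Docs 3 and 9 deleted: byte 0 = 0b0000_1000, byte 1 = 0b0000_0010.
    let bitset = [0b0000_1000u8, 0b0000_0010u8];
    assert!(is_deleted(&bitset, 3));
    assert!(is_deleted(&bitset, 9));
    assert!(!is_deleted(&bitset, 4));
}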
|
||||
/// Summarize total space usage of this bitset.
|
||||
@@ -80,8 +82,8 @@ impl HasLen for DeleteBitSet {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::directory::*;
|
||||
use bit_set::BitSet;
|
||||
use directory::*;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn test_delete_bitset_helper(bitset: &BitSet) {
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use schema::FieldEntry;
|
||||
use crate::schema::FieldEntry;
|
||||
use std::result;
|
||||
|
||||
/// `FastFieldNotAvailableError` is returned when the
|
||||
/// user requested a fast field reader, and the field was not
|
||||
/// defined in the schema as a fast field.
|
||||
#[derive(Debug, Fail)]
|
||||
#[fail(display = "field not available: '{:?}'", field_name)]
|
||||
#[fail(display = "Fast field not available: '{:?}'", field_name)]
|
||||
pub struct FastFieldNotAvailableError {
|
||||
field_name: String,
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use super::MultiValueIntFastFieldReader;
|
||||
use schema::Facet;
|
||||
use crate::schema::Facet;
|
||||
use crate::termdict::TermDictionary;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DocId;
|
||||
use std::str;
|
||||
use termdict::TermDictionary;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
|
||||
/// The facet reader makes it possible to access the list of
|
||||
/// facets associated to a given document in a specific
|
||||
|
||||
@@ -30,12 +30,13 @@ pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use common;
|
||||
use schema::Cardinality;
|
||||
use schema::FieldType;
|
||||
use schema::Value;
|
||||
use crate::common;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::Value;
|
||||
|
||||
mod bytes;
|
||||
mod delete;
|
||||
@@ -43,11 +44,12 @@ mod error;
|
||||
mod facet_reader;
|
||||
mod multivalued;
|
||||
mod reader;
|
||||
mod readers;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
/// Trait for types that are allowed for fast fields: (u64 or i64).
|
||||
pub trait FastValue: Default + Clone + Copy {
|
||||
/// Trait for types that are allowed for fast fields: (u64, i64 and f64).
|
||||
pub trait FastValue: Default + Clone + Copy + Send + Sync + PartialOrd {
|
||||
/// Converts a value from u64
|
||||
///
|
||||
/// Internally all fast field values are encoded as u64.
|
||||
@@ -78,10 +80,6 @@ impl FastValue for u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
@@ -89,6 +87,10 @@ impl FastValue for u64 {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for i64 {
|
||||
@@ -112,11 +114,33 @@ impl FastValue for i64 {
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for f64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
common::u64_to_f64(val)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
common::f64_to_u64(*self)
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::F64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
self.to_bits()
|
||||
}
|
||||
}
|
||||
|
||||
fn value_to_u64(value: &Value) -> u64 {
|
||||
match *value {
|
||||
Value::U64(ref val) => *val,
|
||||
Value::I64(ref val) => common::i64_to_u64(*val),
|
||||
_ => panic!("Expected a u64/i64 field, got {:?} ", value),
|
||||
Value::F64(ref val) => common::f64_to_u64(*val),
|
||||
_ => panic!("Expected a u64/i64/f64 field, got {:?} ", value),
|
||||
}
|
||||
}
|
||||
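As a side note on the `f64` support added above: `common::f64_to_u64` has to be a monotonic mapping so that the bitpacked u64 ordering matches the float ordering. Below is a hedged sketch of one standard way to build such a mapping; the actual helper in `common` may differ in its details.

// Order-preserving f64 -> u64 mapping (sketch): set the sign bit for
// non-negative floats, flip all bits for negative ones.
fn f64_to_u64_monotone(val: f64) -> u64 {
    let bits = val.to_bits();
    if bits & (1u64 << 63) == 0 {
        bits | (1u64 << 63)
    } else {
        !bits
    }
}

fn main() {
    let vals = [-1000.5f64, -1.0, 0.0, 1.0, 1000.5];
    let mapped: Vec<u64> = vals.iter().map(|&v| f64_to_u64_monotone(v)).collect();
    // The mapped values are strictly increasing, like the inputs.
    assert!(mapped.windows(2).all(|w| w[0] < w[1]));
}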
|
||||
@@ -124,27 +148,27 @@ fn value_to_u64(value: &Value) -> u64 {
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use common::CompositeFile;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::FastFieldReader;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::{Directory, RAMDirectory, WritePtr};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::SeedableRng;
|
||||
use schema::Document;
|
||||
use schema::Field;
|
||||
use schema::Schema;
|
||||
use schema::FAST;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
|
||||
lazy_static! {
|
||||
pub static ref SCHEMA: Schema = {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_u64_field("field", FAST);
|
||||
schema_builder.build()
|
||||
};
|
||||
pub static ref FIELD: Field = { SCHEMA.get_field("field").unwrap() };
|
||||
}
|
||||
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_u64_field("field", FAST);
|
||||
schema_builder.build()
|
||||
});
|
||||
|
||||
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield() {
|
||||
|
||||
@@ -7,16 +7,16 @@ pub use self::writer::MultiValueIntFastFieldWriter;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
extern crate time;
|
||||
use time;
|
||||
|
||||
use self::time::Duration;
|
||||
use collector::TopDocs;
|
||||
use query::QueryParser;
|
||||
use schema::Cardinality;
|
||||
use schema::Facet;
|
||||
use schema::IntOptions;
|
||||
use schema::Schema;
|
||||
use Index;
|
||||
use crate::collector::TopDocs;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::Schema;
|
||||
use crate::Index;
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_u64() {
|
||||
@@ -37,9 +37,7 @@ mod tests {
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = segment_reader
|
||||
.multi_fast_field_reader::<u64>(field)
|
||||
.unwrap();
|
||||
let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
|
||||
{
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[4u64]);
|
||||
@@ -198,9 +196,9 @@ mod tests {
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
|
||||
let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
|
||||
{
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[-4i64]);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use fastfield::{FastFieldReader, FastValue};
|
||||
use DocId;
|
||||
use crate::fastfield::{FastFieldReader, FastValue};
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
///
|
||||
@@ -26,6 +26,13 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn into_u64s_reader(self) -> MultiValueIntFastFieldReader<u64> {
|
||||
MultiValueIntFastFieldReader {
|
||||
idx_reader: self.idx_reader,
|
||||
vals_reader: self.vals_reader.into_u64_reader(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `(start, stop)`, such that the values associated
|
||||
/// to the given document are `start..stop`.
|
||||
fn range(&self, doc: DocId) -> (u64, u64) {
|
||||
@@ -41,13 +48,24 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
vals.resize(len, Item::default());
|
||||
self.vals_reader.get_range_u64(start, &mut vals[..]);
|
||||
}
|
||||
|
||||
/// Returns the number of values associated with the document `DocId`.
|
||||
pub fn num_vals(&self, doc: DocId) -> usize {
|
||||
let (start, stop) = self.range(doc);
|
||||
(stop - start) as usize
|
||||
}
|
||||
|
||||
/// Returns the overall number of values in this field.
|
||||
pub fn total_num_vals(&self) -> u64 {
|
||||
self.idx_reader.max_value()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use core::Index;
|
||||
use schema::{Document, Facet, Schema};
|
||||
use crate::core::Index;
|
||||
use crate::schema::{Facet, Schema};
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() {
|
||||
@@ -58,22 +76,12 @@ mod tests {
|
||||
let mut index_writer = index
|
||||
.writer_with_num_threads(1, 30_000_000)
|
||||
.expect("Failed to create index writer.");
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat2");
|
||||
doc.add_facet(facet_field, "/category/cat1");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat2");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat3");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from("/category/cat2"),
|
||||
facet_field => Facet::from("/category/cat1"),
|
||||
));
|
||||
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
|
||||
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
|
||||
index_writer.commit().expect("Commit failed");
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use fastfield::serializer::FastSingleFieldSerializer;
|
||||
use fastfield::value_to_u64;
|
||||
use fastfield::FastFieldSerializer;
|
||||
use crate::fastfield::serializer::FastSingleFieldSerializer;
|
||||
use crate::fastfield::value_to_u64;
|
||||
use crate::fastfield::FastFieldSerializer;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DocId;
|
||||
use itertools::Itertools;
|
||||
use postings::UnorderedTermId;
|
||||
use schema::{Document, Field};
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
|
||||
/// Writer for multi-valued (as in, more than one value per document)
|
||||
/// int fast field.
|
||||
@@ -116,7 +116,7 @@ impl MultiValueIntFastFieldWriter {
|
||||
}
|
||||
{
|
||||
// writing the values themselves.
|
||||
let mut value_serializer: FastSingleFieldSerializer<_>;
|
||||
let mut value_serializer: FastSingleFieldSerializer<'_, _>;
|
||||
match mapping_opt {
|
||||
Some(mapping) => {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
use super::FastValue;
|
||||
use common::bitpacker::BitUnpacker;
|
||||
use common::compute_num_bits;
|
||||
use common::BinarySerializable;
|
||||
use common::CompositeFile;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::{Directory, RAMDirectory, WritePtr};
|
||||
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use crate::common::bitpacker::BitUnpacker;
|
||||
use crate::common::compute_num_bits;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::directory::{Directory, RAMDirectory, WritePtr};
|
||||
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use crate::DocId;
|
||||
use owning_ref::OwningRef;
|
||||
use schema::Schema;
|
||||
use schema::FAST;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::path::Path;
|
||||
use DocId;
|
||||
|
||||
/// Trait for accessing a fastfield.
|
||||
///
|
||||
@@ -50,6 +50,15 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn into_u64_reader(self) -> FastFieldReader<u64> {
|
||||
FastFieldReader {
|
||||
bit_unpacker: self.bit_unpacker,
|
||||
min_value_u64: self.min_value_u64,
|
||||
max_value_u64: self.max_value_u64,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
|
||||
229  src/fastfield/readers.rs  Normal file
@@ -0,0 +1,229 @@
|
||||
use crate::common::CompositeFile;
|
||||
use crate::fastfield::BytesFastFieldReader;
|
||||
use crate::fastfield::MultiValueIntFastFieldReader;
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastFieldReader};
|
||||
use crate::schema::{Cardinality, Field, FieldType, Schema};
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use crate::Result;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Provides access to all of the fast field readers.
///
/// Internally, `FastFieldReaders` preloads the fast field readers
/// and just wraps several `HashMap`s.
|
||||
pub struct FastFieldReaders {
|
||||
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
|
||||
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
|
||||
fast_field_f64: HashMap<Field, FastFieldReader<f64>>,
|
||||
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
|
||||
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
|
||||
fast_field_f64s: HashMap<Field, MultiValueIntFastFieldReader<f64>>,
|
||||
fast_bytes: HashMap<Field, BytesFastFieldReader>,
|
||||
fast_fields_composite: CompositeFile,
|
||||
}
|
||||
|
||||
enum FastType {
|
||||
I64,
|
||||
U64,
|
||||
F64,
|
||||
}
|
||||
|
||||
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
|
||||
match field_type {
|
||||
FieldType::U64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::U64, cardinality)),
|
||||
FieldType::I64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::I64, cardinality)),
|
||||
FieldType::F64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::F64, cardinality)),
|
||||
FieldType::HierarchicalFacet => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldReaders {
|
||||
pub(crate) fn load_all(
|
||||
schema: &Schema,
|
||||
fast_fields_composite: &CompositeFile,
|
||||
) -> Result<FastFieldReaders> {
|
||||
let mut fast_field_readers = FastFieldReaders {
|
||||
fast_field_i64: Default::default(),
|
||||
fast_field_u64: Default::default(),
|
||||
fast_field_f64: Default::default(),
|
||||
fast_field_i64s: Default::default(),
|
||||
fast_field_u64s: Default::default(),
|
||||
fast_field_f64s: Default::default(),
|
||||
fast_bytes: Default::default(),
|
||||
fast_fields_composite: fast_fields_composite.clone(),
|
||||
};
|
||||
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||
let field = Field(field_id as u32);
|
||||
let field_type = field_entry.field_type();
|
||||
if field_type == &FieldType::Bytes {
|
||||
let idx_reader = fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let data = fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
fast_field_readers
|
||||
.fast_bytes
|
||||
.insert(field, BytesFastFieldReader::open(idx_reader, data));
|
||||
} else if let Some((fast_type, cardinality)) = type_and_cardinality(field_type) {
|
||||
match cardinality {
|
||||
Cardinality::SingleValue => {
|
||||
if let Some(fast_field_data) = fast_fields_composite.open_read(field) {
|
||||
match fast_type {
|
||||
FastType::U64 => {
|
||||
let fast_field_reader = FastFieldReader::open(fast_field_data);
|
||||
fast_field_readers
|
||||
.fast_field_u64
|
||||
.insert(field, fast_field_reader);
|
||||
}
|
||||
FastType::I64 => {
|
||||
fast_field_readers.fast_field_i64.insert(
|
||||
field,
|
||||
FastFieldReader::open(fast_field_data.clone()),
|
||||
);
|
||||
}
|
||||
FastType::F64 => {
|
||||
fast_field_readers.fast_field_f64.insert(
|
||||
field,
|
||||
FastFieldReader::open(fast_field_data.clone()),
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||
}
|
||||
}
|
||||
Cardinality::MultiValues => {
|
||||
let idx_opt = fast_fields_composite.open_read_with_idx(field, 0);
|
||||
let data_opt = fast_fields_composite.open_read_with_idx(field, 1);
|
||||
if let (Some(fast_field_idx), Some(fast_field_data)) = (idx_opt, data_opt) {
|
||||
let idx_reader = FastFieldReader::open(fast_field_idx);
|
||||
match fast_type {
|
||||
FastType::I64 => {
|
||||
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||
let multivalued_int_fast_field =
|
||||
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||
fast_field_readers
|
||||
.fast_field_i64s
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
FastType::U64 => {
|
||||
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||
let multivalued_int_fast_field =
|
||||
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||
fast_field_readers
|
||||
.fast_field_u64s
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
FastType::F64 => {
|
||||
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||
let multivalued_int_fast_field =
|
||||
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||
fast_field_readers
|
||||
.fast_field_f64s
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(fast_field_readers)
|
||||
}
|
||||
|
||||
pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
|
||||
self.fast_fields_composite.space_usage()
|
||||
}
|
||||
|
||||
/// Returns the `u64` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 fast field, this method returns `None`.
|
||||
pub fn u64(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||
self.fast_field_u64.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// If the field is a u64 fast field, returns the associated reader.
/// If the field is an i64 fast field, returns the associated u64 reader: values are
/// mapped from i64 to u64 using the (unique) monotonic mapping.
///
/// TODO: should it also be lenient with f64?
///
/// This method is useful when merging segment readers.
|
||||
pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||
if let Some(u64_ff_reader) = self.u64(field) {
|
||||
return Some(u64_ff_reader);
|
||||
}
|
||||
if let Some(i64_ff_reader) = self.i64(field) {
|
||||
return Some(i64_ff_reader.into_u64_reader());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns `None`.
|
||||
pub fn i64(&self, field: Field) -> Option<FastFieldReader<i64>> {
|
||||
self.fast_field_i64.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns the `f64` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a f64 fast field, this method returns `None`.
|
||||
pub fn f64(&self, field: Field) -> Option<FastFieldReader<f64>> {
|
||||
self.fast_field_f64.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns a `u64s` multi-valued fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 multi-valued fast field, this method returns `None`.
|
||||
pub fn u64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
|
||||
self.fast_field_u64s.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// If the field is a u64s fast field, returns the associated reader.
/// If the field is an i64s fast field, returns the associated u64s reader: values are
/// mapped from i64 to u64 using the (unique) monotonic mapping.
///
/// This method is useful when merging segment readers.
|
||||
pub(crate) fn u64s_lenient(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
|
||||
if let Some(u64s_ff_reader) = self.u64s(field) {
|
||||
return Some(u64s_ff_reader);
|
||||
}
|
||||
if let Some(i64s_ff_reader) = self.i64s(field) {
|
||||
return Some(i64s_ff_reader.into_u64s_reader());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns an `i64s` multi-valued fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 multi-valued fast field, this method returns `None`.
|
||||
pub fn i64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<i64>> {
|
||||
self.fast_field_i64s.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns an `f64s` multi-valued fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a f64 multi-valued fast field, this method returns `None`.
|
||||
pub fn f64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<f64>> {
|
||||
self.fast_field_f64s.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns the `bytes` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a bytes fast field, returns `None`.
|
||||
pub fn bytes(&self, field: Field) -> Option<BytesFastFieldReader> {
|
||||
self.fast_bytes.get(&field).cloned()
|
||||
}
|
||||
}
|
||||
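To illustrate the lookup pattern used by the new `FastFieldReaders` (a hedged sketch with toy types, not tantivy's real readers): typed per-field maps, plus a "lenient" accessor that falls back from the u64 map to the i64 map by reinterpreting values through the shared, order-preserving u64 representation.

use std::collections::HashMap;

#[derive(Clone, Copy, PartialEq, Eq, Hash)]
struct Field(u32);

#[derive(Clone, Debug, PartialEq)]
struct U64Reader {
    vals: Vec<u64>,
}

#[derive(Clone)]
struct I64Reader {
    vals: Vec<i64>,
}

impl I64Reader {
    // i64 -> u64 through the usual order-preserving sign-bit flip.
    fn into_u64_reader(self) -> U64Reader {
        U64Reader {
            vals: self.vals.into_iter().map(|v| (v as u64) ^ (1 << 63)).collect(),
        }
    }
}

#[derive(Default)]
struct ToyFastFieldReaders {
    u64_readers: HashMap<Field, U64Reader>,
    i64_readers: HashMap<Field, I64Reader>,
}

impl ToyFastFieldReaders {
    // Prefer the u64 reader; otherwise reinterpret the i64 reader.
    fn u64_lenient(&self, field: Field) -> Option<U64Reader> {
        if let Some(reader) = self.u64_readers.get(&field) {
            return Some(reader.clone());
        }
        self.i64_readers
            .get(&field)
            .cloned()
            .map(I64Reader::into_u64_reader)
    }
}

fn main() {
    let mut readers = ToyFastFieldReaders::default();
    readers.i64_readers.insert(Field(0), I64Reader { vals: vec![-1, 0, 1] });
    let as_u64 = readers.u64_lenient(Field(0)).unwrap();
    assert_eq!(as_u64.vals.len(), 3);
}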
@@ -1,10 +1,10 @@
|
||||
use common::bitpacker::BitPacker;
|
||||
use common::compute_num_bits;
|
||||
use common::BinarySerializable;
|
||||
use common::CompositeWrite;
|
||||
use common::CountingWriter;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use crate::common::bitpacker::BitPacker;
|
||||
use crate::common::compute_num_bits;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeWrite;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
@@ -45,7 +45,7 @@ impl FastFieldSerializer {
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
|
||||
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
|
||||
}
|
||||
|
||||
@@ -56,7 +56,7 @@ impl FastFieldSerializer {
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
idx: usize,
|
||||
) -> io::Result<FastSingleFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastSingleFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
@@ -66,7 +66,7 @@ impl FastFieldSerializer {
|
||||
&mut self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> io::Result<FastBytesFieldSerializer<CountingWriter<WritePtr>>> {
|
||||
) -> io::Result<FastBytesFieldSerializer<'_, CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastBytesFieldSerializer::open(field_write)
|
||||
}
|
||||
@@ -79,7 +79,7 @@ impl FastFieldSerializer {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
|
||||
pub struct FastSingleFieldSerializer<'a, W: Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
@@ -127,7 +127,7 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastBytesFieldSerializer<'a, W: Write + 'a> {
|
||||
pub struct FastBytesFieldSerializer<'a, W: Write> {
|
||||
write: &'a mut W,
|
||||
}
|
||||
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
use super::multivalued::MultiValueIntFastFieldWriter;
|
||||
use common;
|
||||
use common::BinarySerializable;
|
||||
use common::VInt;
|
||||
use fastfield::{BytesFastFieldWriter, FastFieldSerializer};
|
||||
use postings::UnorderedTermId;
|
||||
use schema::{Cardinality, Document, Field, FieldType, Schema};
|
||||
use crate::common;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::VInt;
|
||||
use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Cardinality, Document, Field, FieldType, Schema};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use termdict::TermOrdinal;
|
||||
|
||||
/// The `FastFieldsWriter` groups all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
@@ -25,13 +25,13 @@ impl FastFieldsWriter {
|
||||
|
||||
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||
let field = Field(field_id as u32);
|
||||
let default_value = if let FieldType::I64(_) = *field_entry.field_type() {
|
||||
common::i64_to_u64(0i64)
|
||||
} else {
|
||||
0u64
|
||||
let default_value = match *field_entry.field_type() {
|
||||
FieldType::I64(_) => common::i64_to_u64(0i64),
|
||||
FieldType::F64(_) => common::f64_to_u64(0.0f64),
|
||||
_ => 0u64,
|
||||
};
|
||||
match *field_entry.field_type() {
|
||||
FieldType::I64(ref int_options) | FieldType::U64(ref int_options) => {
|
||||
FieldType::I64(ref int_options) | FieldType::U64(ref int_options) | FieldType::F64(ref int_options) => {
|
||||
match int_options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let mut fast_field_writer = IntFastFieldWriter::new(field);
|
||||
@@ -142,9 +142,9 @@ impl FastFieldsWriter {
|
||||
/// bitpacked and the number of bits required for bitpacking
|
||||
/// can only be known once we have seen all of the values.
|
||||
///
|
||||
/// Both u64, and i64 use the same writer.
|
||||
/// i64 are just remapped to the `0..2^64 - 1`
|
||||
/// using `common::i64_to_u64`.
|
||||
/// Both u64, i64 and f64 use the same writer.
|
||||
/// i64 and f64 are just remapped to the `0..2^64 - 1`
|
||||
/// using `common::i64_to_u64` and `common::f64_to_u64`.
|
||||
pub struct IntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u8>,
|
||||
@@ -203,8 +203,8 @@ impl IntFastFieldWriter {
|
||||
/// Extract the value associated to the fast field for
|
||||
/// this document.
|
||||
///
|
||||
/// i64 are remapped to u64 using the logic
|
||||
/// in `common::i64_to_u64`.
|
||||
/// i64 and f64 are remapped to u64 using the logic
|
||||
/// in `common::i64_to_u64` and `common::f64_to_u64`.
|
||||
///
|
||||
/// If the value is missing, then the default value is used
|
||||
/// instead.
|
||||
|
||||
@@ -10,28 +10,263 @@ pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
.unwrap_or_else(|idx| idx - 1) as u8
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::unreadable_literal))]
pub const FIELD_NORMS_TABLE: [u32; 256] = [
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54,
    56, 60, 64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144,
    152, 168, 184, 200, 216, 232, 248, 264, 280, 312, 344, 376, 408, 440, 472, 504,
    536, 600, 664, 728, 792, 856, 920, 984, 1_048, 1_176, 1_304, 1_432, 1_560, 1_688, 1_816, 1_944,
    2_072, 2_328, 2_584, 2_840, 3_096, 3_352, 3_608, 3_864, 4_120, 4_632, 5_144, 5_656, 6_168, 6_680, 7_192, 7_704,
    8_216, 9_240, 10_264, 11_288, 12_312, 13_336, 14_360, 15_384, 16_408, 18_456, 20_504, 22_552, 24_600, 26_648, 28_696, 30_744,
    32_792, 36_888, 40_984, 45_080, 49_176, 53_272, 57_368, 61_464, 65_560, 73_752, 81_944, 90_136, 98_328, 106_520, 114_712, 122_904,
    131_096, 147_480, 163_864, 180_248, 196_632, 213_016, 229_400, 245_784, 262_168, 294_936, 327_704, 360_472, 393_240, 426_008, 458_776, 491_544,
    524_312, 589_848, 655_384, 720_920, 786_456, 851_992, 917_528, 983_064, 1_048_600, 1_179_672, 1_310_744, 1_441_816, 1_572_888, 1_703_960, 1_835_032, 1_966_104,
    2_097_176, 2_359_320, 2_621_464, 2_883_608, 3_145_752, 3_407_896, 3_670_040, 3_932_184, 4_194_328, 4_718_616, 5_242_904, 5_767_192, 6_291_480, 6_815_768, 7_340_056, 7_864_344,
    8_388_632, 9_437_208, 10_485_784, 11_534_360, 12_582_936, 13_631_512, 14_680_088, 15_728_664, 16_777_240, 18_874_392, 20_971_544, 23_068_696, 25_165_848, 27_263_000, 29_360_152, 31_457_304,
    33_554_456, 37_748_760, 41_943_064, 46_137_368, 50_331_672, 54_525_976, 58_720_280, 62_914_584, 67_108_888, 75_497_496, 83_886_104, 92_274_712, 100_663_320, 109_051_928, 117_440_536, 125_829_144,
    134_217_752, 150_994_968, 167_772_184, 184_549_400, 201_326_616, 218_103_832, 234_881_048, 251_658_264, 268_435_480, 301_989_912, 335_544_344, 369_098_776, 402_653_208, 436_207_640, 469_762_072, 503_316_504,
    536_870_936, 603_979_800, 671_088_664, 738_197_528, 805_306_392, 872_415_256, 939_524_120, 1_006_632_984, 1_073_741_848, 1_207_959_576, 1_342_177_304, 1_476_395_032, 1_610_612_760, 1_744_830_488, 1_879_048_216, 2_013_265_944,
];
|
||||
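A small sketch of how a precomputed table like `FIELD_NORMS_TABLE` is used together with `fieldnorm_to_id` above: the id of a fieldnorm is the index of the largest table entry not exceeding it, and decoding is a plain table lookup (toy 8-entry table here instead of the real 256-entry one).

const TOY_TABLE: [u32; 8] = [0, 1, 2, 3, 4, 6, 8, 12];

fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
    TOY_TABLE
        .binary_search(&fieldnorm)
        .unwrap_or_else(|idx| idx - 1) as u8
}

fn id_to_fieldnorm(id: u8) -> u32 {
    TOY_TABLE[id as usize]
}

fn main() {
    assert_eq!(fieldnorm_to_id(5), 4);                  // 4 is the largest entry <= 5
    assert_eq!(id_to_fieldnorm(fieldnorm_to_id(5)), 4); // lossy round-trip
}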
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use super::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::DocId;
|
||||
|
||||
/// Reads the fieldnorm associated to a document.
|
||||
/// The fieldnorm represents the length associated to
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use common::CompositeWrite;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use crate::common::CompositeWrite;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use DocId;
|
||||
use crate::DocId;
|
||||
|
||||
use super::fieldnorm_to_id;
|
||||
use super::FieldNormsSerializer;
|
||||
use schema::Field;
|
||||
use schema::Schema;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::Schema;
|
||||
use std::io;
|
||||
|
||||
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use rand::thread_rng;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use crate::schema::*;
|
||||
use crate::Index;
|
||||
use crate::Searcher;
|
||||
use rand::Rng;
|
||||
use schema::*;
|
||||
use Index;
|
||||
use Searcher;
|
||||
|
||||
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
assert!(searcher.segment_readers().len() < 20);
|
||||
@@ -13,7 +13,6 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use super::operation::DeleteOperation;
|
||||
use crate::Opstamp;
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::{Arc, RwLock};
|
||||
@@ -23,7 +24,7 @@ struct InnerDeleteQueue {
|
||||
last_block: Option<Arc<Block>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
#[derive(Clone)]
|
||||
pub struct DeleteQueue {
|
||||
inner: Arc<RwLock<InnerDeleteQueue>>,
|
||||
}
|
||||
@@ -36,6 +37,7 @@ impl DeleteQueue {
|
||||
};
|
||||
|
||||
let next_block = NextBlock::from(delete_queue.clone());
|
||||
|
||||
{
|
||||
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
|
||||
delete_queue_wlock.last_block = Some(Arc::new(Block {
|
||||
@@ -184,7 +186,7 @@ impl DeleteCursor {
|
||||
/// queue are consumed and the next get will return None.
|
||||
/// - the next get will return the first operation with an
|
||||
/// `opstamp >= target_opstamp`.
|
||||
pub fn skip_to(&mut self, target_opstamp: u64) {
|
||||
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
|
||||
// TODO Can be optimize as we work with block.
|
||||
while self.is_behind_opstamp(target_opstamp) {
|
||||
self.advance();
|
||||
@@ -192,7 +194,7 @@ impl DeleteCursor {
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
|
||||
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
|
||||
fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
|
||||
self.get()
|
||||
.map(|operation| operation.opstamp < target_opstamp)
|
||||
.unwrap_or(false)
|
||||
@@ -249,7 +251,7 @@ impl DeleteCursor {
|
||||
mod tests {
|
||||
|
||||
use super::{DeleteOperation, DeleteQueue};
|
||||
use schema::{Field, Term};
|
||||
use crate::schema::{Field, Term};
|
||||
|
||||
#[test]
|
||||
fn test_deletequeue() {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::sync::Arc;
|
||||
use DocId;
|
||||
use crate::DocId;
|
||||
use crate::Opstamp;
|
||||
|
||||
// Doc to opstamp is used to identify which
|
||||
// document should be deleted.
|
||||
@@ -17,25 +17,25 @@ use DocId;
|
||||
// This mapping is (for the moment) strictly increasing
// because of the way document ids are allocated.
|
||||
#[derive(Clone)]
|
||||
pub enum DocToOpstampMapping {
|
||||
WithMap(Arc<Vec<u64>>),
|
||||
pub enum DocToOpstampMapping<'a> {
|
||||
WithMap(&'a [Opstamp]),
|
||||
None,
|
||||
}
|
||||
|
||||
impl From<Vec<u64>> for DocToOpstampMapping {
|
||||
fn from(opstamps: Vec<u64>) -> DocToOpstampMapping {
|
||||
DocToOpstampMapping::WithMap(Arc::new(opstamps))
|
||||
impl<'a> From<&'a [u64]> for DocToOpstampMapping<'a> {
|
||||
fn from(opstamps: &[Opstamp]) -> DocToOpstampMapping {
|
||||
DocToOpstampMapping::WithMap(opstamps)
|
||||
}
|
||||
}
|
||||
|
||||
impl DocToOpstampMapping {
|
||||
impl<'a> DocToOpstampMapping<'a> {
|
||||
/// Given an opstamp, returns the limit doc id L
/// such that for every doc id D,
// D >= L iff opstamp(D) >= `target_opstamp`.
//
// The edge case where the target opstamp equals some doc opstamp is in practice
// never hit.
|
||||
pub fn compute_doc_limit(&self, target_opstamp: u64) -> DocId {
|
||||
pub fn compute_doc_limit(&self, target_opstamp: Opstamp) -> DocId {
|
||||
match *self {
|
||||
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
|
||||
match doc_opstamps.binary_search(&target_opstamp) {
|
||||
@@ -64,17 +64,18 @@ mod tests {
|
||||
#[test]
|
||||
fn test_doc_to_opstamp_mapping_complex() {
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![]);
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(&[][..]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0);
|
||||
}
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64]);
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(&[1u64][..]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1);
|
||||
}
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64, 12u64, 17u64, 23u64]);
|
||||
let doc_to_opstamp_mapping =
|
||||
DocToOpstampMapping::from(&[1u64, 12u64, 17u64, 23u64][..]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
for i in 2u64..13u64 {
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1);
|
||||
|
||||
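A self-contained sketch of the doc-limit computation exercised by the tests above (assuming, as the comments state, that doc opstamps are increasing): binary-search the target opstamp in the per-doc opstamp slice and take the insertion point as the limit doc id. The actual tantivy implementation may differ in details.

fn compute_doc_limit(doc_opstamps: &[u64], target_opstamp: u64) -> u32 {
    // Both the "found" and "not found" positions give the first doc whose
    // opstamp is >= target_opstamp.
    match doc_opstamps.binary_search(&target_opstamp) {
        Ok(doc) | Err(doc) => doc as u32,
    }
}

fn main() {
    let opstamps = [1u64, 12, 17, 23];
    assert_eq!(compute_doc_limit(&opstamps, 0), 0);
    assert_eq!(compute_doc_limit(&opstamps, 2), 1);
    assert_eq!(compute_doc_limit(&opstamps, 24), 4);
}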
@@ -1,36 +1,38 @@
|
||||
use super::operation::{AddOperation, UserOperation};
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use crate::core::Index;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::directory::DirectoryLock;
|
||||
use crate::docset::DocSet;
|
||||
use crate::error::TantivyError;
|
||||
use crate::fastfield::write_delete_bitset;
|
||||
use crate::indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
use crate::indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||
use crate::indexer::operation::DeleteOperation;
|
||||
use crate::indexer::stamper::Stamper;
|
||||
use crate::indexer::MergePolicy;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::indexer::SegmentWriter;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
use crate::Opstamp;
|
||||
use crate::Result;
|
||||
use bit_set::BitSet;
|
||||
use core::Index;
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use crossbeam::channel;
|
||||
use directory::DirectoryLock;
|
||||
use docset::DocSet;
|
||||
use error::TantivyError;
|
||||
use fastfield::write_delete_bitset;
|
||||
use futures::{Canceled, Future};
|
||||
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||
use indexer::operation::DeleteOperation;
|
||||
use indexer::stamper::Stamper;
|
||||
use indexer::MergePolicy;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentWriter;
|
||||
use postings::compute_table_size;
|
||||
use schema::Document;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use smallvec::smallvec;
|
||||
use smallvec::SmallVec;
|
||||
use std::mem;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use Result;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
// in the heap goes below MARGIN_IN_BYTES.
|
||||
@@ -44,29 +46,15 @@ pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
|
||||
type OperationSender = channel::Sender<Vec<AddOperation>>;
|
||||
type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
/// - the hash table "table" itself.
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
||||
assert!(per_thread_memory_budget > 1_000);
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
if let Some(limit) = (1..)
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
{
|
||||
limit.min(19) // we cap it at 2^19 = 512K.
|
||||
} else {
|
||||
unreachable!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
);
|
||||
}
|
||||
}
|
||||
// Group of operations.
|
||||
// Most of the time, users will send operation one-by-one, but it can be useful to
|
||||
// send them as a small block to ensure that
|
||||
// - all docs in the operation will happen on the same segment and continuous docids.
|
||||
// - all operations in the group are committed at the same time, making the group
|
||||
// atomic.
|
||||
type OperationGroup = SmallVec<[AddOperation; 4]>;
|
||||
type OperationSender = channel::Sender<OperationGroup>;
|
||||
type OperationReceiver = channel::Receiver<OperationGroup>;
|
||||
|
||||
/// `IndexWriter` is the user entry-point to add document to an index.
|
||||
///
|
||||
@@ -94,121 +82,44 @@ pub struct IndexWriter {
|
||||
|
||||
num_threads: usize,
|
||||
|
||||
generation: usize,
|
||||
|
||||
delete_queue: DeleteQueue,
|
||||
|
||||
stamper: Stamper,
|
||||
committed_opstamp: u64,
|
||||
committed_opstamp: Opstamp,
|
||||
}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
///
|
||||
/// The lockfile should be deleted on drop, but it is possible
|
||||
/// that due to a panic or other error, a stale lockfile will be
|
||||
/// left in the index directory. If you are sure that no other
|
||||
/// `IndexWriter` on the system is accessing the index directory,
|
||||
/// it is safe to manually delete the lockfile.
|
||||
///
|
||||
/// `num_threads` specifies the number of indexing workers that
|
||||
/// should work at the same time.
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn open_index_writer(
|
||||
index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> Result<IndexWriter> {
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
let err_msg = format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_MIN
|
||||
);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
|
||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let current_opstamp = index.load_metas()?.opstamp;
|
||||
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
|
||||
let segment_updater =
|
||||
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||
|
||||
let mut index_writer = IndexWriter {
|
||||
_directory_lock: Some(directory_lock),
|
||||
|
||||
heap_size_in_bytes_per_thread,
|
||||
index: index.clone(),
|
||||
|
||||
operation_receiver: document_receiver,
|
||||
operation_sender: document_sender,
|
||||
|
||||
segment_updater,
|
||||
|
||||
workers_join_handle: vec![],
|
||||
num_threads,
|
||||
|
||||
delete_queue,
|
||||
|
||||
committed_opstamp: current_opstamp,
|
||||
stamper,
|
||||
|
||||
generation: 0,
|
||||
|
||||
worker_id: 0,
|
||||
};
|
||||
index_writer.start_workers()?;
|
||||
Ok(index_writer)
|
||||
}
|
||||
|
||||
pub fn compute_deleted_bitset(
|
||||
fn compute_deleted_bitset(
|
||||
delete_bitset: &mut BitSet,
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &DocToOpstampMapping,
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
) -> Result<bool> {
|
||||
let mut might_have_changed = false;
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))]
|
||||
loop {
|
||||
if let Some(delete_op) = delete_cursor.get() {
|
||||
if delete_op.opstamp > target_opstamp {
|
||||
break;
|
||||
} else {
|
||||
// A delete operation should only affect
|
||||
// document that were inserted after it.
|
||||
//
|
||||
// Limit doc helps identify the first document
|
||||
// that may be affected by the delete operation.
|
||||
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
|
||||
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
|
||||
if let Some(mut docset) =
|
||||
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)
|
||||
{
|
||||
while docset.advance() {
|
||||
let deleted_doc = docset.doc();
|
||||
if deleted_doc < limit_doc {
|
||||
delete_bitset.insert(deleted_doc as usize);
|
||||
might_have_changed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
while let Some(delete_op) = delete_cursor.get() {
|
||||
if delete_op.opstamp > target_opstamp {
|
||||
break;
|
||||
}
|
||||
|
||||
// A delete operation should only affect
|
||||
// document that were inserted after it.
|
||||
//
|
||||
// Limit doc helps identify the first document
|
||||
// that may be affected by the delete operation.
|
||||
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
|
||||
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
|
||||
if let Some(mut docset) =
|
||||
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)
|
||||
{
|
||||
while docset.advance() {
|
||||
let deleted_doc = docset.doc();
|
||||
if deleted_doc < limit_doc {
|
||||
delete_bitset.insert(deleted_doc as usize);
|
||||
might_have_changed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete_cursor.advance();
|
||||
}
|
||||
Ok(might_have_changed)
|
||||
@@ -216,10 +127,10 @@ pub fn compute_deleted_bitset(
|
||||
|
||||
/// Advance delete for the given segment up
|
||||
/// to the target opstamp.
|
||||
pub fn advance_deletes(
|
||||
pub(crate) fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
) -> Result<()> {
|
||||
{
|
||||
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
|
||||
@@ -228,8 +139,8 @@ pub fn advance_deletes(
|
||||
}
|
||||
|
||||
let segment_reader = SegmentReader::open(&segment)?;
|
||||
let max_doc = segment_reader.max_doc();
|
||||
|
||||
let max_doc = segment_reader.max_doc();
|
||||
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
|
||||
Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
|
||||
None => BitSet::with_capacity(max_doc as usize),
|
||||
@@ -266,17 +177,15 @@ pub fn advance_deletes(
|
||||
fn index_documents(
|
||||
memory_budget: usize,
|
||||
segment: &Segment,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = Vec<AddOperation>>,
|
||||
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
let schema = segment.schema();
|
||||
let segment_id = segment.id();
|
||||
let table_size = initial_table_size(memory_budget);
|
||||
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
|
||||
for documents in document_iterator {
|
||||
for doc in documents {
|
||||
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
|
||||
for document_group in grouped_document_iterator {
|
||||
for doc in document_group {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
}
|
||||
let mem_usage = segment_writer.mem_usage();
|
||||
@@ -299,39 +208,120 @@ fn index_documents(
|
||||
// the worker thread.
|
||||
assert!(num_docs > 0);
|
||||
|
||||
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
|
||||
let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
|
||||
let segment_meta = segment
|
||||
.index()
|
||||
.inventory()
|
||||
.new_segment_meta(segment_id, num_docs);
|
||||
|
||||
let segment_meta = SegmentMeta::new(segment_id, num_docs);
|
||||
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
|
||||
|
||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
||||
let delete_bitset_opt =
|
||||
apply_deletes(&segment, &mut delete_cursor, &doc_opstamps, last_docstamp)?;
|
||||
|
||||
let delete_bitset_opt = if delete_cursor.get().is_some() {
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
|
||||
Ok(segment_updater.add_segment(segment_entry))
|
||||
}
|
||||
|
||||
fn apply_deletes(
|
||||
segment: &Segment,
|
||||
mut delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &[Opstamp],
|
||||
last_docstamp: Opstamp,
|
||||
) -> Result<Option<BitSet<u32>>> {
|
||||
if delete_cursor.get().is_none() {
|
||||
// if there are no delete operation in the queue, no need
|
||||
// to even open the segment.
|
||||
return Ok(None);
|
||||
}
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let mut deleted_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
Ok(if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
|
||||
Ok(segment_updater.add_segment(generation, segment_entry))
|
||||
})
|
||||
}
|
||||
|
||||
impl IndexWriter {
|
||||
/// The index writer
|
||||
/// Create a new index writer. Attempts to acquire a lockfile.
|
||||
///
|
||||
/// The lockfile should be deleted on drop, but it is possible
|
||||
/// that due to a panic or other error, a stale lockfile will be
|
||||
/// left in the index directory. If you are sure that no other
|
||||
/// `IndexWriter` on the system is accessing the index directory,
|
||||
/// it is safe to manually delete the lockfile.
|
||||
///
|
||||
/// `num_threads` specifies the number of indexing workers that
|
||||
/// should work at the same time.
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub(crate) fn new(
|
||||
index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> Result<IndexWriter> {
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
let err_msg = format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_MIN
|
||||
);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
|
||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let current_opstamp = index.load_metas()?.opstamp;
|
||||
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
|
||||
let segment_updater =
|
||||
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||
|
||||
let mut index_writer = IndexWriter {
|
||||
_directory_lock: Some(directory_lock),
|
||||
|
||||
heap_size_in_bytes_per_thread,
|
||||
index: index.clone(),
|
||||
|
||||
operation_receiver: document_receiver,
|
||||
operation_sender: document_sender,
|
||||
|
||||
segment_updater,
|
||||
|
||||
workers_join_handle: vec![],
|
||||
num_threads,
|
||||
|
||||
delete_queue,
|
||||
|
||||
committed_opstamp: current_opstamp,
|
||||
stamper,
|
||||
|
||||
worker_id: 0,
|
||||
};
|
||||
index_writer.start_workers()?;
|
||||
Ok(index_writer)
|
||||
}
|
||||
|
||||
/// If there are some merging threads, blocks until they all finish their work and
|
||||
/// then drop the `IndexWriter`.
|
||||
pub fn wait_merging_threads(mut self) -> Result<()> {
|
||||
// this will stop the indexing thread,
|
||||
// dropping the last reference to the segment_updater.
|
||||
@@ -364,8 +354,7 @@ impl IndexWriter {
|
||||
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
|
||||
let delete_cursor = self.delete_queue.cursor();
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
|
||||
self.segment_updater
|
||||
.add_segment(self.generation, segment_entry);
|
||||
self.segment_updater.add_segment(segment_entry);
|
||||
}
|
||||
|
||||
/// Creates a new segment.
|
||||
@@ -382,22 +371,16 @@ impl IndexWriter {
|
||||
|
||||
/// Spawns a new worker thread for indexing.
|
||||
/// The thread consumes documents from the pipeline.
|
||||
///
|
||||
fn add_indexing_worker(&mut self) -> Result<()> {
|
||||
let document_receiver_clone = self.operation_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
|
||||
let generation = self.generation;
|
||||
|
||||
let mut delete_cursor = self.delete_queue.cursor();
|
||||
|
||||
let mem_budget = self.heap_size_in_bytes_per_thread;
|
||||
let index = self.index.clone();
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!(
|
||||
"thrd-tantivy-index{}-gen{}",
|
||||
self.worker_id, generation
|
||||
))
|
||||
.name(format!("thrd-tantivy-index{}", self.worker_id))
|
||||
.spawn(move || {
|
||||
loop {
|
||||
let mut document_iterator =
|
||||
@@ -426,7 +409,6 @@ impl IndexWriter {
|
||||
index_documents(
|
||||
mem_budget,
|
||||
&segment,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
delete_cursor.clone(),
|
||||
@@ -439,12 +421,12 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
/// Accessor to the merge policy.
|
||||
pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
|
||||
pub fn get_merge_policy(&self) -> Arc<Box<dyn MergePolicy>> {
|
||||
self.segment_updater.get_merge_policy()
|
||||
}
|
||||
|
||||
/// Set the merge policy.
|
||||
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
|
||||
pub fn set_merge_policy(&self, merge_policy: Box<dyn MergePolicy>) {
|
||||
self.segment_updater.set_merge_policy(merge_policy);
|
||||
}
|
||||
|
||||
@@ -458,7 +440,53 @@ impl IndexWriter {
|
||||
/// Detects and removes the files that
|
||||
/// are not used by the index anymore.
|
||||
pub fn garbage_collect_files(&mut self) -> Result<()> {
|
||||
self.segment_updater.garbage_collect_files()
|
||||
self.segment_updater.garbage_collect_files().wait()
|
||||
}
|
||||
|
||||
/// Deletes all documents from the index
|
||||
///
|
||||
/// Requires `commit`ing
|
||||
/// Enables users to rebuild the index,
|
||||
/// by clearing and resubmitting necessary documents
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::schema::*;
|
||||
/// use tantivy::Index;
|
||||
///
|
||||
/// fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||
/// let schema = schema_builder.build();
|
||||
///
|
||||
/// let index = Index::create_in_ram(schema.clone());
|
||||
///
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
|
||||
/// index_writer.add_document(doc!(title => "The modern Promotheus"));
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let clear_res = index_writer.delete_all_documents().unwrap();
|
||||
/// // have to commit, otherwise deleted terms remain available
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let searcher = index.reader()?.searcher();
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query_promo = query_parser.parse_query("Promotheus")?;
|
||||
/// let top_docs_promo = searcher.search(&query_promo, &TopDocs::with_limit(1))?;
|
||||
///
|
||||
/// assert!(top_docs_promo.is_empty());
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub fn delete_all_documents(&mut self) -> Result<Opstamp> {
|
||||
// Delete segments
|
||||
self.segment_updater.remove_all_segments();
|
||||
// Return new stamp - reverted stamp
|
||||
self.stamper.revert(self.committed_opstamp);
|
||||
Ok(self.committed_opstamp)
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
@@ -488,19 +516,22 @@ impl IndexWriter {
|
||||
|
||||
/// Rollback to the last commit
|
||||
///
|
||||
/// This cancels all of the update that
|
||||
/// happened before after the last commit.
|
||||
/// This cancels all of the updates that
|
||||
/// happened after the last commit.
|
||||
/// After calling rollback, the index is in the same
|
||||
/// state as it was after the last commit.
|
||||
///
|
||||
/// The opstamp at the last commit is returned.
|
||||
pub fn rollback(&mut self) -> Result<()> {
|
||||
pub fn rollback(&mut self) -> Result<Opstamp> {
|
||||
info!("Rolling back to opstamp {}", self.committed_opstamp);
|
||||
self.rollback_impl()
|
||||
}
|
||||
|
||||
/// Private, implementation of rollback
|
||||
fn rollback_impl(&mut self) -> Result<Opstamp> {
|
||||
// marks the segment updater as killed. From now on, all
|
||||
// segment updates will be ignored.
|
||||
self.segment_updater.kill();
|
||||
|
||||
let document_receiver = self.operation_receiver.clone();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
@@ -509,7 +540,7 @@ impl IndexWriter {
|
||||
.take()
|
||||
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
|
||||
|
||||
let new_index_writer: IndexWriter = open_index_writer(
|
||||
let new_index_writer: IndexWriter = IndexWriter::new(
|
||||
&self.index,
|
||||
self.num_threads,
|
||||
self.heap_size_in_bytes_per_thread,
|
||||
@@ -527,9 +558,9 @@ impl IndexWriter {
|
||||
//
|
||||
// This will reach an end as the only document_sender
|
||||
// was dropped with the index_writer.
|
||||
for _ in document_receiver.clone() {}
|
||||
for _ in document_receiver {}
|
||||
|
||||
Ok(())
|
||||
Ok(self.committed_opstamp)
|
||||
}
|
||||
|
||||
/// Prepares a commit.
|
||||
@@ -553,21 +584,21 @@ impl IndexWriter {
|
||||
/// It is also possible to add a payload to the `commit`
|
||||
/// using this API.
|
||||
/// See [`PreparedCommit::set_payload()`](PreparedCommit.html)
|
||||
pub fn prepare_commit(&mut self) -> Result<PreparedCommit> {
|
||||
pub fn prepare_commit(&mut self) -> Result<PreparedCommit<'_>> {
|
||||
// Here, because we join all of the worker threads,
|
||||
// all of the segment update for this commit have been
|
||||
// sent.
|
||||
//
|
||||
// No document belonging to the next generation have been
|
||||
// No document belonging to the next commit have been
|
||||
// pushed too, because add_document can only happen
|
||||
// on this thread.
|
||||
|
||||
//
|
||||
// This will move uncommitted segments to the state of
|
||||
// committed segments.
|
||||
info!("Preparing commit");
|
||||
|
||||
// this will drop the current document channel
|
||||
// and recreate a new one channels.
|
||||
// and recreate a new one.
|
||||
self.recreate_document_channel();
|
||||
|
||||
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
|
||||
@@ -577,7 +608,6 @@ impl IndexWriter {
|
||||
.join()
|
||||
.map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
|
||||
indexing_worker_result?;
|
||||
// add a new worker for the next generation.
|
||||
self.add_indexing_worker()?;
|
||||
}
|
||||
|
||||
@@ -601,7 +631,7 @@ impl IndexWriter {
|
||||
/// Commit returns the `opstamp` of the last document
|
||||
/// that made it in the commit.
|
||||
///
|
||||
pub fn commit(&mut self) -> Result<u64> {
|
||||
pub fn commit(&mut self) -> Result<Opstamp> {
|
||||
self.prepare_commit()?.commit()
|
||||
}
|
||||
|
||||
@@ -617,7 +647,7 @@ impl IndexWriter {
|
||||
///
|
||||
/// Like adds, the deletion itself will be visible
|
||||
/// only after calling `commit()`.
|
||||
pub fn delete_term(&mut self, term: Term) -> u64 {
|
||||
pub fn delete_term(&self, term: Term) -> Opstamp {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let delete_operation = DeleteOperation { opstamp, term };
|
||||
self.delete_queue.push(delete_operation);
|
||||
@@ -631,7 +661,7 @@ impl IndexWriter {
|
||||
///
|
||||
/// This is also the opstamp of the commit that is currently
|
||||
/// available for searchers.
|
||||
pub fn commit_opstamp(&self) -> u64 {
|
||||
pub fn commit_opstamp(&self) -> Opstamp {
|
||||
self.committed_opstamp
|
||||
}
|
||||
|
||||
@@ -645,10 +675,10 @@ impl IndexWriter {
|
||||
///
|
||||
/// Currently it represents the number of documents that
|
||||
/// have been added since the creation of the index.
|
||||
pub fn add_document(&mut self, document: Document) -> u64 {
|
||||
pub fn add_document(&self, document: Document) -> Opstamp {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let add_operation = AddOperation { opstamp, document };
|
||||
let send_result = self.operation_sender.send(vec![add_operation]);
|
||||
let send_result = self.operation_sender.send(smallvec![add_operation]);
|
||||
if let Err(e) = send_result {
|
||||
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
|
||||
}
|
||||
@@ -662,7 +692,7 @@ impl IndexWriter {
|
||||
/// The total number of stamps generated by this method is `count + 1`;
|
||||
/// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
|
||||
/// is for the batch itself.
|
||||
fn get_batch_opstamps(&mut self, count: u64) -> (u64, Range<u64>) {
|
||||
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
|
||||
let Range { start, end } = self.stamper.stamps(count + 1u64);
|
||||
let last_opstamp = end - 1;
|
||||
let stamps = Range {
|
||||
@@ -688,14 +718,14 @@ impl IndexWriter {
|
||||
/// Like adds and deletes (see `IndexWriter.add_document` and
|
||||
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
|
||||
/// visible to readers only after calling `commit()`.
|
||||
pub fn run(&mut self, user_operations: Vec<UserOperation>) -> u64 {
|
||||
pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
|
||||
let count = user_operations.len() as u64;
|
||||
if count == 0 {
|
||||
return self.stamper.stamp();
|
||||
}
|
||||
let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
|
||||
|
||||
let mut adds: Vec<AddOperation> = Vec::new();
|
||||
let mut adds = OperationGroup::default();
|
||||
|
||||
for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
|
||||
match user_op {
|
||||
@@ -722,16 +752,16 @@ impl IndexWriter {
|
||||
mod tests {
|
||||
|
||||
use super::super::operation::UserOperation;
|
||||
use super::initial_table_size;
|
||||
use collector::TopDocs;
|
||||
use directory::error::LockError;
|
||||
use error::*;
|
||||
use indexer::NoMergePolicy;
|
||||
use query::TermQuery;
|
||||
use schema::{self, IndexRecordOption};
|
||||
use Index;
|
||||
use ReloadPolicy;
|
||||
use Term;
|
||||
use crate::collector::TopDocs;
|
||||
use crate::directory::error::LockError;
|
||||
use crate::error::*;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::{self, IndexRecordOption};
|
||||
use crate::Index;
|
||||
use crate::ReloadPolicy;
|
||||
use crate::Term;
|
||||
use fail;
|
||||
|
||||
#[test]
|
||||
fn test_operations_group() {
|
||||
@@ -739,7 +769,7 @@ mod tests {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let operations = vec![
|
||||
UserOperation::Add(doc!(text_field=>"a")),
|
||||
UserOperation::Add(doc!(text_field=>"b")),
|
||||
@@ -801,7 +831,7 @@ mod tests {
|
||||
fn test_empty_operations_group() {
|
||||
let schema_builder = schema::Schema::builder();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
let index_writer = index.writer(3_000_000).unwrap();
|
||||
let operations1 = vec![];
|
||||
let batch_opstamp1 = index_writer.run(operations1);
|
||||
assert_eq!(batch_opstamp1, 0u64);
|
||||
@@ -1015,37 +1045,143 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(initial_table_size(100_000), 11);
|
||||
assert_eq!(initial_table_size(1_000_000), 14);
|
||||
assert_eq!(initial_table_size(10_000_000), 17);
|
||||
assert_eq!(initial_table_size(1_000_000_000), 19);
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "no_fail"))]
|
||||
#[test]
|
||||
fn test_write_commit_fails() {
|
||||
use fail;
|
||||
fn test_add_then_delete_all_documents() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let num_docs_containing = |s: &str| {
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let term = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term)
|
||||
};
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
|
||||
let commit_tstamp = index_writer.commit().unwrap();
|
||||
assert!(commit_tstamp > add_tstamp);
|
||||
index_writer.delete_all_documents().unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
|
||||
|
||||
// Search for documents with the same term that we added
|
||||
assert_eq!(num_docs_containing("a"), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_all_documents_rollback_correct_stamp() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
|
||||
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
|
||||
|
||||
// commit documents - they are now available
|
||||
let first_commit = index_writer.commit();
|
||||
assert!(first_commit.is_ok());
|
||||
let first_commit_tstamp = first_commit.unwrap();
|
||||
assert!(first_commit_tstamp > add_tstamp);
|
||||
|
||||
// delete_all_documents the index
|
||||
let clear_tstamp = index_writer.delete_all_documents().unwrap();
|
||||
assert_eq!(clear_tstamp, add_tstamp);
|
||||
|
||||
// commit the clear command - now documents aren't available
|
||||
let second_commit = index_writer.commit();
|
||||
assert!(second_commit.is_ok());
|
||||
let second_commit_tstamp = second_commit.unwrap();
|
||||
|
||||
// add new documents again
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "b"));
|
||||
}
|
||||
assert!(index_writer.commit().is_err());
|
||||
let num_docs_containing = |s: &str| {
|
||||
let term_a = Term::from_field_text(text_field, s);
|
||||
index.reader().unwrap().searcher().doc_freq(&term_a)
|
||||
};
|
||||
assert_eq!(num_docs_containing("a"), 100);
|
||||
assert_eq!(num_docs_containing("b"), 0);
|
||||
fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
|
||||
|
||||
// rollback to last commit, when index was empty
|
||||
let rollback = index_writer.rollback();
|
||||
assert!(rollback.is_ok());
|
||||
let rollback_tstamp = rollback.unwrap();
|
||||
assert_eq!(rollback_tstamp, second_commit_tstamp);
|
||||
|
||||
// working with an empty index == no documents
|
||||
let term_b = Term::from_field_text(text_field, "b");
|
||||
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_b), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_all_documents_then_add() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
let res = index_writer.delete_all_documents();
|
||||
assert!(res.is_ok());
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
// add one simple doc
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
// expect the document with that term to be in the index
|
||||
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_all_documents_and_rollback() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
|
||||
// add one simple doc
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
let comm = index_writer.commit();
|
||||
assert!(comm.is_ok());
|
||||
let commit_tstamp = comm.unwrap();
|
||||
|
||||
// clear but don't commit!
|
||||
let clear_tstamp = index_writer.delete_all_documents().unwrap();
|
||||
// clear_tstamp should reset to before the last commit
|
||||
assert!(clear_tstamp < commit_tstamp);
|
||||
|
||||
// rollback
|
||||
let _rollback_tstamp = index_writer.rollback().unwrap();
|
||||
// Find original docs in the index
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
// expect the document with that term to be in the index
|
||||
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_all_documents_empty_index() {
|
||||
let schema_builder = schema::Schema::builder();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
let clear = index_writer.delete_all_documents();
|
||||
let commit = index_writer.commit();
|
||||
assert!(clear.is_ok());
|
||||
assert!(commit.is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_all_documents_index_twice() {
|
||||
let schema_builder = schema::Schema::builder();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
let clear = index_writer.delete_all_documents();
|
||||
let commit = index_writer.commit();
|
||||
assert!(clear.is_ok());
|
||||
assert!(commit.is_ok());
|
||||
let clear_again = index_writer.delete_all_documents();
|
||||
let commit_again = index_writer.commit();
|
||||
assert!(clear_again.is_ok());
|
||||
assert!(commit_again.is_ok());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use super::merge_policy::{MergeCandidate, MergePolicy};
|
||||
use core::SegmentMeta;
|
||||
use crate::core::SegmentMeta;
|
||||
use std::cmp;
|
||||
use std::f64;
|
||||
|
||||
@@ -52,7 +52,7 @@ impl MergePolicy for LogMergePolicy {
|
||||
|
||||
let mut size_sorted_tuples = segments
|
||||
.iter()
|
||||
.map(|x| x.num_docs())
|
||||
.map(SegmentMeta::num_docs)
|
||||
.enumerate()
|
||||
.collect::<Vec<(usize, u32)>>();
|
||||
|
||||
@@ -95,8 +95,11 @@ impl Default for LogMergePolicy {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use core::{SegmentId, SegmentMeta};
|
||||
use indexer::merge_policy::MergePolicy;
|
||||
use crate::core::{SegmentId, SegmentMeta, SegmentMetaInventory};
|
||||
use crate::indexer::merge_policy::MergePolicy;
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);
|
||||
|
||||
fn test_merge_policy() -> LogMergePolicy {
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
@@ -113,7 +116,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn create_random_segment_meta(num_docs: u32) -> SegmentMeta {
|
||||
SegmentMeta::new(SegmentId::generate_random(), num_docs)
|
||||
INVENTORY.new_segment_meta(SegmentId::generate_random(), num_docs)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use crate::Opstamp;
|
||||
use crate::SegmentId;
|
||||
use census::{Inventory, TrackedObject};
|
||||
use std::collections::HashSet;
|
||||
use SegmentId;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct MergeOperationInventory(Inventory<InnerMergeOperation>);
|
||||
@@ -17,8 +18,8 @@ impl MergeOperationInventory {
|
||||
}
|
||||
}
|
||||
|
||||
/// A `MergeOperation` has two role.
|
||||
/// It carries all of the information required to describe a merge :
|
||||
/// A `MergeOperation` has two roles.
|
||||
/// It carries all of the information required to describe a merge:
|
||||
/// - `target_opstamp` is the opstamp up to which we want to consume the
|
||||
/// delete queue and reflect their deletes.
|
||||
/// - `segment_ids` is the list of segment to be merged.
|
||||
@@ -35,14 +36,14 @@ pub struct MergeOperation {
|
||||
}
|
||||
|
||||
struct InnerMergeOperation {
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
segment_ids: Vec<SegmentId>,
|
||||
}
|
||||
|
||||
impl MergeOperation {
|
||||
pub fn new(
|
||||
inventory: &MergeOperationInventory,
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
segment_ids: Vec<SegmentId>,
|
||||
) -> MergeOperation {
|
||||
let inner_merge_operation = InnerMergeOperation {
|
||||
@@ -54,7 +55,7 @@ impl MergeOperation {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn target_opstamp(&self) -> u64 {
|
||||
pub fn target_opstamp(&self) -> Opstamp {
|
||||
self.inner.target_opstamp
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use std::fmt::Debug;
|
||||
use std::marker;
|
||||
|
||||
@@ -39,8 +39,8 @@ impl MergePolicy for NoMergePolicy {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
|
||||
/// `MergePolicy` useful for test purposes.
|
||||
///
|
||||
|
||||
@@ -1,28 +1,31 @@
|
||||
use core::Segment;
|
||||
use core::SegmentReader;
|
||||
use core::SerializableSegment;
|
||||
use docset::DocSet;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fastfield::MultiValueIntFastFieldReader;
|
||||
use fieldnorm::FieldNormReader;
|
||||
use fieldnorm::FieldNormsSerializer;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use indexer::SegmentSerializer;
|
||||
use crate::common::MAX_DOC_LIMIT;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::core::SerializableSegment;
|
||||
use crate::docset::DocSet;
|
||||
use crate::fastfield::BytesFastFieldReader;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::fastfield::FastFieldSerializer;
|
||||
use crate::fastfield::MultiValueIntFastFieldReader;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::fieldnorm::FieldNormsWriter;
|
||||
use crate::indexer::SegmentSerializer;
|
||||
use crate::postings::InvertedIndexSerializer;
|
||||
use crate::postings::Postings;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::{Field, Schema};
|
||||
use crate::store::StoreWriter;
|
||||
use crate::termdict::TermMerger;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DocId;
|
||||
use crate::Result;
|
||||
use crate::TantivyError;
|
||||
use itertools::Itertools;
|
||||
use postings::InvertedIndexSerializer;
|
||||
use postings::Postings;
|
||||
use schema::Cardinality;
|
||||
use schema::FieldType;
|
||||
use schema::{Field, Schema};
|
||||
use std::cmp;
|
||||
use std::collections::HashMap;
|
||||
use store::StoreWriter;
|
||||
use termdict::TermMerger;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
|
||||
let mut total_tokens = 0u64;
|
||||
@@ -70,7 +73,7 @@ fn compute_min_max_val(
|
||||
// some deleted documents,
|
||||
// we need to recompute the max / min
|
||||
(0..max_doc)
|
||||
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
|
||||
.filter(|doc_id| delete_bitset.is_alive(*doc_id))
|
||||
.map(|doc_id| u64_reader.get(doc_id))
|
||||
.minmax()
|
||||
.into_option()
|
||||
@@ -150,6 +153,14 @@ impl IndexMerger {
|
||||
readers.push(reader);
|
||||
}
|
||||
}
|
||||
if max_doc >= MAX_DOC_LIMIT {
|
||||
let err_msg = format!(
|
||||
"The segment resulting from this merge would have {} docs,\
|
||||
which exceeds the limit {}.",
|
||||
max_doc, MAX_DOC_LIMIT
|
||||
);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
Ok(IndexMerger {
|
||||
schema,
|
||||
readers,
|
||||
@@ -196,6 +207,7 @@ impl IndexMerger {
|
||||
}
|
||||
FieldType::U64(ref options)
|
||||
| FieldType::I64(ref options)
|
||||
| FieldType::F64(ref options)
|
||||
| FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
self.write_single_fast_field(field, fast_field_serializer)?;
|
||||
@@ -229,7 +241,10 @@ impl IndexMerger {
|
||||
let mut max_value = u64::min_value();
|
||||
|
||||
for reader in &self.readers {
|
||||
let u64_reader: FastFieldReader<u64> = reader.fast_field_reader(field)?;
|
||||
let u64_reader: FastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.u64_lenient(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
if let Some((seg_min_val, seg_max_val)) =
|
||||
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
|
||||
{
|
||||
@@ -272,24 +287,28 @@ impl IndexMerger {
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
) -> Result<()> {
|
||||
let mut total_num_vals = 0u64;
|
||||
let mut u64s_readers: Vec<MultiValueIntFastFieldReader<u64>> = Vec::new();
|
||||
|
||||
// In the first pass, we compute the total number of vals.
|
||||
//
|
||||
// This is required by the bitpacker, as it needs to know
|
||||
// what should be the bit length use for bitpacking.
|
||||
for reader in &self.readers {
|
||||
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
|
||||
let u64s_reader = reader.fast_fields()
|
||||
.u64s_lenient(field)
|
||||
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
|
||||
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
for doc in 0u32..reader.max_doc() {
|
||||
if !delete_bitset.is_deleted(doc) {
|
||||
let start = idx_reader.get(doc);
|
||||
let end = idx_reader.get(doc + 1);
|
||||
total_num_vals += end - start;
|
||||
if delete_bitset.is_alive(doc) {
|
||||
let num_vals = u64s_reader.num_vals(doc) as u64;
|
||||
total_num_vals += num_vals;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
total_num_vals += idx_reader.max_value();
|
||||
total_num_vals += u64s_reader.total_num_vals();
|
||||
}
|
||||
u64s_readers.push(u64s_reader);
|
||||
}
|
||||
|
||||
// We can now create our `idx` serializer, and in a second pass,
|
||||
@@ -297,13 +316,10 @@ impl IndexMerger {
|
||||
let mut serialize_idx =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||
let mut idx = 0;
|
||||
for reader in &self.readers {
|
||||
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
|
||||
for doc in reader.doc_ids_alive() {
|
||||
for (segment_reader, u64s_reader) in self.readers.iter().zip(&u64s_readers) {
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
serialize_idx.add_val(idx)?;
|
||||
let start = idx_reader.get(doc);
|
||||
let end = idx_reader.get(doc + 1);
|
||||
idx += end - start;
|
||||
idx += u64s_reader.num_vals(doc) as u64;
|
||||
}
|
||||
}
|
||||
serialize_idx.add_val(idx)?;
|
||||
@@ -334,8 +350,10 @@ impl IndexMerger {
|
||||
for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
|
||||
let term_ordinal_mapping: &[TermOrdinal] =
|
||||
term_ordinal_mappings.get_segment(segment_ord);
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
||||
segment_reader.multi_fast_field_reader(field)?;
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> = segment_reader
|
||||
.fast_fields()
|
||||
.u64s(field)
|
||||
.expect("Could not find multivalued u64 fast value reader.");
|
||||
// TODO optimize if no deletes
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
@@ -367,6 +385,8 @@ impl IndexMerger {
|
||||
|
||||
let mut vals = Vec::with_capacity(100);
|
||||
|
||||
let mut ff_readers = Vec::new();
|
||||
|
||||
// Our values are bitpacked and we need to know what should be
|
||||
// our bitwidth and our minimum value before serializing any values.
|
||||
//
|
||||
@@ -375,7 +395,10 @@ impl IndexMerger {
|
||||
// maximum value and initialize our Serializer.
|
||||
for reader in &self.readers {
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
||||
reader.multi_fast_field_reader(field)?;
|
||||
reader.fast_fields().u64s_lenient(field).expect(
|
||||
"Failed to find multivalued fast field reader. This is a bug in \
|
||||
tantivy. Please report.",
|
||||
);
|
||||
for doc in reader.doc_ids_alive() {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
for &val in &vals {
|
||||
@@ -383,6 +406,7 @@ impl IndexMerger {
|
||||
max_value = cmp::max(val, max_value);
|
||||
}
|
||||
}
|
||||
ff_readers.push(ff_reader);
|
||||
// TODO optimize when no deletes
|
||||
}
|
||||
|
||||
@@ -395,9 +419,7 @@ impl IndexMerger {
|
||||
{
|
||||
let mut serialize_vals = fast_field_serializer
|
||||
.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
|
||||
for reader in &self.readers {
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
||||
reader.multi_fast_field_reader(field)?;
|
||||
for (reader, ff_reader) in self.readers.iter().zip(ff_readers) {
|
||||
// TODO optimize if no deletes
|
||||
for doc in reader.doc_ids_alive() {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
@@ -416,19 +438,53 @@ impl IndexMerger {
|
||||
field: Field,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
) -> Result<()> {
|
||||
self.write_fast_field_idx(field, fast_field_serializer)?;
|
||||
let mut total_num_vals = 0u64;
|
||||
let mut bytes_readers: Vec<BytesFastFieldReader> = Vec::new();
|
||||
|
||||
for reader in &self.readers {
|
||||
let bytes_reader = reader.fast_fields().bytes(field).expect(
|
||||
"Failed to find bytes fast field reader. This is a bug in tantivy, please report.",
|
||||
);
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
for doc in 0u32..reader.max_doc() {
|
||||
if delete_bitset.is_alive(doc) {
|
||||
let num_vals = bytes_reader.get_bytes(doc).len() as u64;
|
||||
total_num_vals += num_vals;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
total_num_vals += bytes_reader.total_num_bytes() as u64;
|
||||
}
|
||||
bytes_readers.push(bytes_reader);
|
||||
}
|
||||
|
||||
{
|
||||
// We can now create our `idx` serializer, and in a second pass,
|
||||
// can effectively push the different indexes.
|
||||
let mut serialize_idx =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||
let mut idx = 0;
|
||||
for (segment_reader, bytes_reader) in self.readers.iter().zip(&bytes_readers) {
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
serialize_idx.add_val(idx)?;
|
||||
idx += bytes_reader.get_bytes(doc).len() as u64;
|
||||
}
|
||||
}
|
||||
serialize_idx.add_val(idx)?;
|
||||
serialize_idx.close_field()?;
|
||||
}
|
||||
|
||||
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1)?;
|
||||
for reader in &self.readers {
|
||||
let bytes_reader = reader.bytes_fast_field_reader(field)?;
|
||||
for segment_reader in &self.readers {
|
||||
let bytes_reader = segment_reader.fast_fields().bytes(field)
|
||||
.expect("Failed to find bytes field in fast field reader. This is a bug in tantivy. Please report.");
|
||||
// TODO: optimize if no deletes
|
||||
for doc in reader.doc_ids_alive() {
|
||||
let val = bytes_reader.get_val(doc);
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
let val = bytes_reader.get_bytes(doc);
|
||||
serialize_vals.write_all(val)?;
|
||||
}
|
||||
}
|
||||
serialize_vals.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -637,28 +693,28 @@ impl SerializableSegment for IndexMerger {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
|
||||
use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
|
||||
use crate::collector::{Count, FacetCollector};
|
||||
use crate::core::Index;
|
||||
use crate::query::AllQuery;
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema;
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::Facet;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::IntOptions;
|
||||
use crate::schema::Term;
|
||||
use crate::schema::TextFieldIndexing;
|
||||
use crate::schema::INDEXED;
|
||||
use crate::DocAddress;
|
||||
use crate::IndexWriter;
|
||||
use crate::Searcher;
|
||||
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
||||
use collector::tests::TestCollector;
|
||||
use collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
|
||||
use collector::{Count, FacetCollector};
|
||||
use core::Index;
|
||||
use futures::Future;
|
||||
use query::AllQuery;
|
||||
use query::BooleanQuery;
|
||||
use query::TermQuery;
|
||||
use schema;
|
||||
use schema::Cardinality;
|
||||
use schema::Document;
|
||||
use schema::Facet;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::IntOptions;
|
||||
use schema::Term;
|
||||
use schema::TextFieldIndexing;
|
||||
use schema::INDEXED;
|
||||
use std::io::Cursor;
|
||||
use DocAddress;
|
||||
use IndexWriter;
|
||||
use Searcher;
|
||||
|
||||
#[test]
|
||||
fn test_index_merger_no_deletes() {
|
||||
@@ -752,7 +808,7 @@ mod tests {
|
||||
let searcher = reader.searcher();
|
||||
let get_doc_ids = |terms: Vec<Term>| {
|
||||
let query = BooleanQuery::new_multiterms_query(terms);
|
||||
let top_docs = searcher.search(&query, &TestCollector).unwrap();
|
||||
let top_docs = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).unwrap();
|
||||
top_docs.docs().to_vec()
|
||||
};
|
||||
{
|
||||
@@ -969,14 +1025,16 @@ mod tests {
|
||||
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 4000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(1)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 1);
|
||||
assert_eq!(score_field_reader.max_value(), 3);
|
||||
@@ -1027,7 +1085,8 @@ mod tests {
|
||||
);
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
@@ -1073,7 +1132,8 @@ mod tests {
|
||||
);
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
@@ -1125,7 +1185,8 @@ mod tests {
|
||||
);
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 6000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
@@ -1371,7 +1432,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let segment = searcher.segment_reader(0u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[1, 2]);
|
||||
@@ -1406,7 +1467,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let segment = searcher.segment_reader(1u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[28, 27]);
|
||||
|
||||
@@ -1416,7 +1477,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let segment = searcher.segment_reader(2u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[20]);
|
||||
}
|
||||
@@ -1449,7 +1510,7 @@ mod tests {
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
let segment = searcher.segment_reader(0u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[1, 2]);
|
||||
|
||||
@@ -1,17 +1,18 @@
|
||||
use schema::Document;
|
||||
use schema::Term;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::Term;
|
||||
use crate::Opstamp;
|
||||
|
||||
/// Timestamped Delete operation.
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
pub struct DeleteOperation {
|
||||
pub opstamp: u64,
|
||||
pub opstamp: Opstamp,
|
||||
pub term: Term,
|
||||
}
|
||||
|
||||
/// Timestamped Add operation.
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub struct AddOperation {
|
||||
pub opstamp: u64,
|
||||
pub opstamp: Opstamp,
|
||||
pub document: Document,
|
||||
}
|
||||
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
use super::IndexWriter;
|
||||
use Result;
|
||||
use crate::Opstamp;
|
||||
use crate::Result;
|
||||
|
||||
/// A prepared commit
|
||||
pub struct PreparedCommit<'a> {
|
||||
index_writer: &'a mut IndexWriter,
|
||||
payload: Option<String>,
|
||||
opstamp: u64,
|
||||
opstamp: Opstamp,
|
||||
}
|
||||
|
||||
impl<'a> PreparedCommit<'a> {
|
||||
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
|
||||
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit<'_> {
|
||||
PreparedCommit {
|
||||
index_writer,
|
||||
payload: None,
|
||||
@@ -17,7 +18,7 @@ impl<'a> PreparedCommit<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn opstamp(&self) -> u64 {
|
||||
pub fn opstamp(&self) -> Opstamp {
|
||||
self.opstamp
|
||||
}
|
||||
|
||||
@@ -25,11 +26,11 @@ impl<'a> PreparedCommit<'a> {
|
||||
self.payload = Some(payload.to_string())
|
||||
}
|
||||
|
||||
pub fn abort(self) -> Result<()> {
|
||||
pub fn abort(self) -> Result<Opstamp> {
|
||||
self.index_writer.rollback()
|
||||
}
|
||||
|
||||
pub fn commit(self) -> Result<u64> {
|
||||
pub fn commit(self) -> Result<Opstamp> {
|
||||
info!("committing {}", self.opstamp);
|
||||
self.index_writer
|
||||
.segment_updater()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use bit_set::BitSet;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use std::fmt;
|
||||
|
||||
/// A segment entry describes the state of
|
||||
@@ -67,7 +67,7 @@ impl SegmentEntry {
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentEntry {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(formatter, "SegmentEntry({:?})", self.meta)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,14 @@
|
||||
use super::segment_register::SegmentRegister;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::META_FILEPATH;
|
||||
use error::TantivyError;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::SegmentEntry;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::Result as TantivyResult;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
||||
use Result as TantivyResult;
|
||||
|
||||
#[derive(Default)]
|
||||
struct SegmentRegisters {
|
||||
@@ -29,7 +27,7 @@ pub struct SegmentManager {
|
||||
}
|
||||
|
||||
impl Debug for SegmentManager {
|
||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
let lock = self.read();
|
||||
write!(
|
||||
f,
|
||||
@@ -75,29 +73,16 @@ impl SegmentManager {
|
||||
segment_entries
|
||||
}
|
||||
|
||||
/// List the files that are useful to the index.
|
||||
///
|
||||
/// This does not include lock files, or files that are obsolete
|
||||
/// but have not yet been deleted by the garbage collector.
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
let mut files = HashSet::new();
|
||||
files.insert(META_FILEPATH.clone());
|
||||
for segment_meta in SegmentMeta::all() {
|
||||
files.extend(segment_meta.list_files());
|
||||
}
|
||||
files
|
||||
}
|
||||
|
||||
// Lock poisoning should never happen :
|
||||
// The lock is acquired and released within this class,
|
||||
// and the operations cannot panic.
|
||||
fn read(&self) -> RwLockReadGuard<SegmentRegisters> {
|
||||
fn read(&self) -> RwLockReadGuard<'_, SegmentRegisters> {
|
||||
self.registers
|
||||
.read()
|
||||
.expect("Failed to acquire read lock on SegmentManager.")
|
||||
}
|
||||
|
||||
fn write(&self) -> RwLockWriteGuard<SegmentRegisters> {
|
||||
fn write(&self) -> RwLockWriteGuard<'_, SegmentRegisters> {
|
||||
self.registers
|
||||
.write()
|
||||
.expect("Failed to acquire write lock on SegmentManager.")
|
||||
@@ -118,6 +103,12 @@ impl SegmentManager {
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) fn remove_all_segments(&self) {
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock.committed.clear();
|
||||
registers_lock.uncommitted.clear();
|
||||
}
|
||||
|
||||
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock.committed.clear();
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::segment_entry::SegmentEntry;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::segment_entry::SegmentEntry;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
@@ -20,7 +20,7 @@ pub struct SegmentRegister {
|
||||
}
|
||||
|
||||
impl Debug for SegmentRegister {
|
||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
write!(f, "SegmentRegister(")?;
|
||||
for k in self.segment_states.keys() {
|
||||
write!(f, "{}, ", k.short_uuid_string())?;
|
||||
@@ -56,7 +56,7 @@ impl SegmentRegister {
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect();
|
||||
segment_ids.sort_by_key(|meta| meta.id());
|
||||
segment_ids.sort_by_key(SegmentMeta::id);
|
||||
segment_ids
|
||||
}
|
||||
|
||||
@@ -93,9 +93,8 @@ impl SegmentRegister {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use indexer::delete_queue::*;
|
||||
use crate::core::{SegmentId, SegmentMetaInventory};
|
||||
use crate::indexer::delete_queue::*;
|
||||
|
||||
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
|
||||
segment_register
|
||||
@@ -107,6 +106,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_segment_register() {
|
||||
let inventory = SegmentMetaInventory::default();
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let mut segment_register = SegmentRegister::default();
|
||||
@@ -115,20 +115,20 @@ mod tests {
|
||||
let segment_id_merged = SegmentId::generate_random();
|
||||
|
||||
{
|
||||
let segment_meta = SegmentMeta::new(segment_id_a, 0u32);
|
||||
let segment_meta = inventory.new_segment_meta(segment_id_a, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
|
||||
{
|
||||
let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
|
||||
let segment_meta = inventory.new_segment_meta(segment_id_b, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
segment_register.remove_segment(&segment_id_a);
|
||||
segment_register.remove_segment(&segment_id_b);
|
||||
{
|
||||
let segment_meta_merged = SegmentMeta::new(segment_id_merged, 0u32);
|
||||
let segment_meta_merged = inventory.new_segment_meta(segment_id_merged, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use Result;
|
||||
use crate::Result;
|
||||
|
||||
use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use fastfield::FastFieldSerializer;
|
||||
use fieldnorm::FieldNormsSerializer;
|
||||
use postings::InvertedIndexSerializer;
|
||||
use store::StoreWriter;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentComponent;
|
||||
use crate::fastfield::FastFieldSerializer;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::postings::InvertedIndexSerializer;
|
||||
use crate::store::StoreWriter;
|
||||
|
||||
/// Segment serializer is in charge of laying out on disk
|
||||
/// the data accumulated and sorted by the `SegmentWriter`.
|
||||
|
||||
@@ -1,29 +1,31 @@
|
||||
use super::segment_manager::{get_mergeable_segments, SegmentManager};
|
||||
use core::Index;
|
||||
use core::IndexMeta;
|
||||
use core::Segment;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SerializableSegment;
|
||||
use core::META_FILEPATH;
|
||||
use directory::{Directory, DirectoryClone};
|
||||
use error::TantivyError;
|
||||
use crate::core::Index;
|
||||
use crate::core::IndexMeta;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::SerializableSegment;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::{Directory, DirectoryClone};
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::index_writer::advance_deletes;
|
||||
use crate::indexer::merge_operation::MergeOperationInventory;
|
||||
use crate::indexer::merger::IndexMerger;
|
||||
use crate::indexer::stamper::Stamper;
|
||||
use crate::indexer::MergeOperation;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::indexer::SegmentSerializer;
|
||||
use crate::indexer::{DefaultMergePolicy, MergePolicy};
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use crate::Result;
|
||||
use futures::oneshot;
|
||||
use futures::sync::oneshot::Receiver;
|
||||
use futures::Future;
|
||||
use futures_cpupool::Builder as CpuPoolBuilder;
|
||||
use futures_cpupool::CpuFuture;
|
||||
use futures_cpupool::CpuPool;
|
||||
use indexer::delete_queue::DeleteCursor;
|
||||
use indexer::index_writer::advance_deletes;
|
||||
use indexer::merge_operation::MergeOperationInventory;
|
||||
use indexer::merger::IndexMerger;
|
||||
use indexer::stamper::Stamper;
|
||||
use indexer::MergeOperation;
|
||||
use indexer::SegmentEntry;
|
||||
use indexer::SegmentSerializer;
|
||||
use indexer::{DefaultMergePolicy, MergePolicy};
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashMap;
|
||||
@@ -31,23 +33,23 @@ use std::collections::HashSet;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use Result;
|
||||
|
||||
/// Save the index meta file.
|
||||
/// This operation is atomic :
|
||||
/// Either
|
||||
// - it fails, in which case an error is returned,
|
||||
/// - it fails, in which case an error is returned,
|
||||
/// and the `meta.json` remains untouched,
|
||||
/// - it success, and `meta.json` is written
|
||||
/// - it succeeds, and `meta.json` is written
|
||||
/// and flushed.
|
||||
///
|
||||
/// This method is not part of tantivy's public API
|
||||
pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
|
||||
pub fn save_new_metas(schema: Schema, directory: &mut dyn Directory) -> Result<()> {
|
||||
save_metas(
|
||||
&IndexMeta {
|
||||
segments: Vec::new(),
|
||||
@@ -68,7 +70,8 @@ pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
|
||||
/// and flushed.
|
||||
///
|
||||
/// This method is not part of tantivy's public API
|
||||
fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
|
||||
fn save_metas(metas: &IndexMeta, directory: &mut dyn Directory) -> Result<()> {
|
||||
info!("save metas");
|
||||
let mut buffer = serde_json::to_vec_pretty(metas)?;
|
||||
// Just adding a new line at the end of the buffer.
|
||||
writeln!(&mut buffer)?;
|
||||
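The atomicity described above is implemented behind tantivy's Directory abstraction (the `atomicwrites` dependency in lib.rs hints at the mechanism). Purely as an illustration of the general pattern, a minimal standalone sketch, assuming nothing about tantivy's internals beyond what the comment states:

use std::fs::{self, File};
use std::io::Write;
use std::path::Path;

// Illustration only: atomic replacement of a metadata file.
// Write the new contents to a temporary file, flush it to disk, then rename it
// over the target; on POSIX filesystems the rename atomically replaces the file.
fn atomic_replace(target: &Path, contents: &[u8]) -> std::io::Result<()> {
    let tmp_path = target.with_extension("tmp");
    let mut tmp = File::create(&tmp_path)?;
    tmp.write_all(contents)?;
    tmp.sync_all()?;
    fs::rename(&tmp_path, target)?;
    Ok(())
}

fn main() -> std::io::Result<()> {
    // Hypothetical file name, mirroring the meta.json mentioned above.
    atomic_replace(Path::new("meta.json"), b"{ \"segments\": [] }\n")
}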
@@ -123,7 +126,9 @@ fn perform_merge(
|
||||
|
||||
let num_docs = merger.write(segment_serializer)?;
|
||||
|
||||
let segment_meta = SegmentMeta::new(merged_segment.id(), num_docs);
|
||||
let segment_meta = index
|
||||
.inventory()
|
||||
.new_segment_meta(merged_segment.id(), num_docs);
|
||||
|
||||
let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
|
||||
Ok(after_merge_segment_entry)
|
||||
@@ -140,10 +145,9 @@ struct InnerSegmentUpdater {
|
||||
pool: CpuPool,
|
||||
index: Index,
|
||||
segment_manager: SegmentManager,
|
||||
merge_policy: RwLock<Arc<Box<MergePolicy>>>,
|
||||
merge_policy: RwLock<Arc<Box<dyn MergePolicy>>>,
|
||||
merging_thread_id: AtomicUsize,
|
||||
merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
|
||||
generation: AtomicUsize,
|
||||
killed: AtomicBool,
|
||||
stamper: Stamper,
|
||||
merge_operations: MergeOperationInventory,
|
||||
@@ -170,18 +174,17 @@ impl SegmentUpdater {
|
||||
merge_policy: RwLock::new(Arc::new(Box::new(DefaultMergePolicy::default()))),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
killed: AtomicBool::new(false),
|
||||
stamper,
|
||||
merge_operations: Default::default(),
|
||||
})))
|
||||
}
|
||||
|
||||
pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
|
||||
pub fn get_merge_policy(&self) -> Arc<Box<dyn MergePolicy>> {
|
||||
self.0.merge_policy.read().unwrap().clone()
|
||||
}
|
||||
|
||||
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
|
||||
pub fn set_merge_policy(&self, merge_policy: Box<dyn MergePolicy>) {
|
||||
let arc_merge_policy = Arc::new(merge_policy);
|
||||
*self.0.merge_policy.write().unwrap() = arc_merge_policy;
|
||||
}
|
||||
@@ -198,18 +201,19 @@ impl SegmentUpdater {
|
||||
self.0.pool.spawn_fn(move || Ok(f(me_clone)))
|
||||
}
|
||||
|
||||
pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
|
||||
if generation >= self.0.generation.load(Ordering::Acquire) {
|
||||
self.run_async(|segment_updater| {
|
||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
true
|
||||
})
|
||||
.forget();
|
||||
pub fn add_segment(&self, segment_entry: SegmentEntry) -> bool {
|
||||
self.run_async(|segment_updater| {
|
||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
.forget();
|
||||
true
|
||||
}
|
||||
|
||||
/// Orders `SegmentManager` to remove all segments
|
||||
pub(crate) fn remove_all_segments(&self) {
|
||||
self.0.segment_manager.remove_all_segments();
|
||||
}
|
||||
|
||||
pub fn kill(&mut self) {
|
||||
@@ -222,9 +226,9 @@ impl SegmentUpdater {
|
||||
|
||||
/// Apply deletes up to the target opstamp to all segments.
|
||||
///
|
||||
/// Tne method returns copies of the segment entries,
|
||||
/// The method returns copies of the segment entries,
|
||||
/// updated with the delete information.
|
||||
fn purge_deletes(&self, target_opstamp: u64) -> Result<Vec<SegmentEntry>> {
|
||||
fn purge_deletes(&self, target_opstamp: Opstamp) -> Result<Vec<SegmentEntry>> {
|
||||
let mut segment_entries = self.0.segment_manager.segment_entries();
|
||||
for segment_entry in &mut segment_entries {
|
||||
let segment = self.0.index.segment(segment_entry.meta().clone());
|
||||
@@ -233,7 +237,7 @@ impl SegmentUpdater {
|
||||
Ok(segment_entries)
|
||||
}
|
||||
|
||||
pub fn save_metas(&self, opstamp: u64, commit_message: Option<String>) {
|
||||
pub fn save_metas(&self, opstamp: Opstamp, commit_message: Option<String>) {
|
||||
if self.is_alive() {
|
||||
let index = &self.0.index;
|
||||
let directory = index.directory();
|
||||
@@ -265,22 +269,32 @@ impl SegmentUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn garbage_collect_files(&self) -> Result<()> {
|
||||
pub fn garbage_collect_files(&self) -> CpuFuture<(), TantivyError> {
|
||||
self.run_async(move |segment_updater| {
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
})
|
||||
.wait()
|
||||
}
|
||||
|
||||
/// List the files that are useful to the index.
|
||||
///
|
||||
/// This does not include lock files, or files that are obsolete
|
||||
/// but have not yet been deleted by the garbage collector.
|
||||
fn list_files(&self) -> HashSet<PathBuf> {
|
||||
let mut files = HashSet::new();
|
||||
files.insert(META_FILEPATH.to_path_buf());
|
||||
for segment_meta in self.0.index.inventory().all() {
|
||||
files.extend(segment_meta.list_files());
|
||||
}
|
||||
files
|
||||
}
|
||||
|
||||
fn garbage_collect_files_exec(&self) {
|
||||
info!("Running garbage collection");
|
||||
let mut index = self.0.index.clone();
|
||||
index
|
||||
.directory_mut()
|
||||
.garbage_collect(|| self.0.segment_manager.list_files());
|
||||
index.directory_mut().garbage_collect(|| self.list_files());
|
||||
}
|
||||
|
||||
pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> {
|
||||
pub fn commit(&self, opstamp: Opstamp, payload: Option<String>) -> Result<()> {
|
||||
self.run_async(move |segment_updater| {
|
||||
if segment_updater.is_alive() {
|
||||
let segment_entries = segment_updater
|
||||
@@ -420,6 +434,7 @@ impl SegmentUpdater {
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
merge_candidates.extend(committed_merge_candidates.into_iter());
|
||||
|
||||
for merge_operation in merge_candidates {
|
||||
match self.start_merge_impl(merge_operation) {
|
||||
Ok(merge_future) => {
|
||||
@@ -444,38 +459,41 @@ impl SegmentUpdater {
|
||||
) -> Result<()> {
|
||||
self.run_async(move |segment_updater| {
|
||||
info!("End merge {:?}", after_merge_segment_entry.meta());
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
if let Some(delete_operation) = delete_cursor.get() {
|
||||
let committed_opstamp = segment_updater.load_metas().opstamp;
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.0.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
if let Err(e) =
|
||||
advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp)
|
||||
{
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
merge_operation.segment_ids(),
|
||||
e
|
||||
);
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
{
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
if let Some(delete_operation) = delete_cursor.get() {
|
||||
let committed_opstamp = segment_updater.load_metas().opstamp;
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.0.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
if let Err(e) = advance_deletes(
|
||||
segment,
|
||||
&mut after_merge_segment_entry,
|
||||
committed_opstamp,
|
||||
) {
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
merge_operation.segment_ids(),
|
||||
e
|
||||
);
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
}
|
||||
// ... cancel merge
|
||||
// `merge_operations` are tracked. As it is dropped, the
|
||||
// the segment_ids will be available again for merge.
|
||||
return;
|
||||
}
|
||||
// ... cancel merge
|
||||
// `merge_operations` are tracked. As it is dropped, the
|
||||
// the segment_ids will be available again for merge.
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
segment_updater
|
||||
.0
|
||||
.segment_manager
|
||||
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
info!("save metas");
|
||||
let previous_metas = segment_updater.load_metas();
|
||||
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
|
||||
let previous_metas = segment_updater.load_metas();
|
||||
segment_updater
|
||||
.0
|
||||
.segment_manager
|
||||
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
|
||||
} // we drop all possible handle to a now useless `SegmentMeta`.
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
})
|
||||
.wait()
|
||||
@@ -522,9 +540,9 @@ impl SegmentUpdater {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use indexer::merge_policy::tests::MergeWheneverPossible;
|
||||
use schema::*;
|
||||
use Index;
|
||||
use crate::indexer::merge_policy::tests::MergeWheneverPossible;
|
||||
use crate::schema::*;
|
||||
use crate::Index;
|
||||
|
||||
#[test]
|
||||
fn test_delete_during_merge() {
|
||||
@@ -649,4 +667,31 @@ mod tests {
|
||||
assert!(index.searchable_segment_metas().unwrap().is_empty());
|
||||
assert!(reader.searcher().segment_readers().is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remove_all_segments() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
|
||||
{
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.add_document(doc!(text_field=>"b"));
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index_writer.segment_updater().remove_all_segments();
|
||||
let seg_vec = index_writer
|
||||
.segment_updater()
|
||||
.0
|
||||
.segment_manager
|
||||
.segment_entries();
|
||||
assert!(seg_vec.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,24 +1,44 @@
|
||||
use super::operation::AddOperation;
|
||||
use core::Segment;
|
||||
use core::SerializableSegment;
|
||||
use fastfield::FastFieldsWriter;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use schema::Value;
|
||||
use crate::core::Segment;
|
||||
use crate::core::SerializableSegment;
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::FieldNormsWriter;
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::postings::compute_table_size;
|
||||
use crate::postings::MultiFieldPostingsWriter;
|
||||
use crate::schema::FieldEntry;
|
||||
use crate::schema::FieldType;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::Term;
|
||||
use crate::schema::Value;
|
||||
use crate::tokenizer::BoxedTokenizer;
|
||||
use crate::tokenizer::FacetTokenizer;
|
||||
use crate::tokenizer::{TokenStream, Tokenizer};
|
||||
use crate::DocId;
|
||||
use crate::Opstamp;
|
||||
use crate::Result;
|
||||
use crate::TantivyError;
|
||||
use std::io;
|
||||
use std::str;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::FacetTokenizer;
|
||||
use tokenizer::{TokenStream, Tokenizer};
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
/// Computes the initial size of the hash table.
|
||||
///
|
||||
/// Returns a number of bits `b`, such that the recommended initial table size is 2^b.
|
||||
fn initial_table_size(per_thread_memory_budget: usize) -> Result<usize> {
|
||||
let table_memory_upper_bound = per_thread_memory_budget / 3;
|
||||
if let Some(limit) = (10..)
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_memory_upper_bound)
|
||||
.last()
|
||||
{
|
||||
Ok(limit.min(19)) // we cap it at 2^19 = 512K.
|
||||
} else {
|
||||
Err(TantivyError::InvalidArgument(
|
||||
format!("per thread memory budget (={}) is too small. Raise the memory budget or lower the number of threads.", per_thread_memory_budget)))
|
||||
}
|
||||
}
|
||||
|
||||
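For a feel of the sizing rule above, a rough standalone sketch; the 16-bytes-per-slot cost is an assumption standing in for `compute_table_size`, so the exact numbers are illustrative only:

// Sketch of the sizing rule: give the table at most a third of the per-thread
// budget, pick the largest power-of-two table that still fits, and cap at 2^19.
fn pick_table_bits(per_thread_memory_budget: usize) -> Option<usize> {
    let table_memory_upper_bound = per_thread_memory_budget / 3;
    // Hypothetical cost model: 2^bits slots, ~16 bytes each (stand-in for compute_table_size).
    let table_size = |bits: usize| (1usize << bits) * 16;
    (10..)
        .take_while(|&bits| table_size(bits) < table_memory_upper_bound)
        .last()
        .map(|bits| bits.min(19))
}

fn main() {
    assert_eq!(pick_table_bits(100_000), Some(11)); // ~100 KB budget -> 2^11 slots
    assert_eq!(pick_table_bits(1_000), None); // budget too small, mirrors the error case
}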
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||
/// documents.
|
||||
/// set of documents.
|
||||
///
|
||||
/// It creates the postings lists in anonymous memory.
/// The segment is laid out on disk when it gets `finalized`.
|
||||
@@ -28,8 +48,8 @@ pub struct SegmentWriter {
|
||||
segment_serializer: SegmentSerializer,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FieldNormsWriter,
|
||||
doc_opstamps: Vec<u64>,
|
||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
||||
doc_opstamps: Vec<Opstamp>,
|
||||
tokenizers: Vec<Option<Box<dyn BoxedTokenizer>>>,
|
||||
}
|
||||
|
||||
impl SegmentWriter {
|
||||
@@ -43,17 +63,18 @@ impl SegmentWriter {
|
||||
/// - segment: The segment being written
|
||||
/// - schema
|
||||
pub fn for_segment(
|
||||
table_bits: usize,
|
||||
memory_budget: usize,
|
||||
mut segment: Segment,
|
||||
schema: &Schema,
|
||||
) -> Result<SegmentWriter> {
|
||||
let table_num_bits = initial_table_size(memory_budget)?;
|
||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
|
||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
|
||||
let tokenizers =
|
||||
schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| field_entry.field_type())
|
||||
.map(FieldEntry::field_type)
|
||||
.map(|field_type| match *field_type {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
@@ -193,6 +214,17 @@ impl SegmentWriter {
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::F64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_f64(
|
||||
field_value.field(),
|
||||
field_value.value().f64_value(),
|
||||
);
|
||||
self.multifield_postings.subscribe(doc_id, &term);
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::Bytes => {
|
||||
// Do nothing. Bytes only supports fast fields.
|
||||
}
|
||||
@@ -252,3 +284,17 @@ impl SerializableSegment for SegmentWriter {
|
||||
Ok(max_doc)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::initial_table_size;
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(initial_table_size(100_000).unwrap(), 11);
|
||||
assert_eq!(initial_table_size(1_000_000).unwrap(), 14);
|
||||
assert_eq!(initial_table_size(10_000_000).unwrap(), 17);
|
||||
assert_eq!(initial_table_size(1_000_000_000).unwrap(), 19);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,76 +1,39 @@
|
||||
use crate::Opstamp;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
// AtomicU64 have not landed in stable.
|
||||
// For the moment let's just use AtomicUsize on
|
||||
// x86/64 bit platform, and a mutex on other platform.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AtomicU64Ersatz(AtomicUsize);
|
||||
|
||||
impl AtomicU64Ersatz {
|
||||
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
||||
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
|
||||
}
|
||||
|
||||
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
|
||||
self.0.fetch_add(val as usize, order) as u64
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(target_arch = "x86_64"))]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::atomic::Ordering;
|
||||
/// Under other architecture, we rely on a mutex.
|
||||
use std::sync::RwLock;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AtomicU64Ersatz(RwLock<u64>);
|
||||
|
||||
impl AtomicU64Ersatz {
|
||||
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
||||
AtomicU64Ersatz(RwLock::new(first_opstamp))
|
||||
}
|
||||
|
||||
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
|
||||
let mut lock = self.0.write().unwrap();
|
||||
let previous_val = *lock;
|
||||
*lock = previous_val + incr;
|
||||
previous_val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
use self::archicture_impl::AtomicU64Ersatz;
|
||||
|
||||
/// Stamper provides Opstamps, which is just an auto-increment id to label
|
||||
/// an operation.
|
||||
///
|
||||
/// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64Ersatz>);
|
||||
pub struct Stamper(Arc<AtomicU64>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
|
||||
pub fn new(first_opstamp: Opstamp) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
pub fn stamp(&self) -> Opstamp {
|
||||
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
||||
}
|
||||
|
||||
/// Given a desired count `n`, `stamps` returns an iterator that
|
||||
/// will supply `n` consecutive u64 stamps.
|
||||
pub fn stamps(&self, n: u64) -> Range<u64> {
|
||||
pub fn stamps(&self, n: u64) -> Range<Opstamp> {
|
||||
let start = self.0.fetch_add(n, Ordering::SeqCst);
|
||||
Range {
|
||||
start,
|
||||
end: start + n,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reverts the stamper to a given `Opstamp` value and returns it
|
||||
pub fn revert(&self, to_opstamp: Opstamp) -> Opstamp {
|
||||
self.0.store(to_opstamp, Ordering::SeqCst);
|
||||
to_opstamp
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -92,4 +55,18 @@ mod test {
|
||||
assert_eq!(stamper.stamps(3u64), (12..15));
|
||||
assert_eq!(stamper.stamp(), 15u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stamper_revert() {
|
||||
let stamper = Stamper::new(7u64);
|
||||
assert_eq!(stamper.stamp(), 7u64);
|
||||
assert_eq!(stamper.stamp(), 8u64);
|
||||
|
||||
let stamper_clone = stamper.clone();
|
||||
assert_eq!(stamper_clone.stamp(), 9u64);
|
||||
|
||||
stamper.revert(6);
|
||||
assert_eq!(stamper.stamp(), 6);
|
||||
assert_eq!(stamper_clone.stamp(), 7);
|
||||
}
|
||||
}
|
||||
|
src/lib.rs (192 changes)
@@ -105,11 +105,8 @@
|
||||
//!
|
||||
//! A good place for you to get started is to check out
|
||||
//! the example code (
|
||||
//! [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) /
|
||||
//! [source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs))
|
||||
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
|
||||
//! [source code](https://github.com/tantivy-search/tantivy/blob/master/examples/basic_search.rs))
|
||||
|
||||
#[macro_use]
|
||||
extern crate serde_derive;
|
||||
@@ -123,71 +120,21 @@ extern crate log;
|
||||
#[macro_use]
|
||||
extern crate failure;
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
extern crate atomicwrites;
|
||||
extern crate base64;
|
||||
extern crate bit_set;
|
||||
extern crate bitpacking;
|
||||
extern crate byteorder;
|
||||
extern crate combine;
|
||||
extern crate crossbeam;
|
||||
extern crate fnv;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate htmlescape;
|
||||
extern crate itertools;
|
||||
extern crate levenshtein_automata;
|
||||
#[cfg(feature = "mmap")]
|
||||
extern crate memmap;
|
||||
extern crate num_cpus;
|
||||
extern crate owning_ref;
|
||||
extern crate regex;
|
||||
extern crate rust_stemmers;
|
||||
extern crate scoped_pool;
|
||||
extern crate serde;
|
||||
extern crate stable_deref_trait;
|
||||
extern crate tantivy_fst;
|
||||
extern crate tempdir;
|
||||
extern crate tempfile;
|
||||
extern crate uuid;
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate matches;
|
||||
|
||||
#[cfg(windows)]
|
||||
extern crate winapi;
|
||||
|
||||
#[cfg(test)]
|
||||
extern crate rand;
|
||||
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate maplit;
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
extern crate test;
|
||||
|
||||
#[macro_use]
|
||||
extern crate downcast_rs;
|
||||
|
||||
#[macro_use]
|
||||
extern crate fail;
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
#[cfg(test)]
|
||||
mod functional_test;
|
||||
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
|
||||
pub use error::TantivyError;
|
||||
pub use crate::error::TantivyError;
|
||||
|
||||
#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
|
||||
pub use error::TantivyError as Error;
|
||||
|
||||
extern crate census;
|
||||
pub extern crate chrono;
|
||||
extern crate owned_read;
|
||||
pub use crate::error::TantivyError as Error;
|
||||
pub use chrono;
|
||||
|
||||
/// Tantivy result.
|
||||
pub type Result<T> = std::result::Result<T, error::TantivyError>;
|
||||
@@ -224,15 +171,15 @@ pub use self::snippet::{Snippet, SnippetGenerator};
|
||||
mod docset;
|
||||
pub use self::docset::{DocSet, SkipResult};
|
||||
|
||||
pub use core::SegmentComponent;
|
||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use core::{InvertedIndexReader, SegmentReader};
|
||||
pub use directory::Directory;
|
||||
pub use indexer::IndexWriter;
|
||||
pub use postings::Postings;
|
||||
pub use schema::{Document, Term};
|
||||
|
||||
pub use common::{i64_to_u64, u64_to_i64};
|
||||
pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
pub use crate::core::SegmentComponent;
|
||||
pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use crate::core::{InvertedIndexReader, SegmentReader};
|
||||
pub use crate::directory::Directory;
|
||||
pub use crate::indexer::IndexWriter;
|
||||
pub use crate::postings::Postings;
|
||||
pub use crate::reader::LeasedItem;
|
||||
pub use crate::schema::{Document, Term};
|
||||
|
||||
/// Expose the current version of tantivy, as well
|
||||
/// whether it was compiled with the simd compression.
|
||||
@@ -242,10 +189,10 @@ pub fn version() -> &'static str {
|
||||
|
||||
/// Defines tantivy's merging strategy
|
||||
pub mod merge_policy {
|
||||
pub use indexer::DefaultMergePolicy;
|
||||
pub use indexer::LogMergePolicy;
|
||||
pub use indexer::MergePolicy;
|
||||
pub use indexer::NoMergePolicy;
|
||||
pub use crate::indexer::DefaultMergePolicy;
|
||||
pub use crate::indexer::LogMergePolicy;
|
||||
pub use crate::indexer::MergePolicy;
|
||||
pub use crate::indexer::NoMergePolicy;
|
||||
}
|
||||
|
||||
/// A `u32` identifying a document within a segment.
|
||||
@@ -253,6 +200,16 @@ pub mod merge_policy {
|
||||
/// as they are added in the segment.
|
||||
pub type DocId = u32;
|
||||
|
||||
/// A u64 assigned incrementally to every operation.
///
/// Every operation modifying the index receives a monotonic Opstamp.
/// The resulting state of the index is consistent with the opstamp ordering.
///
/// For instance, a commit with opstamp `32_423` will reflect all Add and Delete operations
/// with an opstamp `<= 32_423`. A delete operation with opstamp `n` will not affect a document added
/// with opstamp `n+1`.
pub type Opstamp = u64;
|
||||
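A hedged usage sketch (not part of this diff) of how opstamps surface in the writer API; `delete_term` is assumed from tantivy's public `IndexWriter`, the other calls appear elsewhere in this diff, and exact return types may vary between versions:

use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text_field = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;

    // Each operation gets a monotonically increasing opstamp.
    let add_opstamp = index_writer.add_document(doc!(text_field => "hello"));
    let delete_opstamp = index_writer.delete_term(Term::from_field_text(text_field, "hello"));
    // The commit reflects every operation with a smaller or equal opstamp.
    let commit_opstamp = index_writer.commit()?;
    assert!(add_opstamp < delete_opstamp);
    assert!(delete_opstamp <= commit_opstamp);
    Ok(())
}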
|
||||
/// A f32 that represents the relevance of the document to the query
|
||||
///
|
||||
/// This is modelled internally as a `f32`. The
|
||||
@@ -293,20 +250,20 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId);
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use collector::tests::TestCollector;
|
||||
use core::SegmentReader;
|
||||
use docset::DocSet;
|
||||
use query::BooleanQuery;
|
||||
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::docset::DocSet;
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::schema::*;
|
||||
use crate::DocAddress;
|
||||
use crate::Index;
|
||||
use crate::IndexWriter;
|
||||
use crate::Postings;
|
||||
use crate::ReloadPolicy;
|
||||
use rand::distributions::Bernoulli;
|
||||
use rand::distributions::Uniform;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use schema::*;
|
||||
use DocAddress;
|
||||
use Index;
|
||||
use IndexWriter;
|
||||
use Postings;
|
||||
use ReloadPolicy;
|
||||
|
||||
pub fn assert_nearly_equals(expected: f32, val: f32) {
|
||||
assert!(
|
||||
@@ -331,7 +288,7 @@ mod tests {
|
||||
|
||||
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
|
||||
StdRng::from_seed([seed_val; 32])
|
||||
.sample_iter(&Bernoulli::new(ratio))
|
||||
.sample_iter(&Bernoulli::new(ratio).unwrap())
|
||||
.take(n as usize)
|
||||
.enumerate()
|
||||
.filter_map(|(val, keep)| if keep { Some(val as u32) } else { None })
|
||||
@@ -469,7 +426,7 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn advance_undeleted(docset: &mut DocSet, reader: &SegmentReader) -> bool {
|
||||
fn advance_undeleted(docset: &mut dyn DocSet, reader: &SegmentReader) -> bool {
|
||||
while docset.advance() {
|
||||
if !reader.is_deleted(docset.doc()) {
|
||||
return true;
|
||||
@@ -668,6 +625,30 @@ mod tests {
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_indexed_f64() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let value_field = schema_builder.add_f64_field("value", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let val = std::f64::consts::PI;
|
||||
index_writer.add_document(doc!(value_field => val));
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let term = Term::from_field_f64(value_field, val);
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
.inverted_index(term.field())
|
||||
.read_postings(&term, IndexRecordOption::Basic)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_indexedfield_not_in_documents() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -780,7 +761,7 @@ mod tests {
|
||||
let searcher = reader.searcher();
|
||||
let get_doc_ids = |terms: Vec<Term>| {
|
||||
let query = BooleanQuery::new_multiterms_query(terms);
|
||||
let topdocs = searcher.search(&query, &TestCollector).unwrap();
|
||||
let topdocs = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).unwrap();
|
||||
topdocs.docs().to_vec()
|
||||
};
|
||||
assert_eq!(
|
||||
@@ -860,6 +841,7 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
|
||||
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
|
||||
let fast_field_float = schema_builder.add_f64_field("float", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let stored_int_field = schema_builder.add_u64_field("text", STORED);
|
||||
let schema = schema_builder.build();
|
||||
@@ -867,7 +849,8 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
|
||||
{
|
||||
let document = doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64);
|
||||
let document =
|
||||
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
|
||||
index_writer.add_document(document);
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
@@ -875,29 +858,40 @@ mod tests {
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(text_field);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(stored_int_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(stored_int_field);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_signed);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_float);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_unsigned);
|
||||
assert!(fast_field_reader_opt.is_some());
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4u64)
|
||||
}
|
||||
|
||||
{
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
|
||||
assert!(fast_field_reader_opt.is_some());
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
}
|
||||
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().f64(fast_field_float);
|
||||
assert!(fast_field_reader_opt.is_some());
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4f64)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -67,7 +67,7 @@ macro_rules! doc(
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use schema::{Schema, FAST, TEXT};
|
||||
use crate::schema::{Schema, FAST, TEXT};
|
||||
|
||||
#[test]
|
||||
fn test_doc_basic() {
|
||||
|
||||
@@ -38,8 +38,8 @@ const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) a
|
||||
pub mod tests {
|
||||
|
||||
use super::{PositionReader, PositionSerializer};
|
||||
use directory::ReadOnlySource;
|
||||
use positions::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::positions::COMPRESSION_BLOCK_SIZE;
|
||||
use std::iter;
|
||||
|
||||
fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) {
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
use crate::common::{BinarySerializable, FixedSize};
|
||||
use crate::directory::ReadOnlySource;
|
||||
use crate::positions::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::positions::LONG_SKIP_INTERVAL;
|
||||
use crate::positions::LONG_SKIP_IN_BLOCKS;
|
||||
use crate::postings::compression::compressed_block_size;
|
||||
/// Positions works as a long sequence of compressed block.
|
||||
/// All terms are chained one after the other.
|
||||
///
|
||||
@@ -19,13 +25,7 @@
|
||||
/// so skipping a block without decompressing it is just a matter of advancing that many
|
||||
/// bytes.
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use directory::ReadOnlySource;
|
||||
use owned_read::OwnedRead;
|
||||
use positions::COMPRESSION_BLOCK_SIZE;
|
||||
use positions::LONG_SKIP_INTERVAL;
|
||||
use positions::LONG_SKIP_IN_BLOCKS;
|
||||
use postings::compression::compressed_block_size;
|
||||
|
||||
struct Positions {
|
||||
bit_packer: BitPacker4x,
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
|
||||
use bitpacking::BitPacker;
|
||||
use bitpacking::BitPacker4x;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
|
||||
use std::io::{self, Write};
|
||||
|
||||
pub struct PositionSerializer<W: io::Write> {
|
||||
|
||||
src/postings/block_search.rs (new file, 249 lines)
@@ -0,0 +1,249 @@
|
||||
use crate::postings::compression::AlignedBuffer;
|
||||
|
||||
/// This modules define the logic used to search for a doc in a given
|
||||
/// block. (at most 128 docs)
|
||||
///
|
||||
/// Searching within a block is a hotspot when running intersection.
|
||||
/// so it was worth defining it in its own module.
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod sse2 {
|
||||
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
|
||||
use std::arch::x86_64::__m128i as DataType;
|
||||
use std::arch::x86_64::_mm_add_epi32 as op_add;
|
||||
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
|
||||
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
|
||||
use std::arch::x86_64::_mm_set1_epi32 as set1;
|
||||
use std::arch::x86_64::_mm_setzero_si128 as set0;
|
||||
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
|
||||
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
|
||||
|
||||
const MASK1: i32 = 78;
|
||||
const MASK2: i32 = 177;
|
||||
|
||||
/// Performs an exhaustive linear search over the block.
///
/// There is no early exit here. We simply count the
/// number of elements that are `< target`.
|
||||
pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
|
||||
unsafe {
|
||||
let ptr = arr as *const AlignedBuffer as *const DataType;
|
||||
let vkey = set1(target as i32);
|
||||
let mut cnt = set0();
|
||||
// We work over 4 `__m128i` at a time.
|
||||
// A single `__m128i` actually contains 4 `u32`s.
|
||||
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
|
||||
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
|
||||
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
|
||||
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
|
||||
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
|
||||
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
|
||||
cnt = op_sub(cnt, sum);
|
||||
}
|
||||
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
|
||||
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
|
||||
_mm_cvtsi128_si32(cnt) as usize
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::linear_search_sse2_128;
|
||||
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
#[test]
|
||||
fn test_linear_search_sse2_128_u32() {
|
||||
let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
|
||||
for el in 0u32..128u32 {
|
||||
block[el as usize] = el * 2 + 1 << 18;
|
||||
}
|
||||
let target = block[64] + 1;
|
||||
assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This `linear search` browses the array exhaustively;
/// there is no early exit, since such a branch is very difficult to predict.
///
/// Coupled with the `exponential search`, this function is likely
/// to be called with the same `len`.
fn linear_search(arr: &[u32], target: u32) -> usize {
|
||||
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
|
||||
}
|
||||
|
||||
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
|
||||
let end = arr.len();
|
||||
let mut begin = 0;
|
||||
for &pivot in &[1, 3, 7, 15, 31, 63] {
|
||||
if pivot >= end {
|
||||
break;
|
||||
}
|
||||
if arr[pivot] > target {
|
||||
return (begin, pivot);
|
||||
}
|
||||
begin = pivot;
|
||||
}
|
||||
(begin, end)
|
||||
}
|
||||
|
||||
fn galloping(block_docs: &[u32], target: u32) -> usize {
|
||||
let (start, end) = exponential_search(&block_docs, target);
|
||||
start + linear_search(&block_docs[start..end], target)
|
||||
}
|
||||
|
||||
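To make the contract concrete, a standalone copy of the scalar path above with a tiny usage example; the returned value is the number of elements strictly smaller than `target`, i.e. the index of the first element `>= target`:

// Standalone copy of the scalar search shown above, for experimentation.
fn linear_search(arr: &[u32], target: u32) -> usize {
    arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}

fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
    let end = arr.len();
    let mut begin = 0;
    for &pivot in &[1, 3, 7, 15, 31, 63] {
        if pivot >= end {
            break;
        }
        if arr[pivot] > target {
            return (begin, pivot);
        }
        begin = pivot;
    }
    (begin, end)
}

fn galloping(block_docs: &[u32], target: u32) -> usize {
    let (start, end) = exponential_search(block_docs, target);
    start + linear_search(&block_docs[start..end], target)
}

fn main() {
    let block: Vec<u32> = (0..64u32).map(|i| i * 2 + 1).collect(); // 1, 3, 5, ...
    // 5 elements (1, 3, 5, 7, 9) are strictly smaller than 10,
    // so the first element >= 10 sits at index 5 (value 11).
    assert_eq!(galloping(&block, 10), 5);
}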
/// Tantivy may rely on SIMD instructions to search for a specific document within
|
||||
/// a given block.
|
||||
#[derive(Clone, Copy, PartialEq)]
|
||||
pub enum BlockSearcher {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
SSE2,
|
||||
Scalar,
|
||||
}
|
||||
|
||||
impl BlockSearcher {
|
||||
/// Search the first index containing an element greater or equal to
|
||||
/// the target.
|
||||
///
|
||||
/// The results should be equivalent to
|
||||
/// ```ignore
|
||||
/// block[..]
|
||||
// .iter()
|
||||
// .take_while(|&&val| val < target)
|
||||
// .count()
|
||||
/// ```
|
||||
///
|
||||
/// The `start` argument is just used to hint that the response is
|
||||
/// greater than or equal to `start`. The implementation may or may not use
|
||||
/// it for optimization.
|
||||
///
|
||||
/// # Assumption
|
||||
///
|
||||
/// The array len is > start.
|
||||
/// The block is sorted
|
||||
/// The target is assumed greater or equal to the `arr[start]`.
|
||||
/// The target is assumed smaller or equal to the last element of the block.
|
||||
///
|
||||
/// Currently the scalar implementation starts by an exponential search, and
|
||||
/// then operates a linear search in the result subarray.
|
||||
///
|
||||
/// If SSE2 instructions are available in the `(platform, running CPU)`,
|
||||
/// then we use a different implementation that does an exhaustive linear search over
|
||||
/// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
|
||||
/// of branches.
|
||||
pub(crate) fn search_in_block(
|
||||
self,
|
||||
block_docs: &AlignedBuffer,
|
||||
len: usize,
|
||||
start: usize,
|
||||
target: u32,
|
||||
) -> usize {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
|
||||
return sse2::linear_search_sse2_128(block_docs, target);
|
||||
}
|
||||
}
|
||||
start + galloping(&block_docs.0[start..len], target)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BlockSearcher {
|
||||
fn default() -> BlockSearcher {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if is_x86_feature_detected!("sse2") {
|
||||
return BlockSearcher::SSE2;
|
||||
}
|
||||
}
|
||||
BlockSearcher::Scalar
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::exponential_search;
|
||||
use super::linear_search;
|
||||
use super::BlockSearcher;
|
||||
use crate::postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
#[test]
|
||||
fn test_linear_search() {
|
||||
let len: usize = 50;
|
||||
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
|
||||
for target in 1..*arr.last().unwrap() {
|
||||
let res = linear_search(&arr[..], target);
|
||||
if res > 0 {
|
||||
assert!(arr[res - 1] < target);
|
||||
}
|
||||
if res < len {
|
||||
assert!(arr[res] >= target);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exponentiel_search() {
|
||||
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
|
||||
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
|
||||
assert_eq!(
|
||||
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
|
||||
(3, 7)
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
|
||||
let cursor = search_in_block_trivial_but_slow(block, target);
|
||||
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
|
||||
let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
|
||||
output_buffer[..block.len()].copy_from_slice(block);
|
||||
for i in 0..cursor {
|
||||
assert_eq!(
|
||||
block_searcher.search_in_block(
|
||||
&AlignedBuffer(output_buffer),
|
||||
block.len(),
|
||||
i,
|
||||
target
|
||||
),
|
||||
cursor
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
|
||||
use std::collections::HashSet;
|
||||
let mut targets = HashSet::new();
|
||||
for (i, val) in block.iter().cloned().enumerate() {
|
||||
if i > 0 {
|
||||
targets.insert(val - 1);
|
||||
}
|
||||
targets.insert(val);
|
||||
}
|
||||
for target in targets {
|
||||
util_test_search_in_block(block_searcher, block, target);
|
||||
}
|
||||
}
|
||||
|
||||
fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
|
||||
block.iter().take_while(|&&val| val < target).count()
|
||||
}
|
||||
|
||||
fn test_search_in_block_util(block_searcher: BlockSearcher) {
|
||||
for len in 1u32..128u32 {
|
||||
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
|
||||
util_test_search_in_block_all(block_searcher, &v[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_search_in_block_scalar() {
|
||||
test_search_in_block_util(BlockSearcher::Scalar);
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[test]
|
||||
fn test_search_in_block_sse2() {
|
||||
test_search_in_block_util(BlockSearcher::SSE2);
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::common::FixedSize;
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
use common::FixedSize;
|
||||
|
||||
pub const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * u32::SIZE_IN_BYTES;
|
||||
@@ -43,9 +43,14 @@ impl BlockEncoder {
|
||||
}
|
||||
}
|
||||
|
||||
/// We ensure that the OutputBuffer is aligned on 128 bits
|
||||
/// in order to run SSE2 linear search on it.
|
||||
#[repr(align(128))]
|
||||
pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
|
||||
|
||||
pub struct BlockDecoder {
|
||||
bitpacker: BitPacker4x,
|
||||
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
|
||||
output: AlignedBuffer,
|
||||
pub output_len: usize,
|
||||
}
|
||||
|
||||
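A quick standalone check of what `#[repr(align(128))]` buys; note the attribute value is an alignment in bytes, which more than covers the 16-byte alignment required by the aligned SSE2 load, and 128 is also the block size in `u32`s for the 4x bitpacker:

// Standalone check: #[repr(align(128))] forces 128-byte alignment on the buffer,
// which in particular satisfies the 16-byte (128-bit) alignment required by the
// aligned SSE2 load (_mm_load_si128) used by the block search.
const COMPRESSION_BLOCK_SIZE: usize = 128;

#[repr(align(128))]
struct AlignedBuffer([u32; COMPRESSION_BLOCK_SIZE]);

fn main() {
    let buffer = AlignedBuffer([0u32; COMPRESSION_BLOCK_SIZE]);
    assert_eq!(std::mem::align_of::<AlignedBuffer>(), 128);
    assert_eq!(&buffer as *const AlignedBuffer as usize % 16, 0);
}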
@@ -55,11 +60,9 @@ impl BlockDecoder {
|
||||
}
|
||||
|
||||
pub fn with_val(val: u32) -> BlockDecoder {
|
||||
let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
|
||||
output[COMPRESSION_BLOCK_SIZE] = 0u32;
|
||||
BlockDecoder {
|
||||
bitpacker: BitPacker4x::new(),
|
||||
output,
|
||||
output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
@@ -72,23 +75,28 @@ impl BlockDecoder {
|
||||
) -> usize {
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
self.bitpacker
|
||||
.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
|
||||
.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
self.bitpacker
|
||||
.decompress(&compressed_data, &mut self.output, num_bits)
|
||||
.decompress(&compressed_data, &mut self.output.0, num_bits)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output_array(&self) -> &[u32] {
|
||||
&self.output[..self.output_len]
|
||||
&self.output.0[..self.output_len]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
|
||||
(&self.output, self.output_len)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output(&self, idx: usize) -> u32 {
|
||||
self.output[idx]
|
||||
self.output.0[idx]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,12 +167,12 @@ impl VIntDecoder for BlockDecoder {
|
||||
num_els: usize,
|
||||
) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Postings module (also called inverted index)
|
||||
*/
|
||||
|
||||
mod block_search;
|
||||
pub(crate) mod compression;
|
||||
/// Postings module
|
||||
///
|
||||
@@ -16,6 +17,8 @@ mod skip;
|
||||
mod stacker;
|
||||
mod term_info;
|
||||
|
||||
pub(crate) use self::block_search::BlockSearcher;
|
||||
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
|
||||
@@ -28,7 +31,7 @@ pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
|
||||
|
||||
pub(crate) use self::stacker::compute_table_size;
|
||||
|
||||
pub use common::HasLen;
|
||||
pub use crate::common::HasLen;
|
||||
|
||||
pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32;
|
||||
pub(crate) type UnorderedTermId = u64;
|
||||
@@ -45,22 +48,25 @@ pub(crate) enum FreqReadingOption {
|
||||
pub mod tests {
|
||||
|
||||
use super::*;
|
||||
use core::Index;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentReader;
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fieldnorm::FieldNormReader;
|
||||
use indexer::operation::AddOperation;
|
||||
use indexer::SegmentWriter;
|
||||
use query::Scorer;
|
||||
use crate::core::Index;
|
||||
use crate::core::SegmentComponent;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::docset::{DocSet, SkipResult};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::indexer::operation::AddOperation;
|
||||
use crate::indexer::SegmentWriter;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::query::Scorer;
|
||||
use crate::schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
|
||||
use crate::schema::{Field, TextOptions};
|
||||
use crate::schema::{IndexRecordOption, TextFieldIndexing};
|
||||
use crate::tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
|
||||
use crate::DocId;
|
||||
use crate::Score;
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use schema::Field;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
|
||||
use std::iter;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
#[test]
pub fn test_position_write() {
@@ -104,9 +110,7 @@ pub mod tests {
let searcher = index.reader().unwrap().searcher();
let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
let term = Term::from_field_text(title, "abc");

let mut positions = Vec::new();

{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
@@ -159,6 +163,52 @@ pub mod tests {
}
}

#[test]
pub fn test_drop_token_that_are_too_long() {
let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
exceeding_token_text.push_str(" hello");
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
.set_tokenizer("simple_no_truncation"),
);
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
{
index_writer.add_document(doc!(text_field=>exceeding_token_text));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inverted_index = segment_reader.inverted_index(text_field);
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
assert_eq!(&bytes, b"hello");
}
{
index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(1u32);
let inverted_index = segment_reader.inverted_index(text_field);
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
assert_eq!(&bytes[..], ok_token_text.as_bytes());
}
}

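The new test above relies on the usual write/read cycle: a commit becomes visible to an `IndexReader` only after `reload()`. A condensed sketch of that cycle, using the same calls as the test (it assumes the surrounding test module and its imports, so it is not a standalone program):

// 1. add documents and commit them with the writer,
// 2. reload the reader so the new segment becomes visible,
// 3. obtain a fresh searcher and inspect segment readers.
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field => "hello"));
index_writer.commit().unwrap();
// Without this reload, the searcher would still reflect the empty index.
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);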
#[test]
pub fn test_position_and_fieldnorm1() {
let mut positions = Vec::new();
@@ -170,7 +220,7 @@ pub mod tests {

{
let mut segment_writer =
SegmentWriter::for_segment(18, segment.clone(), &schema).unwrap();
SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
{
let mut doc = Document::default();
// checking that position works if the field has two values
@@ -460,53 +510,52 @@ pub mod tests {
}
}

lazy_static! {
pub static ref TERM_A: Term = {
let field = Field(0);
Term::from_field_text(field, "a")
};
pub static ref TERM_B: Term = {
let field = Field(0);
Term::from_field_text(field, "b")
};
pub static ref TERM_C: Term = {
let field = Field(0);
Term::from_field_text(field, "c")
};
pub static ref TERM_D: Term = {
let field = Field(0);
Term::from_field_text(field, "d")
};
pub static ref INDEX: Index = {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
pub static TERM_A: Lazy<Term> = Lazy::new(|| {
let field = Field(0);
Term::from_field_text(field, "a")
});
pub static TERM_B: Lazy<Term> = Lazy::new(|| {
let field = Field(0);
Term::from_field_text(field, "b")
});
pub static TERM_C: Lazy<Term> = Lazy::new(|| {
let field = Field(0);
Term::from_field_text(field, "c")
});
pub static TERM_D: Lazy<Term> = Lazy::new(|| {
let field = Field(0);
Term::from_field_text(field, "d")
});

let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
pub static INDEX: Lazy<Index> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();

let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000;
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for _ in 0..posting_list_size {
let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) {
doc.add_text(text_field, "a");
}
if rng.gen_bool(1f64 / 10f64) {
doc.add_text(text_field, "b");
}
if rng.gen_bool(1f64 / 5f64) {
doc.add_text(text_field, "c");
}
doc.add_text(text_field, "d");
index_writer.add_document(doc);
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);

let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000;
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for _ in 0..posting_list_size {
let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) {
doc.add_text(text_field, "a");
}
assert!(index_writer.commit().is_ok());
if rng.gen_bool(1f64 / 10f64) {
doc.add_text(text_field, "b");
}
if rng.gen_bool(1f64 / 5f64) {
doc.add_text(text_field, "c");
}
doc.add_text(text_field, "d");
index_writer.add_document(doc);
}
index
};
}
assert!(index_writer.commit().is_ok());
}
index
});

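The statics above move from the `lazy_static!` macro to `once_cell::sync::Lazy`. A minimal standalone illustration of the two forms (the value here is only for the example):

use once_cell::sync::Lazy;

// With lazy_static:
//     lazy_static! {
//         pub static ref ANSWER: String = format!("{}", 42);
//     }
//
// With once_cell, the same lazily initialized global is an ordinary static:
pub static ANSWER: Lazy<String> = Lazy::new(|| format!("{}", 42));

fn main() {
    // The closure runs on first access, then the value is cached.
    assert_eq!(ANSWER.as_str(), "42");
}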
/// Wraps a given docset, and forward alls call but the
/// `.skip_next(...)`. This is useful to test that a specialized
@@ -540,7 +589,7 @@ pub mod tests {
}
}

pub fn test_skip_against_unoptimized<F: Fn() -> Box<DocSet>>(
pub fn test_skip_against_unoptimized<F: Fn() -> Box<dyn DocSet>>(
postings_factory: F,
targets: Vec<u32>,
) {

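Another recurring 2018-edition change in these hunks: trait objects gain the explicit `dyn` keyword, so `Box<DocSet>` becomes `Box<dyn DocSet>`. A small self-contained illustration with an invented trait:

trait Animal {
    fn name(&self) -> &'static str;
}

struct Dog;

impl Animal for Dog {
    fn name(&self) -> &'static str {
        "dog"
    }
}

// Pre-2018 style would write `Box<Animal>`; the bare trait name as a type is
// deprecated, so the trait object is now spelled `Box<dyn Animal>`.
fn make_animal() -> Box<dyn Animal> {
    Box::new(Dog)
}

fn main() {
    assert_eq!(make_animal().name(), "dog");
}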
@@ -1,4 +1,4 @@
use docset::DocSet;
use crate::docset::DocSet;

/// Postings (also called inverted list)
///

|
||||
use super::stacker::{Addr, MemoryArena, TermHashMap};
|
||||
|
||||
use postings::recorder::{
|
||||
use crate::postings::recorder::{
|
||||
BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
|
||||
};
|
||||
use postings::UnorderedTermId;
|
||||
use postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Field, FieldEntry, FieldType, Schema, Term};
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::postings::{FieldSerializer, InvertedIndexSerializer};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::tokenizer::TokenStream;
|
||||
use crate::tokenizer::{Token, MAX_TOKEN_LEN};
|
||||
use crate::DocId;
|
||||
use crate::Result;
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use termdict::TermOrdinal;
|
||||
use tokenizer::Token;
|
||||
use tokenizer::TokenStream;
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
|
||||
fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter> {
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
@@ -35,6 +35,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
FieldType::U64(_)
|
||||
| FieldType::I64(_)
|
||||
| FieldType::F64(_)
|
||||
| FieldType::Date(_)
|
||||
| FieldType::HierarchicalFacet => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
|
||||
FieldType::Bytes => {
|
||||
@@ -49,7 +50,7 @@ pub struct MultiFieldPostingsWriter {
|
||||
heap: MemoryArena,
|
||||
schema: Schema,
|
||||
term_index: TermHashMap,
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter>>,
|
||||
per_field_postings_writers: Vec<Box<dyn PostingsWriter>>,
|
||||
}
|
||||
|
||||
fn make_field_partition(
|
||||
@@ -99,7 +100,12 @@ impl MultiFieldPostingsWriter {
|
||||
self.term_index.mem_usage() + self.heap.mem_usage()
|
||||
}
|
||||
|
||||
pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
|
||||
pub fn index_text(
|
||||
&mut self,
|
||||
doc: DocId,
|
||||
field: Field,
|
||||
token_stream: &mut dyn TokenStream,
|
||||
) -> u32 {
|
||||
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
|
||||
postings_writer.index_text(
|
||||
&mut self.term_index,
|
||||
@@ -138,10 +144,10 @@ impl MultiFieldPostingsWriter {
FieldType::Str(_) | FieldType::HierarchicalFacet => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let mut unordered_term_ids = term_offsets[start..stop]
let unordered_term_ids = term_offsets[start..stop]
.iter()
.map(|&(_, _, bucket)| bucket);
let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
let mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
@@ -149,7 +155,7 @@ impl MultiFieldPostingsWriter {
.collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => {}
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {}
FieldType::Bytes => {}
}

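The hunk above builds a map from the hash table's unordered term ids to ordered term ordinals by enumerating term buckets in sorted order; dropping the `mut` bindings only silences unused-mut warnings. A standalone sketch of the same enumerate-into-HashMap pattern (the data is invented for the example):

use std::collections::HashMap;

fn main() {
    // Pretend these are the "unordered" ids handed out at indexing time,
    // listed in the order the sorted term dictionary will visit them.
    let unordered_term_ids = vec![42u64, 7, 19];

    // The position in the sorted iteration becomes the ordered term ordinal.
    let mapping: HashMap<u64, u64> = unordered_term_ids
        .iter()
        .enumerate()
        .map(|(term_ord, &unord_term_id)| (unord_term_id, term_ord as u64))
        .collect();

    assert_eq!(mapping[&7], 1);
}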
@@ -194,7 +200,7 @@ pub trait PostingsWriter {
fn serialize(
&self,
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<'_>,
term_heap: &MemoryArena,
heap: &MemoryArena,
) -> io::Result<()>;
@@ -205,13 +211,23 @@ pub trait PostingsWriter {
term_index: &mut TermHashMap,
doc_id: DocId,
field: Field,
token_stream: &mut TokenStream,
token_stream: &mut dyn TokenStream,
heap: &mut MemoryArena,
) -> u32 {
let mut term = Term::for_field(field);
let mut sink = |token: &Token| {
term.set_text(token.text.as_str());
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
// We skip all tokens with a len greater than u16.
if token.text.len() <= MAX_TOKEN_LEN {
term.set_text(token.text.as_str());
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
} else {
info!(
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
MAX_TOKEN_LEN in the documentation for more information.",
token.text.len(),
MAX_TOKEN_LEN
);
}
};
token_stream.process(&mut sink)
}
@@ -236,7 +252,7 @@ impl<Rec: Recorder + 'static> SpecializedPostingsWriter<Rec> {
}

/// Builds a `SpecializedPostingsWriter` storing its data in a heap.
pub fn new_boxed() -> Box<PostingsWriter> {
pub fn new_boxed() -> Box<dyn PostingsWriter> {
Box::new(SpecializedPostingsWriter::<Rec>::new())
}
}
@@ -273,7 +289,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
fn serialize(
&self,
term_addrs: &[(&[u8], Addr, UnorderedTermId)],
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<'_>,
termdict_heap: &MemoryArena,
heap: &MemoryArena,
) -> io::Result<()> {

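The new guard in `index_text` drops any token longer than `MAX_TOKEN_LEN` instead of indexing it. A self-contained sketch of the same closure-sink-with-length-guard pattern; the constant value, token type, and logging call below are stand-ins, not tantivy's:

// Stand-in constant and token type, for illustration only.
const MAX_TOKEN_LEN: usize = 65_535;

struct Token {
    text: String,
}

fn index_tokens(tokens: &[Token]) -> u32 {
    let mut indexed = 0u32;
    let mut sink = |token: &Token| {
        // Tokens longer than the limit are skipped rather than indexed.
        if token.text.len() <= MAX_TOKEN_LEN {
            indexed += 1;
        } else {
            eprintln!(
                "A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped.",
                token.text.len(),
                MAX_TOKEN_LEN
            );
        }
    };
    for token in tokens {
        sink(token);
    }
    indexed
}

fn main() {
    let tokens = vec![
        Token { text: "hello".to_string() },
        Token { text: "A".repeat(MAX_TOKEN_LEN + 1) },
    ];
    assert_eq!(index_tokens(&tokens), 1);
}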
@@ -1,8 +1,8 @@
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use common::{read_u32_vint, write_u32_vint};
use postings::FieldSerializer;
use crate::common::{read_u32_vint, write_u32_vint};
use crate::postings::FieldSerializer;
use crate::DocId;
use std::io;
use DocId;

const POSITION_END: u32 = 0;

@@ -72,7 +72,7 @@ pub(crate) trait Recorder: Copy + 'static {
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<'_>,
heap: &MemoryArena,
) -> io::Result<()>;
}
@@ -108,7 +108,7 @@ impl Recorder for NothingRecorder {
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<'_>,
heap: &MemoryArena,
) -> io::Result<()> {
let buffer = buffer_lender.lend_u8();
@@ -159,7 +159,7 @@ impl Recorder for TermFrequencyRecorder {
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<'_>,
heap: &MemoryArena,
) -> io::Result<()> {
let buffer = buffer_lender.lend_u8();
@@ -208,7 +208,7 @@ impl Recorder for TFAndPositionRecorder {
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
serializer: &mut FieldSerializer<'_>,
heap: &MemoryArena,
) -> io::Result<()> {
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();

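`FieldSerializer` borrows from the serializer that created it, so it carries a lifetime parameter; the repeated `FieldSerializer<'_>` changes simply make that elided lifetime visible, as the `rust_2018_idioms` lints request. A tiny illustration with a made-up type:

struct Wrapper<'a> {
    value: &'a u32,
}

// One could write `w: &Wrapper`, hiding the fact that the type is
// parameterized by a lifetime. `Wrapper<'_>` keeps the elision but makes the
// borrow visible at the signature.
fn read(w: &Wrapper<'_>) -> u32 {
    *w.value
}

fn main() {
    let v = 7u32;
    let w = Wrapper { value: &v };
    assert_eq!(read(&w), 7);
}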
@@ -1,20 +1,21 @@
use common::BitSet;
use common::HasLen;
use common::{BinarySerializable, VInt};
use docset::{DocSet, SkipResult};
use crate::common::BitSet;
use crate::common::HasLen;
use crate::common::{BinarySerializable, VInt};
use crate::docset::{DocSet, SkipResult};
use crate::positions::PositionReader;
use crate::postings::compression::{compressed_block_size, AlignedBuffer};
use crate::postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::serializer::PostingsSerializer;
use crate::postings::BlockSearcher;
use crate::postings::FreqReadingOption;
use crate::postings::Postings;
use crate::postings::SkipReader;
use crate::postings::USE_SKIP_INFO_LIMIT;
use crate::schema::IndexRecordOption;
use crate::DocId;
use owned_read::OwnedRead;
use positions::PositionReader;
use postings::compression::compressed_block_size;
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use postings::serializer::PostingsSerializer;
use postings::FreqReadingOption;
use postings::Postings;
use postings::SkipReader;
use postings::USE_SKIP_INFO_LIMIT;
use schema::IndexRecordOption;
use std::cmp::Ordering;
use tantivy_fst::Streamer;
use DocId;

struct PositionComputer {
// store the amount of position int
@@ -60,6 +61,7 @@ pub struct SegmentPostings {
block_cursor: BlockSegmentPostings,
cur: usize,
position_computer: Option<PositionComputer>,
block_searcher: BlockSearcher,
}

impl SegmentPostings {
@@ -70,6 +72,7 @@ impl SegmentPostings {
block_cursor: empty_block_cursor,
cur: COMPRESSION_BLOCK_SIZE,
position_computer: None,
block_searcher: BlockSearcher::default(),
}
}

@@ -117,42 +120,33 @@ impl SegmentPostings {
block_cursor: segment_block_postings,
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
position_computer: positions_stream_opt.map(PositionComputer::new),
block_searcher: BlockSearcher::default(),
}
}
}

fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}

fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
let end = arr.len();
let mut begin = 0;
for &pivot in &[1, 3, 7, 15, 31, 63] {
if pivot >= end {
break;
}
if arr[pivot] > target {
return (begin, pivot);
}
begin = pivot;
}
(begin, end)
}

/// Search the first index containing an element greater or equal to the target.
///
/// # Assumption
///
/// The array is assumed non empty.
/// The target is assumed greater or equal to the first element.
/// The target is assumed smaller or equal to the last element.
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(block_docs, target);
start + linear_search(&block_docs[start..end], target)
}

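These helpers (removed here in favor of the shared `BlockSearcher`) combine an exponential probe with a branch-free linear scan: in a sorted block, counting the elements strictly below the target is exactly the index of the first element greater than or equal to it. A standalone check of that equivalence:

fn linear_search(arr: &[u32], target: u32) -> usize {
    // In a sorted slice, the number of elements below `target` is the index
    // of the first element >= `target`.
    arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}

fn main() {
    let block = [2u32, 5, 5, 9, 12];
    assert_eq!(linear_search(&block, 5), 1);
    assert_eq!(linear_search(&block, 10), 4);
    assert_eq!(
        linear_search(&block, 5),
        block.iter().position(|&doc| doc >= 5).unwrap()
    );
}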
impl DocSet for SegmentPostings {
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
let term_freq = self.term_freq() as usize;
if let Some(position_computer) = self.position_computer.as_mut() {
position_computer.add_skip(term_freq);
}
}
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
}
true
}

fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() {
return SkipResult::End;
@@ -175,7 +169,6 @@ impl DocSet for SegmentPostings {

// skip blocks until one that might contain the target
// check if we need to go to the next block
let need_positions = self.position_computer.is_some();
let mut sum_freqs_skipped: u32 = 0;
if !self
.block_cursor
@@ -189,7 +182,7 @@ impl DocSet for SegmentPostings {
// we are not in the right block.
//
// First compute all of the freqs skipped from the current block.
if need_positions {
if self.position_computer.is_some() {
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
match self.block_cursor.skip_to(target) {
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
@@ -208,25 +201,21 @@ impl DocSet for SegmentPostings {
self.cur = 0;
}

// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
let new_cur = self
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
let cur = self.cur;

if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
.sum::<u32>();
self.position_computer
.as_mut()
.unwrap()
.add_skip(sum_freqs_skipped as usize);
// we're in the right block now, start with an exponential search
let (output, len) = self.block_cursor.docs_aligned();
let new_cur = self
.block_searcher
.search_in_block(&output, len, cur, target);
if let Some(position_computer) = self.position_computer.as_mut() {
sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
position_computer.add_skip(sum_freqs_skipped as usize);
}
self.cur = new_cur;

// `doc` is now the first element >= `target`
let doc = block_docs[new_cur];
let doc = output.0[new_cur];
debug_assert!(doc >= target);
if doc == target {
SkipResult::Reached
@@ -235,40 +224,25 @@ impl DocSet for SegmentPostings {
}
}

// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
if self.position_computer.is_some() {
let term_freq = self.term_freq() as usize;
self.position_computer.as_mut().unwrap().add_skip(term_freq);
}
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
}
true
}

fn size_hint(&self) -> u32 {
self.len() as u32
}

/// Return the current document's `DocId`.
///
/// # Panics
///
/// Will panics if called without having called advance before.
#[inline]
fn doc(&self) -> DocId {
let docs = self.block_cursor.docs();
debug_assert!(
self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling .doc()."
"Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
);
docs[self.cur]
}

fn size_hint(&self) -> u32 {
self.len() as u32
}

fn append_to_bitset(&mut self, bitset: &mut BitSet) {
// finish the current block
if self.advance() {
@@ -292,17 +266,33 @@ impl HasLen for SegmentPostings {
}

impl Postings for SegmentPostings {
/// Returns the frequency associated to the current document.
/// If the schema is set up so that no frequency have been encoded,
/// this method should always return 1.
///
/// # Panics
///
/// Will panics if called without having called advance before.
fn term_freq(&self) -> u32 {
debug_assert!(
// Here we do not use the len of `freqs()`
// because it is actually ok to request for the freq of doc
// even if no frequency were encoded for the field.
//
// In that case we hit the block just as if the frequency had been
// decoded. The block is simply prefilled by the value 1.
self.cur < COMPRESSION_BLOCK_SIZE,
"Have you forgotten to call `.advance()` at least once before calling \
`.term_freq()`."
);
self.block_cursor.freq(self.cur)
}

fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
if self.position_computer.is_some() {
output.resize(self.term_freq() as usize, 0u32);
self.position_computer
.as_mut()
.unwrap()
.positions_with_offset(offset, &mut output[..])
let term_freq = self.term_freq() as usize;
if let Some(position_comp) = self.position_computer.as_mut() {
output.resize(term_freq, 0u32);
position_comp.positions_with_offset(offset, &mut output[..]);
} else {
output.clear();
}
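Several of these rewrites replace an `is_some()` check followed by `unwrap()` with a single `if let Some(...) = ... .as_mut()` binding, which avoids the second lookup and the panic path entirely. A minimal standalone version of the pattern:

fn add_skip(counter: &mut Option<u32>, amount: u32) {
    // Before: if counter.is_some() { *counter.as_mut().unwrap() += amount; }
    // After: bind the inner value once, with no unwrap.
    if let Some(value) = counter.as_mut() {
        *value += amount;
    }
}

fn main() {
    let mut some_counter = Some(3u32);
    let mut no_counter: Option<u32> = None;
    add_skip(&mut some_counter, 4);
    add_skip(&mut no_counter, 4);
    assert_eq!(some_counter, Some(7));
    assert_eq!(no_counter, None);
}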
@@ -424,6 +414,10 @@ impl BlockSegmentPostings {
self.doc_decoder.output_array()
}

pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
self.doc_decoder.output_aligned()
}

/// Return the document at index `idx` of the block.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
@@ -614,38 +608,20 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {

#[cfg(test)]
mod tests {

use super::exponential_search;
use super::linear_search;
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
use super::SegmentPostings;
use common::HasLen;
use core::Index;
use docset::DocSet;
use schema::IndexRecordOption;
use schema::Schema;
use schema::Term;
use schema::INDEXED;
use crate::common::HasLen;
use crate::core::Index;
use crate::docset::DocSet;
use crate::postings::postings::Postings;
use crate::schema::IndexRecordOption;
use crate::schema::Schema;
use crate::schema::Term;
use crate::schema::INDEXED;
use crate::DocId;
use crate::SkipResult;
use tantivy_fst::Streamer;
use DocId;
use SkipResult;

#[test]
fn test_linear_search() {
let len: usize = 50;
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
for target in 1..*arr.last().unwrap() {
let res = linear_search(&arr[..], target);
if res > 0 {
assert!(arr[res - 1] < target);
}
if res < len {
assert!(arr[res] >= target);
}
}
}

#[test]
fn test_empty_segment_postings() {
@@ -655,6 +631,18 @@ mod tests {
assert_eq!(postings.len(), 0);
}

#[test]
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
fn test_panic_if_doc_called_before_advance() {
SegmentPostings::empty().doc();
}

#[test]
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
fn test_panic_if_freq_called_before_advance() {
SegmentPostings::empty().term_freq();
}

#[test]
fn test_empty_block_segment_postings() {
let mut postings = BlockSegmentPostings::empty();
@@ -662,56 +650,6 @@ mod tests {
assert_eq!(postings.doc_freq(), 0);
}

fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
block
.iter()
.cloned()
.enumerate()
.filter(|&(_, ref val)| *val >= target)
.next()
.unwrap()
.0
}

#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
assert_eq!(
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
(3, 7)
);
}

fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!(
search_within_block(block, target),
search_within_block_trivial_but_slow(block, target)
);
}

fn util_test_search_within_block_all(block: &[u32]) {
use std::collections::HashSet;
let mut targets = HashSet::new();
for (i, val) in block.iter().cloned().enumerate() {
if i > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_within_block(block, target);
}
}

#[test]
fn test_search_within_block() {
for len in 1u32..128u32 {
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
util_test_search_within_block_all(&v[..]);
}
}

#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());

@@ -1,20 +1,20 @@
use super::TermInfo;
use common::{BinarySerializable, VInt};
use common::{CompositeWrite, CountingWriter};
use core::Segment;
use directory::WritePtr;
use positions::PositionSerializer;
use postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
use postings::skip::SkipSerializer;
use postings::USE_SKIP_INFO_LIMIT;
use schema::Schema;
use schema::{Field, FieldEntry, FieldType};
use crate::common::{BinarySerializable, VInt};
use crate::common::{CompositeWrite, CountingWriter};
use crate::core::Segment;
use crate::directory::WritePtr;
use crate::positions::PositionSerializer;
use crate::postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
use crate::postings::skip::SkipSerializer;
use crate::postings::USE_SKIP_INFO_LIMIT;
use crate::schema::Schema;
use crate::schema::{Field, FieldEntry, FieldType};
use crate::termdict::{TermDictionaryBuilder, TermOrdinal};
use crate::DocId;
use crate::Result;
use std::io::{self, Write};
use termdict::{TermDictionaryBuilder, TermOrdinal};
use DocId;
use Result;

/// `PostingsSerializer` is in charge of serializing
/// `InvertedIndexSerializer` is in charge of serializing
/// postings on disk, in the
/// * `.idx` (inverted index)
/// * `.pos` (positions file)
@@ -54,7 +54,7 @@ pub struct InvertedIndexSerializer {
}

impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
/// Open a new `InvertedIndexSerializer` for the given segment
fn create(
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
@@ -73,7 +73,7 @@ impl InvertedIndexSerializer {

/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
use SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
use crate::SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
InvertedIndexSerializer::create(
CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
@@ -91,7 +91,7 @@ impl InvertedIndexSerializer {
&mut self,
field: Field,
total_num_tokens: u64,
) -> io::Result<FieldSerializer> {
) -> io::Result<FieldSerializer<'_>> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
@@ -175,7 +175,7 @@ impl<'a> FieldSerializer<'a> {
let positions_idx = self
.positions_serializer_opt
.as_ref()
.map(|positions_serializer| positions_serializer.positions_idx())
.map(PositionSerializer::positions_idx)
.unwrap_or(0u64);
TermInfo {
doc_freq: 0,

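The last change above swaps a closure that only forwards to a method for the method path itself (`.map(PositionSerializer::positions_idx)`), the style clippy's redundant-closure lints suggest. A standalone equivalent with an ordinary type:

fn main() {
    let name: Option<String> = Some("tantivy".to_string());

    // Closure form and method-path form are equivalent here:
    let with_closure = name.as_ref().map(|s| s.len()).unwrap_or(0);
    let with_method_path = name.as_ref().map(String::len).unwrap_or(0);

    assert_eq!(with_closure, 7);
    assert_eq!(with_method_path, 7);
}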
@@ -1,8 +1,8 @@
use common::BinarySerializable;
use crate::common::BinarySerializable;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::schema::IndexRecordOption;
use crate::DocId;
use owned_read::OwnedRead;
use postings::compression::COMPRESSION_BLOCK_SIZE;
use schema::IndexRecordOption;
use DocId;

pub struct SkipSerializer {
buffer: Vec<u8>,
Some files were not shown because too many files have changed in this diff.