mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-27 20:42:54 +00:00
Compare commits
83 Commits
issue/483
...
bump-versi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8a3e520bfe | ||
|
|
bcd7386fc5 | ||
|
|
c23a7c992b | ||
|
|
2a88094ec4 | ||
|
|
ca3cfddab4 | ||
|
|
7bd9f9773b | ||
|
|
e2da92fcb5 | ||
|
|
876e1451c4 | ||
|
|
a37d2f9777 | ||
|
|
4822940b19 | ||
|
|
d590f4c6b0 | ||
|
|
edfa619519 | ||
|
|
96f194635f | ||
|
|
444662485f | ||
|
|
943c25d0f8 | ||
|
|
5c0b2a4579 | ||
|
|
9870a9258d | ||
|
|
7102b363f5 | ||
|
|
66b4615e4e | ||
|
|
da46913839 | ||
|
|
3df037961f | ||
|
|
8ffae47854 | ||
|
|
1a90a1f3b0 | ||
|
|
dac50c6aeb | ||
|
|
31b22c5acc | ||
|
|
8e50921363 | ||
|
|
96a4f503ec | ||
|
|
9df288b0c9 | ||
|
|
b7c2d0de97 | ||
|
|
62445e0ec8 | ||
|
|
a228825462 | ||
|
|
d3eabd14bc | ||
|
|
c967031d21 | ||
|
|
d823163d52 | ||
|
|
c4f59f202d | ||
|
|
acd29b535d | ||
|
|
2cd31bcda2 | ||
|
|
99870de55c | ||
|
|
cad2d91845 | ||
|
|
79f3cd6cf4 | ||
|
|
e3abb4481b | ||
|
|
bfa61d2f2f | ||
|
|
6c0e621fdb | ||
|
|
a8cc5208f1 | ||
|
|
83eb0d0cb7 | ||
|
|
ee6e273365 | ||
|
|
6ea34b3d53 | ||
|
|
22cf1004bd | ||
|
|
5768d93171 | ||
|
|
663dd89c05 | ||
|
|
a934577168 | ||
|
|
94f1885334 | ||
|
|
2ccfdb97b5 | ||
|
|
e67883138d | ||
|
|
f5c65f1f60 | ||
|
|
ec73a9a284 | ||
|
|
a814a31f1e | ||
|
|
9acadb3756 | ||
|
|
774fcecf23 | ||
|
|
27c9fa6028 | ||
|
|
fdefea9e26 | ||
|
|
b422f9c389 | ||
|
|
9451fd5b09 | ||
|
|
788b3803d9 | ||
|
|
5b11228083 | ||
|
|
515adff644 | ||
|
|
e70a45426a | ||
|
|
e14701e9cd | ||
|
|
45e62d4329 | ||
|
|
76d2b4dab6 | ||
|
|
04e9606638 | ||
|
|
a5c57ebbd9 | ||
|
|
96eaa5bc63 | ||
|
|
f1d30ab196 | ||
|
|
4507df9255 | ||
|
|
e8625548b7 | ||
|
|
50ed6fb534 | ||
|
|
76609deadf | ||
|
|
749e62c40b | ||
|
|
259ce567d1 | ||
|
|
4c93b096eb | ||
|
|
6a547b0b5f | ||
|
|
e99d1a2355 |
21
.travis.yml
21
.travis.yml
@@ -10,7 +10,7 @@ env:
|
||||
global:
|
||||
- CRATE_NAME=tantivy
|
||||
- TRAVIS_CARGO_NIGHTLY_FEATURE=""
|
||||
- secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
# - secure: eC8HjTi1wgRVCsMAeXEXt8Ckr0YBSGOEnQkkW4/Nde/OZ9jJjz2nmP1ELQlDE7+czHub2QvYtDMG0parcHZDx/Kus0yvyn08y3g2rhGIiE7y8OCvQm1Mybu2D/p7enm6shXquQ6Z5KRfRq+18mHy80wy9ABMA/ukEZdvnfQ76/Een8/Lb0eHaDoXDXn3PqLVtByvSfQQ7OhS60dEScu8PWZ6/l1057P5NpdWbMExBE7Ro4zYXNhkJeGZx0nP/Bd4Jjdt1XfPzMEybV6NZ5xsTILUBFTmOOt603IsqKGov089NExqxYu5bD3K+S4MzF1Nd6VhomNPJqLDCfhlymJCUj5n5Ku4yidlhQbM4Ej9nGrBalJnhcjBjPua5tmMF2WCxP9muKn/2tIOu1/+wc0vMf9Yd3wKIkf5+FtUxCgs2O+NslWvmOMAMI/yD25m7hb4t1IwE/4Bk+GVcWJRWXbo0/m6ZUHzRzdjUY2a1qvw7C9udzdhg7gcnXwsKrSWi2NjMiIVw86l+Zim0nLpKIN41sxZHLaFRG63Ki8zQ/481LGn32awJ6i3sizKS0WD+N1DfR2qYMrwYHaMN0uR0OFXYTJkFvTFttAeUY3EKmRKAuMhmO2YRdSr4/j/G5E9HMc1gSGJj6PxgpQU7EpvxRsmoVAEJr0mszmOj9icGHep/FM=
|
||||
|
||||
addons:
|
||||
apt:
|
||||
@@ -38,12 +38,12 @@ matrix:
|
||||
# Linux
|
||||
#- env: TARGET=aarch64-unknown-linux-gnu
|
||||
#- env: TARGET=i686-unknown-linux-gnu
|
||||
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1
|
||||
- env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 UPLOAD_DOCS=1
|
||||
# - env: TARGET=x86_64-unknown-linux-musl CODECOV=1
|
||||
|
||||
# OSX
|
||||
- env: TARGET=x86_64-apple-darwin
|
||||
os: osx
|
||||
#- env: TARGET=x86_64-apple-darwin
|
||||
# os: osx
|
||||
|
||||
before_install:
|
||||
- set -e
|
||||
@@ -52,6 +52,7 @@ before_install:
|
||||
install:
|
||||
- sh ci/install.sh
|
||||
- source ~/.cargo/env || true
|
||||
- env | grep "TRAVIS"
|
||||
|
||||
before_script:
|
||||
- export PATH=$HOME/.cargo/bin:$PATH
|
||||
@@ -64,10 +65,20 @@ script:
|
||||
before_deploy:
|
||||
- sh ci/before_deploy.sh
|
||||
|
||||
after_success:
|
||||
# Needs GH_TOKEN env var to be set in travis settings
|
||||
- if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
|
||||
- if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi
|
||||
|
||||
cache: cargo
|
||||
before_cache:
|
||||
# Travis can't cache files that are not readable by "others"
|
||||
- chmod -R a+r $HOME/.cargo
|
||||
- find ./target/debug -type f -maxdepth 1 -delete
|
||||
- rm -f ./target/.rustc_info.json
|
||||
- rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
|
||||
- rm -r target/debug/examples/
|
||||
- ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
|
||||
|
||||
#branches:
|
||||
# only:
|
||||
@@ -77,4 +88,4 @@ before_cache:
|
||||
|
||||
notifications:
|
||||
email:
|
||||
on_success: never
|
||||
on_success: never
|
||||
|
||||
89
CHANGELOG.md
89
CHANGELOG.md
@@ -1,9 +1,98 @@
|
||||
Tantivy 0.10.0
|
||||
=====================
|
||||
|
||||
*Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.*
|
||||
|
||||
- Added an ASCII folding filter (@drusellers)
|
||||
- Bugfix in `query.count` in presence of deletes (@pmasurel)
|
||||
- Added `.explain(...)` in `Query` and `Weight` to (@pmasurel)
|
||||
- Added an efficient way to `delete_all_documents` in `IndexWriter` (@petr-tik).
|
||||
All segments are simply removed.
|
||||
|
||||
Minor
|
||||
---------
|
||||
- Small simplification of the code.
|
||||
Calling .freq() or .doc() when .advance() has never been called
|
||||
on segment postings should panic from now on.
|
||||
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
|
||||
- Fast fields are now preloaded when the `SegmentReader` is created.
|
||||
- `IndexMeta` is now public. (@hntd187)
|
||||
- `IndexWriter` `add_document`, `delete_term`. `IndexWriter` is `Sync`, making it possible to use it with a `
|
||||
Arc<RwLock<IndexWriter>>`. `add_document` and `delete_term` can
|
||||
only require a read lock. (@pmasurel)
|
||||
- Introducing `Opstamp` as an expressive type alias for `u64`. (@petr-tik)
|
||||
- Stamper now relies on `AtomicU64` on all platforms (@petr-tik)
|
||||
|
||||
## How to update?
|
||||
|
||||
Your existing indexes are usable as is, but you may need some
|
||||
trivial updates.
|
||||
|
||||
### Fast fields
|
||||
|
||||
Fast fields used to be accessed directly from the `SegmentReader`.
|
||||
The API changed, you are now required to acquire your fast field reader via the
|
||||
`segment_reader.fast_fields()`, and use one of the typed method:
|
||||
- `.u64()`, `.i64()` if your field is single-valued ;
|
||||
- `.u64s()`, `.i64s()` if your field is multi-valued ;
|
||||
- `.bytes()` if your field is bytes fast field.
|
||||
|
||||
|
||||
|
||||
Tantivy 0.9.0
|
||||
=====================
|
||||
*0.9.0 index format is not compatible with the
|
||||
previous index format.*
|
||||
- MAJOR BUGFIX :
|
||||
Some `Mmap` objects were being leaked, and would never get released. (@fulmicoton)
|
||||
- Removed most unsafe (@fulmicoton)
|
||||
- Indexer memory footprint improved. (VInt comp, inlining the first block. (@fulmicoton)
|
||||
- Stemming in other language possible (@pentlander)
|
||||
- Segments with no docs are deleted earlier (@barrotsteindev)
|
||||
- Added grouped add and delete operations.
|
||||
They are guaranteed to happen together (i.e. they cannot be split by a commit).
|
||||
In addition, adds are guaranteed to happen on the same segment. (@elbow-jason)
|
||||
- Removed `INT_STORED` and `INT_INDEXED`. It is now possible to use `STORED` and `INDEXED`
|
||||
for int fields. (@fulmicoton)
|
||||
- Added DateTime field (@barrotsteindev)
|
||||
- Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
|
||||
- SIMD linear search within blocks (@fulmicoton)
|
||||
|
||||
## How to update ?
|
||||
|
||||
tantivy 0.9 brought some API breaking change.
|
||||
To update from tantivy 0.8, you will need to go through the following steps.
|
||||
|
||||
- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::INT_STORED`.
|
||||
- The index now does not hold the pool of searcher anymore. You are required to create an intermediary object called
|
||||
`IndexReader` for this.
|
||||
|
||||
```rust
|
||||
// create the reader. You typically need to create 1 reader for the entire
|
||||
// lifetime of you program.
|
||||
let reader = index.reader()?;
|
||||
|
||||
// Acquire a searcher (previously `index.searcher()`) is now written:
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// With the default setting of the reader, you are not required to
|
||||
// call `index.load_searchers()` anymore.
|
||||
//
|
||||
// The IndexReader will pick up that change automatically, regardless
|
||||
// of whether the update was done in a different process or not.
|
||||
// If this behavior is not wanted, you can create your reader with
|
||||
// the `ReloadPolicy::Manual`, and manually decide when to reload the index
|
||||
// by calling `reader.reload()?`.
|
||||
|
||||
```
|
||||
|
||||
|
||||
Tantivy 0.8.2
|
||||
=====================
|
||||
Fixing build for x86_64 platforms. (#496)
|
||||
No need to update from 0.8.1 if tantivy
|
||||
is building on your platform.
|
||||
|
||||
|
||||
Tantivy 0.8.1
|
||||
=====================
|
||||
|
||||
28
Cargo.toml
28
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.9.0-dev"
|
||||
version = "0.10.0-dev"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -16,14 +16,14 @@ base64 = "0.10.0"
|
||||
byteorder = "1.0"
|
||||
lazy_static = "1"
|
||||
regex = "1.0"
|
||||
fst = {version="0.3", default-features=false}
|
||||
fst-regex = { version="0.2" }
|
||||
tantivy-fst = "0.1"
|
||||
memmap = {version = "0.7", optional=true}
|
||||
lz4 = {version="1.20", optional=true}
|
||||
snap = {version="0.2"}
|
||||
atomicwrites = {version="0.2.2", optional=true}
|
||||
tempfile = "3.0"
|
||||
log = "0.4"
|
||||
combine = "3"
|
||||
combine = ">=3.6.0,<4.0.0"
|
||||
tempdir = "0.3"
|
||||
serde = "1.0"
|
||||
serde_derive = "1.0"
|
||||
@@ -32,17 +32,17 @@ num_cpus = "1.2"
|
||||
fs2={version="0.4", optional=true}
|
||||
itertools = "0.8"
|
||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||
notify = {version="4", optional=true}
|
||||
bit-set = "0.5"
|
||||
uuid = { version = "0.7", features = ["v4", "serde"] }
|
||||
uuid = { version = "0.7.2", features = ["v4", "serde"] }
|
||||
crossbeam = "0.5"
|
||||
futures = "0.1"
|
||||
futures-cpupool = "0.1"
|
||||
owning_ref = "0.4"
|
||||
stable_deref_trait = "1.0.0"
|
||||
rust-stemmers = "1"
|
||||
downcast = { version="0.9" }
|
||||
matches = "0.1"
|
||||
bitpacking = "0.5"
|
||||
rust-stemmers = "1.1"
|
||||
downcast-rs = { version="1.0" }
|
||||
bitpacking = "0.7"
|
||||
census = "0.2"
|
||||
fnv = "1.0.6"
|
||||
owned-read = "0.4"
|
||||
@@ -51,13 +51,16 @@ htmlescape = "0.3.1"
|
||||
fail = "0.2"
|
||||
scoped-pool = "1.0"
|
||||
murmurhash32 = "0.2"
|
||||
chrono = "0.4"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.2"
|
||||
winapi = "0.3"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.6"
|
||||
maplit = "1"
|
||||
matches = "0.1.8"
|
||||
time = "0.1.42"
|
||||
|
||||
[profile.release]
|
||||
opt-level = 3
|
||||
@@ -71,12 +74,11 @@ overflow-checks = true
|
||||
[features]
|
||||
# by default no-fail is disabled. We manually enable it when running test.
|
||||
default = ["mmap", "no_fail"]
|
||||
mmap = ["fst/mmap", "atomicwrites", "fs2"]
|
||||
mmap = ["atomicwrites", "fs2", "memmap", "notify"]
|
||||
lz4-compression = ["lz4"]
|
||||
no_fail = ["fail/no_fail"]
|
||||
unstable = [] # useful for benches.
|
||||
wasm-bindgen = ["uuid/wasm-bindgen"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
|
||||
|
||||
24
README.md
24
README.md
@@ -4,6 +4,7 @@
|
||||
[](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
|
||||
[](https://crates.io/crates/tantivy)
|
||||
[](https://saythanks.io/to/fulmicoton)
|
||||
|
||||

|
||||
@@ -17,6 +18,7 @@
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
|
||||
[](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
|
||||
|
||||
[](https://www.patreon.com/fulmicoton)
|
||||
|
||||
|
||||
**Tantivy** is a **full text search engine library** written in rust.
|
||||
@@ -27,9 +29,18 @@ to build such a search engine.
|
||||
|
||||
Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
|
||||
# Benchmark
|
||||
|
||||
Tantivy is typically faster than Lucene, but the results will depend on
|
||||
the nature of the queries in your workload.
|
||||
|
||||
The following [benchmark](https://tantivy-search.github.io/bench/) break downs
|
||||
performance for different type of queries / collection.
|
||||
|
||||
# Features
|
||||
|
||||
- Full-text search
|
||||
- Configurable tokenizer. (stemming available for 17 latin languages. Third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)
|
||||
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
|
||||
- Tiny startup time (<10ms), perfect for command line tools
|
||||
- BM25 scoring (the same as lucene)
|
||||
@@ -41,6 +52,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
|
||||
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
|
||||
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
|
||||
- `&[u8]` fast fields
|
||||
- Text, i64, u64, dates and hierarchical facet fields
|
||||
- LZ4 compressed document store
|
||||
- Range queries
|
||||
- Faceted search
|
||||
@@ -85,6 +97,14 @@ To check out and run tests, you can simply run :
|
||||
Some tests will not run with just `cargo test` because of `fail-rs`.
|
||||
To run the tests exhaustively, run `./run-tests.sh`.
|
||||
|
||||
# Contribute
|
||||
# How can I support this project ?
|
||||
|
||||
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.
|
||||
There are many ways to support this project.
|
||||
|
||||
- If you use tantivy, tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
|
||||
- Report bugs
|
||||
- Write a blog post
|
||||
- Complete documentation
|
||||
- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
|
||||
- Talk about tantivy around you
|
||||
- Drop a word on on [](https://saythanks.io/to/fulmicoton) or even [](https://www.patreon.com/fulmicoton)
|
||||
|
||||
@@ -20,6 +20,7 @@ use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::ReloadPolicy;
|
||||
use tempdir::TempDir;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
@@ -170,24 +171,33 @@ fn main() -> tantivy::Result<()> {
|
||||
//
|
||||
// ### Searcher
|
||||
//
|
||||
// Let's search our index. Start by reloading
|
||||
// searchers in the index. This should be done
|
||||
// after every `commit()`.
|
||||
index.load_searchers()?;
|
||||
// A reader is required to get search the index.
|
||||
// It acts as a `Searcher` pool that reloads itself,
|
||||
// depending on a `ReloadPolicy`.
|
||||
//
|
||||
// For a search server you will typically create one reader for the entire lifetime of your
|
||||
// program, and acquire a new searcher for every single request.
|
||||
//
|
||||
// In the code below, we rely on the 'ON_COMMIT' policy: the reader
|
||||
// will reload the index automatically after each commit.
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()?;
|
||||
|
||||
// We now need to acquire a searcher.
|
||||
// Some search experience might require more than
|
||||
// one query.
|
||||
//
|
||||
// The searcher ensure that we get to work
|
||||
// with a consistent version of the index.
|
||||
// A searcher points to snapshotted, immutable version of the index.
|
||||
//
|
||||
// Some search experience might require more than
|
||||
// one query. Using the same searcher ensures that all of these queries will run on the
|
||||
// same version of the index.
|
||||
//
|
||||
// Acquiring a `searcher` is very cheap.
|
||||
//
|
||||
// You should acquire a searcher every time you
|
||||
// start processing a request and
|
||||
// You should acquire a searcher every time you start processing a request and
|
||||
// and release it right after your query is finished.
|
||||
let searcher = index.searcher();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// ### Query
|
||||
|
||||
@@ -224,7 +234,6 @@ fn main() -> tantivy::Result<()> {
|
||||
// Since the body field was not configured as stored,
|
||||
// the document returned will only contain
|
||||
// a title.
|
||||
|
||||
for (_score, doc_address) in top_docs {
|
||||
let retrieved_doc = searcher.doc(doc_address)?;
|
||||
println!("{}", schema.to_json(&retrieved_doc));
|
||||
|
||||
@@ -17,9 +17,9 @@ use tantivy::collector::{Collector, SegmentCollector};
|
||||
use tantivy::fastfield::FastFieldReader;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::{Schema, FAST, INT_INDEXED, TEXT};
|
||||
use tantivy::Index;
|
||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
||||
use tantivy::SegmentReader;
|
||||
use tantivy::{Index, TantivyError};
|
||||
|
||||
#[derive(Default)]
|
||||
struct Stats {
|
||||
@@ -75,9 +75,18 @@ impl Collector for StatsCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: u32,
|
||||
segment: &SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> tantivy::Result<StatsSegmentCollector> {
|
||||
let fast_field_reader = segment.fast_field_reader(self.field)?;
|
||||
let fast_field_reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(self.field)
|
||||
.ok_or_else(|| {
|
||||
let field_name = segment_reader.schema().get_field_name(self.field);
|
||||
TantivyError::SchemaError(format!(
|
||||
"Field {:?} is not a u64 fast field.",
|
||||
field_name
|
||||
))
|
||||
})?;
|
||||
Ok(StatsSegmentCollector {
|
||||
fast_field_reader,
|
||||
stats: Stats::default(),
|
||||
@@ -137,7 +146,7 @@ fn main() -> tantivy::Result<()> {
|
||||
// products, and with a name, a description, and a price.
|
||||
let product_name = schema_builder.add_text_field("name", TEXT);
|
||||
let product_description = schema_builder.add_text_field("description", TEXT);
|
||||
let price = schema_builder.add_u64_field("price", INT_INDEXED | FAST);
|
||||
let price = schema_builder.add_u64_field("price", INDEXED | FAST);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
@@ -170,9 +179,9 @@ fn main() -> tantivy::Result<()> {
|
||||
price => 5_200u64
|
||||
));
|
||||
index_writer.commit()?;
|
||||
index.load_searchers()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);
|
||||
|
||||
// here we want to get a hit on the 'ken' in Frankenstein
|
||||
|
||||
@@ -91,9 +91,9 @@ fn main() -> tantivy::Result<()> {
|
||||
increasing confidence in the success of my undertaking."#
|
||||
));
|
||||
index_writer.commit()?;
|
||||
index.load_searchers()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// The query parser can interpret human queries.
|
||||
// Here, if the user does not specify which
|
||||
|
||||
@@ -14,12 +14,16 @@ use tantivy::collector::TopDocs;
|
||||
use tantivy::query::TermQuery;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::IndexReader;
|
||||
|
||||
// A simple helper function to fetch a single document
|
||||
// given its id from our index.
|
||||
// It will be helpful to check our work.
|
||||
fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Option<Document>> {
|
||||
let searcher = index.searcher();
|
||||
fn extract_doc_given_isbn(
|
||||
reader: &IndexReader,
|
||||
isbn_term: &Term,
|
||||
) -> tantivy::Result<Option<Document>> {
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// This is the simplest query you can think of.
|
||||
// It matches all of the documents containing a specific term.
|
||||
@@ -85,12 +89,12 @@ fn main() -> tantivy::Result<()> {
|
||||
isbn => "978-9176370711",
|
||||
));
|
||||
index_writer.commit()?;
|
||||
index.load_searchers()?;
|
||||
let reader = index.reader()?;
|
||||
|
||||
let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");
|
||||
|
||||
// Oops our frankenstein doc seems mispelled
|
||||
let frankenstein_doc_misspelled = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap();
|
||||
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
|
||||
assert_eq!(
|
||||
schema.to_json(&frankenstein_doc_misspelled),
|
||||
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
|
||||
@@ -129,10 +133,10 @@ fn main() -> tantivy::Result<()> {
|
||||
// Everything happened as if the document was updated.
|
||||
index_writer.commit()?;
|
||||
// We reload our searcher to make our change available to clients.
|
||||
index.load_searchers()?;
|
||||
reader.reload()?;
|
||||
|
||||
// No more typo!
|
||||
let frankenstein_new_doc = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap();
|
||||
let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
|
||||
assert_eq!(
|
||||
schema.to_json(&frankenstein_new_doc),
|
||||
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
|
||||
|
||||
@@ -55,9 +55,9 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
index_writer.commit()?;
|
||||
|
||||
index.load_searchers()?;
|
||||
let reader = index.reader()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field(tags);
|
||||
facet_collector.add_facet("/pools");
|
||||
|
||||
43
examples/integer_range_search.rs
Normal file
43
examples/integer_range_search.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
// # Searching a range on an indexed int field.
|
||||
//
|
||||
// Below is an example of creating an indexed integer field in your schema
|
||||
// You can use RangeQuery to get a Count of all occurrences in a given range.
|
||||
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::Count;
|
||||
use tantivy::query::RangeQuery;
|
||||
use tantivy::schema::{Schema, INDEXED};
|
||||
use tantivy::Index;
|
||||
use tantivy::Result;
|
||||
|
||||
fn run() -> Result<()> {
|
||||
// For the sake of simplicity, this schema will only have 1 field
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
// `INDEXED` is a short-hand to indicate that our field should be "searchable".
|
||||
let year_field = schema_builder.add_u64_field("year", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let reader = index.reader()?;
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
|
||||
for year in 1950u64..2019u64 {
|
||||
index_writer.add_document(doc!(year_field => year));
|
||||
}
|
||||
index_writer.commit()?;
|
||||
// The index will be a range of years
|
||||
}
|
||||
reader.reload()?;
|
||||
let searcher = reader.searcher();
|
||||
// The end is excluded i.e. here we are searching up to 1969
|
||||
let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
|
||||
// Uses a Count collector to sum the total number of docs in the range
|
||||
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||
assert_eq!(num_60s_books, 10);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn main() {
|
||||
run().unwrap()
|
||||
}
|
||||
@@ -33,9 +33,9 @@ fn main() -> tantivy::Result<()> {
|
||||
index_writer.add_document(doc!(title => "The modern Promotheus"));
|
||||
index_writer.commit()?;
|
||||
|
||||
index.load_searchers()?;
|
||||
let reader = index.reader()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// A tantivy index is actually a collection of segments.
|
||||
// Similarly, a searcher just wraps a list `segment_reader`.
|
||||
|
||||
107
examples/multiple_producer.rs
Normal file
107
examples/multiple_producer.rs
Normal file
@@ -0,0 +1,107 @@
|
||||
// # Indexing from different threads.
|
||||
//
|
||||
// It is fairly common to have to index from different threads.
|
||||
// Tantivy forbids to create more than one `IndexWriter` at a time.
|
||||
//
|
||||
// This `IndexWriter` itself has its own multithreaded layer, so managing your own
|
||||
// indexing threads will not help. However, it can still be useful for some applications.
|
||||
//
|
||||
// For instance, if preparing documents to send to tantivy before indexing is the bottleneck of
|
||||
// your application, it is reasonable to have multiple threads.
|
||||
//
|
||||
// Another very common reason to want to index from multiple threads, is implementing a webserver
|
||||
// with CRUD capabilities. The server framework will most likely handle request from
|
||||
// different threads.
|
||||
//
|
||||
// The recommended way to address both of these use case is to wrap your `IndexWriter` into a
|
||||
// `Arc<RwLock<IndexWriter>>`.
|
||||
//
|
||||
// While this is counterintuitive, adding and deleting documents do not require mutability
|
||||
// over the `IndexWriter`, so several threads will be able to do this operation concurrently.
|
||||
//
|
||||
// The example below does not represent an actual real-life use case (who would spawn thread to
|
||||
// index a single document?), but aims at demonstrating the mechanism that makes indexing
|
||||
// from several threads possible.
|
||||
|
||||
extern crate tempdir;
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tantivy::schema::{Schema, STORED, TEXT};
|
||||
use tantivy::Opstamp;
|
||||
use tantivy::{Index, IndexWriter};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||
let body = schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let index_writer: Arc<RwLock<IndexWriter>> = Arc::new(RwLock::new(index.writer(50_000_000)?));
|
||||
|
||||
// # First indexing thread.
|
||||
let index_writer_clone_1 = index_writer.clone();
|
||||
thread::spawn(move || {
|
||||
// we index 100 times the document... for the sake of the example.
|
||||
for i in 0..100 {
|
||||
let opstamp = {
|
||||
// A read lock is sufficient here.
|
||||
let index_writer_rlock = index_writer_clone_1.read().unwrap();
|
||||
index_writer_rlock.add_document(
|
||||
doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
|
||||
over the yellow sands in the sunlight before reaching the narrow pool. On one \
|
||||
side of the river the golden foothill slopes curve up to the strong and rocky \
|
||||
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
))
|
||||
};
|
||||
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(20));
|
||||
}
|
||||
});
|
||||
|
||||
// # Second indexing thread.
|
||||
let index_writer_clone_2 = index_writer.clone();
|
||||
// For convenience, tantivy also comes with a macro to
|
||||
// reduce the boilerplate above.
|
||||
thread::spawn(move || {
|
||||
// we index 100 times the document... for the sake of the example.
|
||||
for i in 0..100 {
|
||||
// A read lock is sufficient here.
|
||||
let opstamp = {
|
||||
let index_writer_rlock = index_writer_clone_2.read().unwrap();
|
||||
index_writer_rlock.add_document(doc!(
|
||||
title => "Manufacturing consent",
|
||||
body => "Some great book description..."
|
||||
))
|
||||
};
|
||||
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(10));
|
||||
}
|
||||
});
|
||||
|
||||
// # In the main thread, we commit 10 times, once every 500ms.
|
||||
for _ in 0..10 {
|
||||
let opstamp: Opstamp = {
|
||||
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
|
||||
let mut index_writer_wlock = index_writer.write().unwrap();
|
||||
index_writer_wlock.commit().unwrap()
|
||||
};
|
||||
println!("committed with opstamp {}", opstamp);
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -48,9 +48,8 @@ fn main() -> tantivy::Result<()> {
|
||||
// ...
|
||||
index_writer.commit()?;
|
||||
|
||||
index.load_searchers()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
let query = query_parser.parse_query("sycamore spring")?;
|
||||
|
||||
|
||||
@@ -96,9 +96,9 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
index_writer.commit()?;
|
||||
|
||||
index.load_searchers()?;
|
||||
let reader = index.reader()?;
|
||||
|
||||
let searcher = index.searcher();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("title", TEXT | STORED);
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
schema_builder.add_u64_field("year", INT_INDEXED);
|
||||
schema_builder.add_u64_field("year", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// Let's assume we have a json-serialized document.
|
||||
|
||||
@@ -40,8 +40,8 @@ use SegmentReader;
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
|
||||
@@ -17,6 +17,7 @@ use Result;
|
||||
use Score;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use TantivyError;
|
||||
|
||||
struct Hit<'a> {
|
||||
count: u64,
|
||||
@@ -122,17 +123,16 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// facet => Facet::from("/lang/en"),
|
||||
/// facet => Facet::from("/category/biography")
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// index_writer.commit()?;
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/lang");
|
||||
/// facet_collector.add_facet("/category");
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
||||
@@ -147,7 +147,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/category/fiction");
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
||||
@@ -163,7 +163,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
/// {
|
||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||
/// facet_collector.add_facet("/category/fiction");
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
|
||||
///
|
||||
/// // This lists all of the facet counts
|
||||
/// let facets: Vec<(&Facet, u64)> = facet_counts.top_k("/category/fiction", 1);
|
||||
@@ -265,7 +265,10 @@ impl Collector for FacetCollector {
|
||||
_: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<FacetSegmentCollector> {
|
||||
let facet_reader = reader.facet_reader(self.field)?;
|
||||
let field_name = reader.schema().get_field_name(self.field);
|
||||
let facet_reader = reader.facet_reader(self.field).ok_or_else(|| {
|
||||
TantivyError::SchemaError(format!("Field {:?} is not a facet field.", field_name))
|
||||
})?;
|
||||
|
||||
let mut collapse_mapping = Vec::new();
|
||||
let mut counts = Vec::new();
|
||||
@@ -483,8 +486,8 @@ mod tests {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet(Facet::from("/top1"));
|
||||
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
||||
@@ -532,8 +535,8 @@ mod tests {
|
||||
facet_field => Facet::from_text(&"/subjects/B/b"),
|
||||
));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 1);
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet("/subjects");
|
||||
@@ -579,9 +582,7 @@ mod tests {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet("/facet");
|
||||
@@ -635,8 +636,7 @@ mod bench {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
b.iter(|| {
|
||||
let searcher = index.searcher();
|
||||
let facet_collector = FacetCollector::for_field(facet_field);
|
||||
|
||||
@@ -101,8 +101,7 @@ mod tests {
|
||||
assert_eq!(index_writer.commit().unwrap(), 10u64);
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().searcher();
|
||||
let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
|
||||
let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);
|
||||
|
||||
|
||||
@@ -53,9 +53,9 @@ use tantivy::collector::{Count, TopDocs};
|
||||
# index_writer.add_document(doc!(
|
||||
# title => "The Diary of Muadib",
|
||||
# ));
|
||||
# index_writer.commit().unwrap();
|
||||
# index.load_searchers()?;
|
||||
# let searcher = index.searcher();
|
||||
# index_writer.commit()?;
|
||||
# let reader = index.reader()?;
|
||||
# let searcher = reader.searcher();
|
||||
# let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
# let query = query_parser.parse_query("diary")?;
|
||||
let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
|
||||
@@ -85,7 +85,7 @@ See the `custom_collector` example.
|
||||
|
||||
*/
|
||||
|
||||
use downcast;
|
||||
use downcast_rs;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
@@ -111,9 +111,9 @@ pub use self::facet_collector::FacetCollector;
|
||||
|
||||
/// `Fruit` is the type for the result of our collection.
|
||||
/// e.g. `usize` for the `Count` collector.
|
||||
pub trait Fruit: Send + downcast::Any {}
|
||||
pub trait Fruit: Send + downcast_rs::Downcast {}
|
||||
|
||||
impl<T> Fruit for T where T: Send + downcast::Any {}
|
||||
impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
|
||||
|
||||
/// Collectors are in charge of collecting and retaining relevant
|
||||
/// information from the document found and scored by the query.
|
||||
@@ -358,10 +358,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(missing_docs)]
|
||||
mod downcast_impl {
|
||||
downcast!(super::Fruit);
|
||||
}
|
||||
impl_downcast!(Fruit);
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests;
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use super::Collector;
|
||||
use super::SegmentCollector;
|
||||
use collector::Fruit;
|
||||
use downcast::Downcast;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::Deref;
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
@@ -37,11 +37,11 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
||||
let typed_fruit: Vec<TCollector::Fruit> = children
|
||||
.into_iter()
|
||||
.map(|untyped_fruit| {
|
||||
Downcast::<TCollector::Fruit>::downcast(untyped_fruit)
|
||||
untyped_fruit
|
||||
.downcast::<TCollector::Fruit>()
|
||||
.map(|boxed_but_typed| *boxed_but_typed)
|
||||
.map_err(|e| {
|
||||
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
|
||||
TantivyError::InvalidArgument(err_msg)
|
||||
.map_err(|_| {
|
||||
TantivyError::InvalidArgument("Failed to cast child fruit.".to_string())
|
||||
})
|
||||
})
|
||||
.collect::<Result<_>>()?;
|
||||
@@ -89,7 +89,10 @@ pub struct FruitHandle<TFruit: Fruit> {
|
||||
impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
|
||||
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
|
||||
*Downcast::<TFruit>::downcast(boxed_fruit).expect("Failed")
|
||||
*boxed_fruit
|
||||
.downcast::<TFruit>()
|
||||
.map_err(|_| ())
|
||||
.expect("Failed to downcast collector fruit.")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -132,8 +135,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// let mut collectors = MultiCollector::new();
|
||||
/// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
|
||||
@@ -197,7 +200,10 @@ impl<'a> Collector for MultiCollector<'a> {
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
self.collector_wrappers.iter().any(|c| c.requires_scoring())
|
||||
self.collector_wrappers
|
||||
.iter()
|
||||
.map(Deref::deref)
|
||||
.any(Collector::requires_scoring)
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
|
||||
@@ -276,8 +282,7 @@ mod tests {
|
||||
index_writer.add_document(doc!(text=>"abc"));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let term = Term::from_field_text(text, "abc");
|
||||
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
|
||||
|
||||
@@ -114,11 +114,15 @@ impl Collector for FastFieldTestCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_: SegmentLocalId,
|
||||
reader: &SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Result<FastFieldSegmentCollector> {
|
||||
let reader = segment_reader
|
||||
.fast_fields()
|
||||
.u64(self.field)
|
||||
.expect("Requested field is not a fast field.");
|
||||
Ok(FastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader: reader.fast_field_reader(self.field)?,
|
||||
reader,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -170,11 +174,14 @@ impl Collector for BytesFastFieldTestCollector {
|
||||
fn for_segment(
|
||||
&self,
|
||||
_segment_local_id: u32,
|
||||
segment: &SegmentReader,
|
||||
segment_reader: &SegmentReader,
|
||||
) -> Result<BytesFastFieldSegmentCollector> {
|
||||
Ok(BytesFastFieldSegmentCollector {
|
||||
vals: Vec::new(),
|
||||
reader: segment.bytes_fast_field_reader(self.field)?,
|
||||
reader: segment_reader
|
||||
.fast_fields()
|
||||
.bytes(self.field)
|
||||
.expect("Field is not a bytes fast field."),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -191,7 +198,7 @@ impl SegmentCollector for BytesFastFieldSegmentCollector {
|
||||
type Fruit = Vec<u8>;
|
||||
|
||||
fn collect(&mut self, doc: u32, _score: f32) {
|
||||
let data = self.reader.get_val(doc);
|
||||
let data = self.reader.get_bytes(doc);
|
||||
self.vals.extend(data);
|
||||
}
|
||||
|
||||
|
||||
@@ -98,11 +98,11 @@ where
|
||||
.collect())
|
||||
}
|
||||
|
||||
pub(crate) fn for_segment(
|
||||
pub(crate) fn for_segment<F: PartialOrd>(
|
||||
&self,
|
||||
segment_id: SegmentLocalId,
|
||||
_: &SegmentReader,
|
||||
) -> Result<TopSegmentCollector<T>> {
|
||||
) -> Result<TopSegmentCollector<F>> {
|
||||
Ok(TopSegmentCollector::new(segment_id, self.limit))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,10 +5,12 @@ use collector::SegmentCollector;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::FastValue;
|
||||
use schema::Field;
|
||||
use std::marker::PhantomData;
|
||||
use DocAddress;
|
||||
use Result;
|
||||
use SegmentLocalId;
|
||||
use SegmentReader;
|
||||
use TantivyError;
|
||||
|
||||
/// The Top Field Collector keeps track of the K documents
|
||||
/// sorted by a fast field in the index
|
||||
@@ -23,15 +25,16 @@ use SegmentReader;
|
||||
/// # use tantivy::schema::{Schema, Field, FAST, TEXT};
|
||||
/// # use tantivy::{Index, Result, DocAddress};
|
||||
/// # use tantivy::query::{Query, QueryParser};
|
||||
/// use tantivy::Searcher;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
///
|
||||
/// # fn main() {
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
/// # let title = schema_builder.add_text_field("title", TEXT);
|
||||
/// # let rating = schema_builder.add_u64_field("rating", FAST);
|
||||
/// # let schema = schema_builder.build();
|
||||
/// # let index = Index::create_in_ram(schema);
|
||||
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||
/// # index_writer.add_document(doc!(
|
||||
/// # title => "The Name of the Wind",
|
||||
/// # rating => 92u64,
|
||||
@@ -39,13 +42,14 @@ use SegmentReader;
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
|
||||
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
|
||||
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
|
||||
/// # index_writer.commit().unwrap();
|
||||
/// # index.load_searchers().unwrap();
|
||||
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary").unwrap();
|
||||
/// # let top_docs = docs_sorted_by_rating(&index, &query, rating).unwrap();
|
||||
/// # index_writer.commit()?;
|
||||
/// # let reader = index.reader()?;
|
||||
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
|
||||
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
|
||||
/// # assert_eq!(top_docs,
|
||||
/// # vec![(97u64, DocAddress(0u32, 1)),
|
||||
/// # (80u64, DocAddress(0u32, 3))]);
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// #
|
||||
/// /// Searches the document matching the given query, and
|
||||
@@ -53,7 +57,9 @@ use SegmentReader;
|
||||
/// /// given in argument.
|
||||
/// ///
|
||||
/// /// `field` is required to be a FAST field.
|
||||
/// fn docs_sorted_by_rating(index: &Index, query: &Query, sort_by_field: Field)
|
||||
/// fn docs_sorted_by_rating(searcher: &Searcher,
|
||||
/// query: &Query,
|
||||
/// sort_by_field: Field)
|
||||
/// -> Result<Vec<(u64, DocAddress)>> {
|
||||
///
|
||||
/// // This is where we build our collector!
|
||||
@@ -61,8 +67,7 @@ use SegmentReader;
|
||||
///
|
||||
/// // ... and here is our documents. Not this is a simple vec.
|
||||
/// // The `u64` in the pair is the value of our fast field for each documents.
|
||||
/// index.searcher()
|
||||
/// .search(query, &top_docs_by_rating)
|
||||
/// searcher.search(query, &top_docs_by_rating)
|
||||
/// }
|
||||
/// ```
|
||||
pub struct TopDocsByField<T> {
|
||||
@@ -76,6 +81,12 @@ impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
|
||||
/// The given field name must be a fast field, otherwise the collector have an error while
|
||||
/// collecting results.
|
||||
///
|
||||
/// This constructor is crate-private. Client are supposed to call
|
||||
/// build `TopDocsByField` object using the `TopDocs` API.
|
||||
///
|
||||
/// e.g.:
|
||||
/// `TopDocs::with_limit(2).order_by_field(sort_by_field)`
|
||||
///
|
||||
/// # Panics
|
||||
/// The method panics if limit is 0
|
||||
pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
|
||||
@@ -97,8 +108,15 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByF
|
||||
reader: &SegmentReader,
|
||||
) -> Result<TopFieldSegmentCollector<T>> {
|
||||
let collector = self.collector.for_segment(segment_local_id, reader)?;
|
||||
let reader = reader.fast_field_reader(self.field)?;
|
||||
Ok(TopFieldSegmentCollector { collector, reader })
|
||||
let reader = reader.fast_fields().u64(self.field).ok_or_else(|| {
|
||||
let field_name = reader.schema().get_field_name(self.field);
|
||||
TantivyError::SchemaError(format!("Failed to find fast field reader {:?}", field_name))
|
||||
})?;
|
||||
Ok(TopFieldSegmentCollector {
|
||||
collector,
|
||||
reader,
|
||||
_type: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
@@ -113,9 +131,10 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByF
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TopFieldSegmentCollector<T: FastValue + PartialOrd> {
|
||||
collector: TopSegmentCollector<T>,
|
||||
reader: FastFieldReader<T>,
|
||||
pub struct TopFieldSegmentCollector<T> {
|
||||
collector: TopSegmentCollector<u64>,
|
||||
reader: FastFieldReader<u64>,
|
||||
_type: PhantomData<T>,
|
||||
}
|
||||
|
||||
impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
|
||||
@@ -129,7 +148,11 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
|
||||
}
|
||||
|
||||
fn harvest(self) -> Vec<(T, DocAddress)> {
|
||||
self.collector.harvest()
|
||||
self.collector
|
||||
.harvest()
|
||||
.into_iter()
|
||||
.map(|(val, doc_address)| (T::from_u64(val), doc_address))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -171,7 +194,7 @@ mod tests {
|
||||
size => 16u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
|
||||
let top_collector = TopDocs::with_limit(4).order_by_field(size);
|
||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
|
||||
@@ -198,7 +221,7 @@ mod tests {
|
||||
size => 12u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
top_collector
|
||||
@@ -218,7 +241,7 @@ mod tests {
|
||||
size => 12u64,
|
||||
));
|
||||
});
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment = searcher.segment_reader(0);
|
||||
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
|
||||
assert_matches!(
|
||||
@@ -226,7 +249,7 @@ mod tests {
|
||||
.for_segment(0, segment)
|
||||
.map(|_| ())
|
||||
.unwrap_err(),
|
||||
TantivyError::FastFieldError(_)
|
||||
TantivyError::SchemaError(_)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -241,8 +264,6 @@ mod tests {
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
doc_adder(&mut index_writer);
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![query_field]);
|
||||
let query = query_parser.parse_query(query).unwrap();
|
||||
(index, query)
|
||||
|
||||
@@ -51,8 +51,8 @@ use SegmentReader;
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query = query_parser.parse_query("diary")?;
|
||||
@@ -148,7 +148,6 @@ mod tests {
|
||||
index_writer.add_document(doc!(text_field=>"I like Droopy"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
@@ -159,6 +158,8 @@ mod tests {
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let score_docs: Vec<(Score, DocAddress)> = index
|
||||
.reader()
|
||||
.unwrap()
|
||||
.searcher()
|
||||
.search(&text_query, &TopDocs::with_limit(4))
|
||||
.unwrap();
|
||||
@@ -179,6 +180,8 @@ mod tests {
|
||||
let query_parser = QueryParser::for_index(&index, vec![field]);
|
||||
let text_query = query_parser.parse_query("droopy tax").unwrap();
|
||||
let score_docs: Vec<(Score, DocAddress)> = index
|
||||
.reader()
|
||||
.unwrap()
|
||||
.searcher()
|
||||
.search(&text_query, &TopDocs::with_limit(2))
|
||||
.unwrap();
|
||||
|
||||
@@ -64,7 +64,7 @@ pub struct BitUnpacker<Data>
|
||||
where
|
||||
Data: Deref<Target = [u8]>,
|
||||
{
|
||||
num_bits: usize,
|
||||
num_bits: u64,
|
||||
mask: u64,
|
||||
data: Data,
|
||||
}
|
||||
@@ -80,13 +80,13 @@ where
|
||||
(1u64 << num_bits) - 1u64
|
||||
};
|
||||
BitUnpacker {
|
||||
num_bits: num_bits as usize,
|
||||
num_bits: u64::from(num_bits),
|
||||
mask,
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, idx: usize) -> u64 {
|
||||
pub fn get(&self, idx: u64) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0u64;
|
||||
}
|
||||
@@ -97,10 +97,10 @@ where
|
||||
let addr = addr_in_bits >> 3;
|
||||
let bit_shift = addr_in_bits & 7;
|
||||
debug_assert!(
|
||||
addr + 8 <= data.len(),
|
||||
addr + 8 <= data.len() as u64,
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
|
||||
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
}
|
||||
@@ -129,7 +129,7 @@ mod test {
|
||||
fn test_bitpacker_util(len: usize, num_bits: u8) {
|
||||
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
|
||||
for (i, val) in vals.iter().enumerate() {
|
||||
assert_eq!(bitunpacker.get(i), *val);
|
||||
assert_eq!(bitunpacker.get(i as u64), *val);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ impl BinarySerializable for FileAddr {
|
||||
/// A `CompositeWrite` is used to write a `CompositeFile`.
|
||||
pub struct CompositeWrite<W = WritePtr> {
|
||||
write: CountingWriter<W>,
|
||||
offsets: HashMap<FileAddr, usize>,
|
||||
offsets: HashMap<FileAddr, u64>,
|
||||
}
|
||||
|
||||
impl<W: Write> CompositeWrite<W> {
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::io::Write;
|
||||
|
||||
pub struct CountingWriter<W> {
|
||||
underlying: W,
|
||||
written_bytes: usize,
|
||||
written_bytes: u64,
|
||||
}
|
||||
|
||||
impl<W: Write> CountingWriter<W> {
|
||||
@@ -14,11 +14,11 @@ impl<W: Write> CountingWriter<W> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn written_bytes(&self) -> usize {
|
||||
pub fn written_bytes(&self) -> u64 {
|
||||
self.written_bytes
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> io::Result<(W, usize)> {
|
||||
pub fn finish(mut self) -> io::Result<(W, u64)> {
|
||||
self.flush()?;
|
||||
Ok((self.underlying, self.written_bytes))
|
||||
}
|
||||
@@ -27,10 +27,16 @@ impl<W: Write> CountingWriter<W> {
|
||||
impl<W: Write> Write for CountingWriter<W> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
let written_size = self.underlying.write(buf)?;
|
||||
self.written_bytes += written_size;
|
||||
self.written_bytes += written_size as u64;
|
||||
Ok(written_size)
|
||||
}
|
||||
|
||||
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
||||
self.underlying.write_all(buf)?;
|
||||
self.written_bytes += buf.len() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.underlying.flush()
|
||||
}
|
||||
@@ -48,8 +54,8 @@ mod test {
|
||||
let mut counting_writer = CountingWriter::wrap(buffer);
|
||||
let bytes = (0u8..10u8).collect::<Vec<u8>>();
|
||||
counting_writer.write_all(&bytes).unwrap();
|
||||
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
|
||||
assert_eq!(len, 10);
|
||||
let (w, len): (Vec<u8>, u64) = counting_writer.finish().unwrap();
|
||||
assert_eq!(len, 10u64);
|
||||
assert_eq!(w.len(), 10);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,10 @@ pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
use std::io;
|
||||
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
||||
///
|
||||
/// We do not allow segments with more than
|
||||
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
|
||||
|
||||
/// Computes the number of bits that will be used for bitpacking.
|
||||
///
|
||||
@@ -52,11 +55,6 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
|
||||
(n > 0) && (n & (n - 1) == 0)
|
||||
}
|
||||
|
||||
/// Create a default io error given a string.
|
||||
pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
}
|
||||
|
||||
/// Has length trait
|
||||
pub trait HasLen {
|
||||
/// Return length
|
||||
@@ -134,4 +132,11 @@ pub(crate) mod test {
|
||||
assert_eq!(compute_num_bits(256), 9u8);
|
||||
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_max_doc() {
|
||||
// this is the first time I write a unit test for a constant.
|
||||
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
|
||||
assert!((super::MAX_DOC_LIMIT as i32) < 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,15 +123,14 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_map_multithread() {
|
||||
let result: Vec<usize> = Executor::multi_thread(3, "search-test")
|
||||
.map(|i| Ok(i * 2), 0..10)
|
||||
.unwrap();
|
||||
assert_eq!(result.len(), 10);
|
||||
for i in 0..10 {
|
||||
assert_eq!(result[i], i * 2);
|
||||
#[test]
|
||||
fn test_map_multithread() {
|
||||
let result: Vec<usize> = Executor::multi_thread(3, "search-test")
|
||||
.map(|i| Ok(i * 2), 0..10)
|
||||
.unwrap();
|
||||
assert_eq!(result.len(), 10);
|
||||
for i in 0..10 {
|
||||
assert_eq!(result[i], i * 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,19 +1,14 @@
|
||||
use super::pool::LeasedItem;
|
||||
use super::pool::Pool;
|
||||
use super::segment::create_segment;
|
||||
use super::segment::Segment;
|
||||
use core::searcher::Searcher;
|
||||
use core::Executor;
|
||||
use core::IndexMeta;
|
||||
use core::SegmentId;
|
||||
use core::SegmentMeta;
|
||||
use core::SegmentReader;
|
||||
use core::META_FILEPATH;
|
||||
use directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use directory::INDEX_WRITER_LOCK;
|
||||
use directory::META_LOCK;
|
||||
use directory::{Directory, RAMDirectory};
|
||||
use error::DataCorruption;
|
||||
use error::TantivyError;
|
||||
@@ -21,14 +16,16 @@ use indexer::index_writer::open_index_writer;
|
||||
use indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use indexer::segment_updater::save_new_metas;
|
||||
use num_cpus;
|
||||
use reader::IndexReader;
|
||||
use reader::IndexReaderBuilder;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
#[cfg(feature = "mmap")]
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::TokenizerManager;
|
||||
@@ -49,11 +46,10 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
|
||||
}
|
||||
|
||||
/// Search Index
|
||||
#[derive(Clone)]
|
||||
pub struct Index {
|
||||
directory: ManagedDirectory,
|
||||
schema: Schema,
|
||||
num_searchers: Arc<AtomicUsize>,
|
||||
searcher_pool: Arc<Pool<Searcher>>,
|
||||
executor: Arc<Executor>,
|
||||
tokenizers: TokenizerManager,
|
||||
}
|
||||
@@ -111,7 +107,6 @@ impl Index {
|
||||
}
|
||||
|
||||
/// Opens or creates a new index in the provided directory
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
|
||||
if Index::exists(&dir) {
|
||||
let index = Index::open(dir)?;
|
||||
@@ -159,16 +154,12 @@ impl Index {
|
||||
/// Creates a new index given a directory and an `IndexMeta`.
|
||||
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
|
||||
let schema = metas.schema.clone();
|
||||
let n_cpus = num_cpus::get();
|
||||
let index = Index {
|
||||
directory,
|
||||
schema,
|
||||
num_searchers: Arc::new(AtomicUsize::new(n_cpus)),
|
||||
searcher_pool: Arc::new(Pool::new()),
|
||||
tokenizers: TokenizerManager::default(),
|
||||
executor: Arc::new(Executor::single_thread()),
|
||||
};
|
||||
index.load_searchers()?;
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
@@ -198,6 +189,22 @@ impl Index {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a default `IndexReader` for the given index.
|
||||
///
|
||||
/// See [`Index.reader_builder()`](#method.reader_builder).
|
||||
pub fn reader(&self) -> Result<IndexReader> {
|
||||
self.reader_builder().try_into()
|
||||
}
|
||||
|
||||
/// Create a `IndexReader` for the given index.
|
||||
///
|
||||
/// Most project should create at most one reader for a given index.
|
||||
/// This method is typically called only once per `Index` instance,
|
||||
/// over the lifetime of most problem.
|
||||
pub fn reader_builder(&self) -> IndexReaderBuilder {
|
||||
IndexReaderBuilder::new(self.clone())
|
||||
}
|
||||
|
||||
/// Opens a new directory from an index path.
|
||||
#[cfg(feature = "mmap")]
|
||||
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
|
||||
@@ -333,56 +340,9 @@ impl Index {
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.map(SegmentMeta::id)
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Sets the number of searchers to use
|
||||
///
|
||||
/// Only works after the next call to `load_searchers`
|
||||
pub fn set_num_searchers(&mut self, num_searchers: usize) {
|
||||
self.num_searchers.store(num_searchers, Ordering::Release);
|
||||
}
|
||||
|
||||
/// Update searchers so that they reflect the state of the last
|
||||
/// `.commit()`.
|
||||
///
|
||||
/// If indexing happens in the same process as searching,
|
||||
/// you most likely want to call `.load_searchers()` right after each
|
||||
/// successful call to `.commit()`.
|
||||
///
|
||||
/// If indexing and searching happen in different processes, the way to
|
||||
/// get the freshest `index` at all time, is to watch `meta.json` and
|
||||
/// call `load_searchers` whenever a changes happen.
|
||||
pub fn load_searchers(&self) -> Result<()> {
|
||||
let _meta_lock = self.directory().acquire_lock(&META_LOCK)?;
|
||||
let searchable_segments = self.searchable_segments()?;
|
||||
let segment_readers: Vec<SegmentReader> = searchable_segments
|
||||
.iter()
|
||||
.map(SegmentReader::open)
|
||||
.collect::<Result<_>>()?;
|
||||
let schema = self.schema();
|
||||
let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
|
||||
let searchers = (0..num_searchers)
|
||||
.map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
|
||||
.collect();
|
||||
self.searcher_pool.publish_new_generation(searchers);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns a searcher
|
||||
///
|
||||
/// This method should be called every single time a search
|
||||
/// query is performed.
|
||||
/// The searchers are taken from a pool of `num_searchers` searchers.
|
||||
/// If no searcher is available
|
||||
/// this may block.
|
||||
///
|
||||
/// The same searcher must be used for a given query, as it ensures
|
||||
/// the use of a consistent segment set.
|
||||
pub fn searcher(&self) -> LeasedItem<Searcher> {
|
||||
self.searcher_pool.acquire()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Index {
|
||||
@@ -391,29 +351,22 @@ impl fmt::Debug for Index {
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for Index {
|
||||
fn clone(&self) -> Index {
|
||||
Index {
|
||||
directory: self.directory.clone(),
|
||||
schema: self.schema.clone(),
|
||||
num_searchers: Arc::clone(&self.num_searchers),
|
||||
searcher_pool: Arc::clone(&self.searcher_pool),
|
||||
tokenizers: self.tokenizers.clone(),
|
||||
executor: self.executor.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use directory::RAMDirectory;
|
||||
use schema::{Schema, INT_INDEXED, TEXT};
|
||||
use schema::Field;
|
||||
use schema::{Schema, INDEXED, TEXT};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use Index;
|
||||
use IndexReader;
|
||||
use IndexWriter;
|
||||
use ReloadPolicy;
|
||||
|
||||
#[test]
|
||||
fn test_indexer_for_field() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
|
||||
let num_likes_field = schema_builder.add_u64_field("num_likes", INDEXED);
|
||||
let body_field = schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -471,7 +424,148 @@ mod tests {
|
||||
|
||||
fn throw_away_schema() -> Schema {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
|
||||
let _ = schema_builder.add_u64_field("num_likes", INDEXED);
|
||||
schema_builder.build()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
mod mmap_specific {
|
||||
|
||||
use super::*;
|
||||
use std::path::PathBuf;
|
||||
use tempdir::TempDir;
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_manual_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
writer.commit().unwrap();
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
writer.add_document(doc!(field=>1u64));
|
||||
writer.commit().unwrap();
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
reader.reload().unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_different_directories() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
|
||||
let reader = read_index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::OnCommit)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
|
||||
}
|
||||
}
|
||||
|
||||
fn test_index_on_commit_reload_policy_aux(
|
||||
field: Field,
|
||||
writer: &mut IndexWriter,
|
||||
reader: &IndexReader,
|
||||
) {
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
writer.add_document(doc!(field=>1u64));
|
||||
writer.commit().unwrap();
|
||||
let mut count = 0;
|
||||
for _ in 0..100 {
|
||||
count = reader.searcher().num_docs();
|
||||
if count > 0 {
|
||||
break;
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
assert_eq!(count, 1);
|
||||
writer.add_document(doc!(field=>2u64));
|
||||
writer.commit().unwrap();
|
||||
let mut count = 0;
|
||||
for _ in 0..10 {
|
||||
count = reader.searcher().num_docs();
|
||||
if count > 1 {
|
||||
break;
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
assert_eq!(count, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn garbage_collect_works_as_intended() {
|
||||
let directory = RAMDirectory::create();
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let index = Index::create(directory.clone(), schema).unwrap();
|
||||
|
||||
let mut writer = index.writer_with_num_threads(8, 24_000_000).unwrap();
|
||||
for i in 0u64..8_000u64 {
|
||||
writer.add_document(doc!(field => i));
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
let mem_right_after_commit = directory.total_mem_usage();
|
||||
thread::sleep(Duration::from_millis(1_000));
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(reader.searcher().num_docs(), 8_000);
|
||||
writer.wait_merging_threads().unwrap();
|
||||
let mem_right_after_merge_finished = directory.total_mem_usage();
|
||||
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 8_000);
|
||||
assert!(mem_right_after_merge_finished < mem_right_after_commit);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ use core::SegmentMeta;
|
||||
use schema::Schema;
|
||||
use serde_json;
|
||||
use std::fmt;
|
||||
use Opstamp;
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
@@ -13,14 +14,27 @@ use std::fmt;
|
||||
///
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct IndexMeta {
|
||||
/// List of `SegmentMeta` informations associated to each finalized segment of the index.
|
||||
pub segments: Vec<SegmentMeta>,
|
||||
/// Index `Schema`
|
||||
pub schema: Schema,
|
||||
pub opstamp: u64,
|
||||
/// Opstamp associated to the last `commit` operation.
|
||||
pub opstamp: Opstamp,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// Payload associated to the last commit.
|
||||
///
|
||||
/// Upon commit, clients can optionally add a small `Striing` payload to their commit
|
||||
/// to help identify this commit.
|
||||
/// This payload is entirely unused by tantivy.
|
||||
pub payload: Option<String>,
|
||||
}
|
||||
|
||||
impl IndexMeta {
|
||||
/// Create an `IndexMeta` object representing a brand new `Index`
|
||||
/// with the given index.
|
||||
///
|
||||
/// This new index does not contains any segments.
|
||||
/// Opstamp will the value `0u64`.
|
||||
pub fn with_schema(schema: Schema) -> IndexMeta {
|
||||
IndexMeta {
|
||||
segments: vec![],
|
||||
|
||||
@@ -2,7 +2,6 @@ mod executor;
|
||||
pub mod index;
|
||||
mod index_meta;
|
||||
mod inverted_index_reader;
|
||||
mod pool;
|
||||
pub mod searcher;
|
||||
mod segment;
|
||||
mod segment_component;
|
||||
@@ -25,6 +24,7 @@ pub use self::segment_reader::SegmentReader;
|
||||
use std::path::PathBuf;
|
||||
|
||||
lazy_static! {
|
||||
|
||||
/// The meta file contains all the information about the list of segments and the schema
|
||||
/// of the index.
|
||||
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");
|
||||
|
||||
@@ -59,7 +59,7 @@ impl Searcher {
|
||||
) -> Searcher {
|
||||
let store_readers = segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.get_store_reader())
|
||||
.map(SegmentReader::get_store_reader)
|
||||
.collect();
|
||||
Searcher {
|
||||
schema,
|
||||
@@ -218,7 +218,7 @@ impl fmt::Debug for Searcher {
|
||||
let segment_ids = self
|
||||
.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.segment_id())
|
||||
.map(SegmentReader::segment_id)
|
||||
.collect::<Vec<_>>();
|
||||
write!(f, "Searcher({:?})", segment_ids)
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ use schema::Schema;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::result;
|
||||
use Opstamp;
|
||||
use Result;
|
||||
|
||||
/// A segment is a piece of the index.
|
||||
@@ -50,7 +51,7 @@ impl Segment {
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> Segment {
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
|
||||
Segment {
|
||||
index: self.index,
|
||||
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),
|
||||
|
||||
@@ -19,7 +19,7 @@ pub struct SegmentId(Uuid);
|
||||
#[cfg(test)]
|
||||
lazy_static! {
|
||||
static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default();
|
||||
static ref EMPTY_ARR: [u8; 8] = [0u8; 8];
|
||||
static ref ZERO_ARRAY: [u8; 8] = [0u8; 8];
|
||||
}
|
||||
|
||||
// During tests, we generate the segment id in a autoincrement manner
|
||||
@@ -30,7 +30,7 @@ lazy_static! {
|
||||
#[cfg(test)]
|
||||
fn create_uuid() -> Uuid {
|
||||
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
|
||||
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*EMPTY_ARR).unwrap()
|
||||
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*ZERO_ARRAY).unwrap()
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
|
||||
@@ -5,6 +5,7 @@ use serde;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use Opstamp;
|
||||
|
||||
lazy_static! {
|
||||
static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() };
|
||||
@@ -13,7 +14,7 @@ lazy_static! {
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
num_deleted_docs: u32,
|
||||
opstamp: u64,
|
||||
opstamp: Opstamp,
|
||||
}
|
||||
|
||||
/// `SegmentMeta` contains simple meta information about a segment.
|
||||
@@ -136,9 +137,9 @@ impl SegmentMeta {
|
||||
self.max_doc() - self.num_deleted_docs()
|
||||
}
|
||||
|
||||
/// Returns the opstamp of the last delete operation
|
||||
/// Returns the `Opstamp` of the last delete operation
|
||||
/// taken in account in this segment.
|
||||
pub fn delete_opstamp(&self) -> Option<u64> {
|
||||
pub fn delete_opstamp(&self) -> Option<Opstamp> {
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
@@ -152,7 +153,7 @@ impl SegmentMeta {
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> SegmentMeta {
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
|
||||
let delete_meta = DeleteMeta {
|
||||
num_deleted_docs,
|
||||
opstamp,
|
||||
|
||||
@@ -5,14 +5,10 @@ use core::Segment;
|
||||
use core::SegmentComponent;
|
||||
use core::SegmentId;
|
||||
use directory::ReadOnlySource;
|
||||
use error::TantivyError;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FacetReader;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::{self, FastFieldNotAvailableError};
|
||||
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
|
||||
use fastfield::FastFieldReaders;
|
||||
use fieldnorm::FieldNormReader;
|
||||
use schema::Cardinality;
|
||||
use schema::Field;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
@@ -51,7 +47,7 @@ pub struct SegmentReader {
|
||||
postings_composite: CompositeFile,
|
||||
positions_composite: CompositeFile,
|
||||
positions_idx_composite: CompositeFile,
|
||||
fast_fields_composite: CompositeFile,
|
||||
fast_fields_readers: Arc<FastFieldReaders>,
|
||||
fieldnorms_composite: CompositeFile,
|
||||
|
||||
store_source: ReadOnlySource,
|
||||
@@ -105,93 +101,21 @@ impl SegmentReader {
|
||||
///
|
||||
/// # Panics
|
||||
/// May panic if the index is corrupted.
|
||||
pub fn fast_field_reader<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
|
||||
{
|
||||
self.fast_fields_composite
|
||||
.open_read(field)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> fastfield::Result<FastFieldReader<Item>> {
|
||||
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
|
||||
Ok(FastFieldReader::open(ff_source))
|
||||
} else {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
|
||||
/// May panick if the field is not a multivalued fastfield of the type `Item`.
|
||||
pub fn multi_fast_field_reader<Item: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
|
||||
{
|
||||
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
|
||||
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
|
||||
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
|
||||
} else {
|
||||
Err(FastFieldNotAvailableError::new(field_entry))
|
||||
}
|
||||
}
|
||||
|
||||
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
|
||||
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
match *field_entry.field_type() {
|
||||
FieldType::Bytes => {}
|
||||
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
|
||||
}
|
||||
let idx_reader = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let values = self
|
||||
.fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
Ok(BytesFastFieldReader::open(idx_reader, values))
|
||||
pub fn fast_fields(&self) -> &FastFieldReaders {
|
||||
&self.fast_fields_readers
|
||||
}
|
||||
|
||||
/// Accessor to the `FacetReader` associated to a given `Field`.
|
||||
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
|
||||
pub fn facet_reader(&self, field: Field) -> Option<FacetReader> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
if field_entry.field_type() != &FieldType::HierarchicalFacet {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"The field {:?} is not a \
|
||||
hierarchical facet.",
|
||||
field_entry
|
||||
)));
|
||||
return None;
|
||||
}
|
||||
let term_ords_reader = self.multi_fast_field_reader(field)?;
|
||||
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"The field \"{}\" is a hierarchical \
|
||||
but this segment does not seem to have the field term \
|
||||
dictionary.",
|
||||
field_entry.name()
|
||||
))
|
||||
})?;
|
||||
let term_ords_reader = self.fast_fields().u64s(field)?;
|
||||
let termdict_source = self.termdict_composite.open_read(field)?;
|
||||
let termdict = TermDictionary::from_source(&termdict_source);
|
||||
let facet_reader = FacetReader::new(term_ords_reader, termdict);
|
||||
Ok(facet_reader)
|
||||
Some(facet_reader)
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `Field norms`'s reader.
|
||||
@@ -247,8 +171,12 @@ impl SegmentReader {
|
||||
}
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_field_readers =
|
||||
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
|
||||
|
||||
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
|
||||
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
|
||||
@@ -260,14 +188,13 @@ impl SegmentReader {
|
||||
None
|
||||
};
|
||||
|
||||
let schema = segment.schema();
|
||||
Ok(SegmentReader {
|
||||
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
max_doc: segment.meta().max_doc(),
|
||||
num_docs: segment.meta().num_docs(),
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_composite,
|
||||
fast_fields_readers: fast_field_readers,
|
||||
fieldnorms_composite,
|
||||
segment_id: segment.id(),
|
||||
store_source,
|
||||
@@ -319,7 +246,7 @@ impl SegmentReader {
|
||||
let termdict_source = self
|
||||
.termdict_composite
|
||||
.open_read(field)
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed");
|
||||
.expect("Failed to open field term dictionary in composite file. Is the field indexed?");
|
||||
|
||||
let positions_source = self
|
||||
.positions_composite
|
||||
@@ -381,12 +308,12 @@ impl SegmentReader {
|
||||
self.postings_composite.space_usage(),
|
||||
self.positions_composite.space_usage(),
|
||||
self.positions_idx_composite.space_usage(),
|
||||
self.fast_fields_composite.space_usage(),
|
||||
self.fast_fields_readers.space_usage(),
|
||||
self.fieldnorms_composite.space_usage(),
|
||||
self.get_store_reader().space_usage(),
|
||||
self.delete_bitset_opt
|
||||
.as_ref()
|
||||
.map(|x| x.space_usage())
|
||||
.map(DeleteBitSet::space_usage)
|
||||
.unwrap_or(0),
|
||||
)
|
||||
}
|
||||
@@ -477,9 +404,7 @@ mod test {
|
||||
// ok, now we should have a deleted doc
|
||||
index_writer2.commit().unwrap();
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect();
|
||||
assert_eq!(vec![0u32, 2u32], docs);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use directory::directory_lock::Lock;
|
||||
use directory::error::LockError;
|
||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use directory::WatchCallback;
|
||||
use directory::WatchHandle;
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
@@ -46,14 +48,14 @@ impl RetryPolicy {
|
||||
///
|
||||
/// It is transparently associated to a lock file, that gets deleted
|
||||
/// on `Drop.` The lock is released automatically on `Drop`.
|
||||
pub struct DirectoryLock(Box<Drop + Send + 'static>);
|
||||
pub struct DirectoryLock(Box<Drop + Send + Sync + 'static>);
|
||||
|
||||
struct DirectoryLockGuard {
|
||||
directory: Box<Directory>,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl<T: Drop + Send + 'static> From<Box<T>> for DirectoryLock {
|
||||
impl<T: Drop + Send + Sync + 'static> From<Box<T>> for DirectoryLock {
|
||||
fn from(underlying: Box<T>) -> Self {
|
||||
DirectoryLock(underlying)
|
||||
}
|
||||
@@ -187,6 +189,22 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Registers a callback that will be called whenever a change on the `meta.json`
|
||||
/// using the `atomic_write` API is detected.
|
||||
///
|
||||
/// The behavior when using `.watch()` on a file using `.open_write(...)` is, on the other
|
||||
/// hand, undefined.
|
||||
///
|
||||
/// The file will be watched for the lifetime of the returned `WatchHandle`. The caller is
|
||||
/// required to keep it.
|
||||
/// It does not override previous callbacks. When the file is modified, all callback that are
|
||||
/// registered (and whose `WatchHandle` is still alive) are triggered.
|
||||
///
|
||||
/// Internally, tantivy only uses this API to detect new commits to implement the
|
||||
/// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
|
||||
/// `OnCommit` `ReloadPolicy` to work properly.
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle;
|
||||
}
|
||||
|
||||
/// DirectoryClone
|
||||
|
||||
@@ -43,7 +43,7 @@ lazy_static! {
|
||||
is_blocking: false
|
||||
};
|
||||
/// The meta lock file is here to protect the segment files being opened by
|
||||
/// `.load_searchers()` from being garbage collected.
|
||||
/// `IndexReader::reload()` from being garbage collected.
|
||||
/// It makes it possible for another process to safely consume
|
||||
/// our index in-writing. Ideally, we may have prefered `RWLock` semantics
|
||||
/// here, but it is difficult to achieve on Windows.
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::path::PathBuf;
|
||||
/// Error while trying to acquire a directory lock.
|
||||
#[derive(Debug, Fail)]
|
||||
pub enum LockError {
|
||||
/// Failed to acquired a lock as it is already hold by another
|
||||
/// Failed to acquired a lock as it is already held by another
|
||||
/// client.
|
||||
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
|
||||
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
|
||||
@@ -73,6 +73,14 @@ pub enum OpenDirectoryError {
|
||||
DoesNotExist(PathBuf),
|
||||
/// The path exists but is not a directory.
|
||||
NotADirectory(PathBuf),
|
||||
/// IoError
|
||||
IoError(io::Error),
|
||||
}
|
||||
|
||||
impl From<io::Error> for OpenDirectoryError {
|
||||
fn from(io_err: io::Error) -> Self {
|
||||
OpenDirectoryError::IoError(io_err)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for OpenDirectoryError {
|
||||
@@ -84,6 +92,11 @@ impl fmt::Display for OpenDirectoryError {
|
||||
OpenDirectoryError::NotADirectory(ref path) => {
|
||||
write!(f, "the path '{:?}' exists but is not a directory", path)
|
||||
}
|
||||
OpenDirectoryError::IoError(ref err) => write!(
|
||||
f,
|
||||
"IOError while trying to open/create the directory. {:?}",
|
||||
err
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ use directory::DirectoryLock;
|
||||
use directory::Lock;
|
||||
use directory::META_LOCK;
|
||||
use directory::{ReadOnlySource, WritePtr};
|
||||
use directory::{WatchCallback, WatchHandle};
|
||||
use error::DataCorruption;
|
||||
use serde_json;
|
||||
use std::collections::HashSet;
|
||||
@@ -241,6 +242,10 @@ impl Directory for ManagedDirectory {
|
||||
fn acquire_lock(&self, lock: &Lock) -> result::Result<DirectoryLock, LockError> {
|
||||
self.directory.acquire_lock(lock)
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
self.directory.watch(watch_callback)
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for ManagedDirectory {
|
||||
@@ -255,95 +260,98 @@ impl Clone for ManagedDirectory {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
#[cfg(feature = "mmap")]
|
||||
use directory::MmapDirectory;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
mod mmap_specific {
|
||||
|
||||
lazy_static! {
|
||||
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
|
||||
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
|
||||
}
|
||||
use super::super::*;
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_managed_directory() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
{
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
lazy_static! {
|
||||
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
|
||||
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
|
||||
}
|
||||
|
||||
use directory::MmapDirectory;
|
||||
use std::io::Write;
|
||||
|
||||
#[test]
|
||||
fn test_managed_directory() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
{
|
||||
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
{
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
}
|
||||
{
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
|
||||
#[test]
|
||||
fn test_managed_directory_gc_while_mmapped() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let living_files = HashSet::new();
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "mmap ")]
|
||||
fn test_managed_directory_gc_while_mmapped() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let living_files = HashSet::new();
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, gc should try and fail the file as it is mmapped.
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
// unmap should happen here.
|
||||
drop(_mmap_read);
|
||||
// The file should still be in the list of managed file and
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
} else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, gc should try and fail the file as it is mmapped.
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
// unmap should happen here.
|
||||
drop(_mmap_read);
|
||||
// The file should still be in the list of managed file and
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
} else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,18 +1,24 @@
|
||||
extern crate fs2;
|
||||
extern crate notify;
|
||||
|
||||
use self::fs2::FileExt;
|
||||
use self::notify::RawEvent;
|
||||
use self::notify::RecursiveMode;
|
||||
use self::notify::Watcher;
|
||||
use atomicwrites;
|
||||
use common::make_io_err;
|
||||
use core::META_FILEPATH;
|
||||
use directory::error::LockError;
|
||||
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||
use directory::shared_vec_slice::SharedVecSlice;
|
||||
use directory::read_only_source::BoxedData;
|
||||
use directory::Directory;
|
||||
use directory::DirectoryLock;
|
||||
use directory::Lock;
|
||||
use directory::ReadOnlySource;
|
||||
use directory::WatchCallback;
|
||||
use directory::WatchCallbackList;
|
||||
use directory::WatchHandle;
|
||||
use directory::WritePtr;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use std::collections::hash_map::Entry as HashMapEntry;
|
||||
use memmap::Mmap;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::From;
|
||||
use std::fmt;
|
||||
@@ -22,14 +28,22 @@ use std::io::{self, Seek, SeekFrom};
|
||||
use std::io::{BufWriter, Read, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::result;
|
||||
use std::sync::mpsc::{channel, Receiver, Sender};
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::Weak;
|
||||
use std::thread;
|
||||
use tempdir::TempDir;
|
||||
|
||||
/// Create a default io error given a string.
|
||||
pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
}
|
||||
|
||||
/// Returns None iff the file exists, can be read, but is empty (and hence
|
||||
/// cannot be mmapped).
|
||||
///
|
||||
fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
|
||||
/// cannot be mmapped)
|
||||
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
|
||||
let file = File::open(full_path).map_err(|e| {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
OpenReadError::FileDoesNotExist(full_path.to_owned())
|
||||
@@ -48,7 +62,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
|
||||
return Ok(None);
|
||||
}
|
||||
unsafe {
|
||||
MmapReadOnly::open(&file)
|
||||
memmap::Mmap::map(&file)
|
||||
.map(Some)
|
||||
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
||||
}
|
||||
@@ -71,7 +85,7 @@ pub struct CacheInfo {
|
||||
|
||||
struct MmapCache {
|
||||
counters: CacheCounters,
|
||||
cache: HashMap<PathBuf, MmapReadOnly>,
|
||||
cache: HashMap<PathBuf, Weak<BoxedData>>,
|
||||
}
|
||||
|
||||
impl Default for MmapCache {
|
||||
@@ -84,12 +98,7 @@ impl Default for MmapCache {
|
||||
}
|
||||
|
||||
impl MmapCache {
|
||||
/// Removes a `MmapReadOnly` entry from the mmap cache.
|
||||
fn discard_from_cache(&mut self, full_path: &Path) -> bool {
|
||||
self.cache.remove(full_path).is_some()
|
||||
}
|
||||
|
||||
fn get_info(&mut self) -> CacheInfo {
|
||||
fn get_info(&self) -> CacheInfo {
|
||||
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
|
||||
CacheInfo {
|
||||
counters: self.counters.clone(),
|
||||
@@ -97,26 +106,108 @@ impl MmapCache {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<MmapReadOnly>, OpenReadError> {
|
||||
Ok(match self.cache.entry(full_path.to_owned()) {
|
||||
HashMapEntry::Occupied(occupied_entry) => {
|
||||
let mmap = occupied_entry.get();
|
||||
fn remove_weak_ref(&mut self) {
|
||||
let keys_to_remove: Vec<PathBuf> = self
|
||||
.cache
|
||||
.iter()
|
||||
.filter(|(_, mmap_weakref)| mmap_weakref.upgrade().is_none())
|
||||
.map(|(key, _)| key.clone())
|
||||
.collect();
|
||||
for key in keys_to_remove {
|
||||
self.cache.remove(&key);
|
||||
}
|
||||
}
|
||||
|
||||
// Returns None if the file exists but as a len of 0 (and hence is not mmappable).
|
||||
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<Arc<BoxedData>>, OpenReadError> {
|
||||
if let Some(mmap_weak) = self.cache.get(full_path) {
|
||||
if let Some(mmap_arc) = mmap_weak.upgrade() {
|
||||
self.counters.hit += 1;
|
||||
Some(mmap.clone())
|
||||
}
|
||||
HashMapEntry::Vacant(vacant_entry) => {
|
||||
self.counters.miss += 1;
|
||||
if let Some(mmap) = open_mmap(full_path)? {
|
||||
vacant_entry.insert(mmap.clone());
|
||||
Some(mmap)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
return Ok(Some(mmap_arc));
|
||||
}
|
||||
}
|
||||
self.cache.remove(full_path);
|
||||
self.counters.miss += 1;
|
||||
Ok(if let Some(mmap) = open_mmap(full_path)? {
|
||||
let mmap_arc: Arc<BoxedData> = Arc::new(Box::new(mmap));
|
||||
let mmap_weak = Arc::downgrade(&mmap_arc);
|
||||
self.cache.insert(full_path.to_owned(), mmap_weak);
|
||||
Some(mmap_arc)
|
||||
} else {
|
||||
None
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
struct InnerWatcherWrapper {
|
||||
_watcher: Mutex<notify::RecommendedWatcher>,
|
||||
watcher_router: WatchCallbackList,
|
||||
}
|
||||
|
||||
impl InnerWatcherWrapper {
|
||||
pub fn new(path: &Path) -> Result<(Self, Receiver<notify::RawEvent>), notify::Error> {
|
||||
let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
|
||||
// We need to initialize the
|
||||
let mut watcher = notify::raw_watcher(tx)?;
|
||||
watcher.watch(path, RecursiveMode::Recursive)?;
|
||||
let inner = InnerWatcherWrapper {
|
||||
_watcher: Mutex::new(watcher),
|
||||
watcher_router: Default::default(),
|
||||
};
|
||||
Ok((inner, watcher_recv))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct WatcherWrapper {
|
||||
inner: Arc<InnerWatcherWrapper>,
|
||||
}
|
||||
|
||||
impl WatcherWrapper {
|
||||
pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
|
||||
let (inner, watcher_recv) = InnerWatcherWrapper::new(path).map_err(|err| match err {
|
||||
notify::Error::PathNotFound => OpenDirectoryError::DoesNotExist(path.to_owned()),
|
||||
_ => {
|
||||
panic!("Unknown error while starting watching directory {:?}", path);
|
||||
}
|
||||
})?;
|
||||
let watcher_wrapper = WatcherWrapper {
|
||||
inner: Arc::new(inner),
|
||||
};
|
||||
let watcher_wrapper_clone = watcher_wrapper.clone();
|
||||
thread::Builder::new()
|
||||
.name("meta-file-watch-thread".to_string())
|
||||
.spawn(move || {
|
||||
loop {
|
||||
match watcher_recv.recv().map(|evt| evt.path) {
|
||||
Ok(Some(changed_path)) => {
|
||||
// ... Actually subject to false positive.
|
||||
// We might want to be more accurate than this at one point.
|
||||
if let Some(filename) = changed_path.file_name() {
|
||||
if filename == *META_FILEPATH {
|
||||
watcher_wrapper_clone.inner.watcher_router.broadcast();
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None) => {
|
||||
// not an event we are interested in.
|
||||
}
|
||||
Err(_e) => {
|
||||
// the watch send channel was dropped
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
.expect("Failed to spawn thread to watch meta.json");
|
||||
Ok(watcher_wrapper)
|
||||
}
|
||||
|
||||
pub fn watch(&mut self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
self.inner.watcher_router.subscribe(watch_callback)
|
||||
}
|
||||
}
|
||||
|
||||
/// Directory storing data in files, read via mmap.
|
||||
///
|
||||
/// The Mmap object are cached to limit the
|
||||
@@ -131,31 +222,62 @@ impl MmapCache {
|
||||
/// On Windows the semantics are again different.
|
||||
#[derive(Clone)]
|
||||
pub struct MmapDirectory {
|
||||
inner: Arc<MmapDirectoryInner>,
|
||||
}
|
||||
|
||||
struct MmapDirectoryInner {
|
||||
root_path: PathBuf,
|
||||
mmap_cache: Arc<RwLock<MmapCache>>,
|
||||
_temp_directory: Arc<Option<TempDir>>,
|
||||
mmap_cache: RwLock<MmapCache>,
|
||||
_temp_directory: Option<TempDir>,
|
||||
watcher: RwLock<WatcherWrapper>,
|
||||
}
|
||||
|
||||
impl MmapDirectoryInner {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
) -> Result<MmapDirectoryInner, OpenDirectoryError> {
|
||||
let watch_wrapper = WatcherWrapper::new(&root_path)?;
|
||||
let mmap_directory_inner = MmapDirectoryInner {
|
||||
root_path,
|
||||
mmap_cache: Default::default(),
|
||||
_temp_directory: temp_directory,
|
||||
watcher: RwLock::new(watch_wrapper),
|
||||
};
|
||||
Ok(mmap_directory_inner)
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
let mut wlock = self.watcher.write().unwrap();
|
||||
wlock.watch(watch_callback)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for MmapDirectory {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "MmapDirectory({:?})", self.root_path)
|
||||
write!(f, "MmapDirectory({:?})", self.inner.root_path)
|
||||
}
|
||||
}
|
||||
|
||||
impl MmapDirectory {
|
||||
fn new(
|
||||
root_path: PathBuf,
|
||||
temp_directory: Option<TempDir>,
|
||||
) -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let inner = MmapDirectoryInner::new(root_path, temp_directory)?;
|
||||
Ok(MmapDirectory {
|
||||
inner: Arc::new(inner),
|
||||
})
|
||||
}
|
||||
|
||||
/// Creates a new MmapDirectory in a temporary directory.
|
||||
///
|
||||
/// This is mostly useful to test the MmapDirectory itself.
|
||||
/// For your unit tests, prefer the RAMDirectory.
|
||||
pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
|
||||
let tempdir = TempDir::new("index")?;
|
||||
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let tempdir = TempDir::new("index").map_err(OpenDirectoryError::IoError)?;
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let directory = MmapDirectory {
|
||||
root_path: tempdir_path,
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(Some(tempdir)),
|
||||
};
|
||||
Ok(directory)
|
||||
MmapDirectory::new(tempdir_path, Some(tempdir))
|
||||
}
|
||||
|
||||
/// Opens a MmapDirectory in a directory.
|
||||
@@ -173,18 +295,14 @@ impl MmapDirectory {
|
||||
directory_path,
|
||||
)))
|
||||
} else {
|
||||
Ok(MmapDirectory {
|
||||
root_path: PathBuf::from(directory_path),
|
||||
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
|
||||
_temp_directory: Arc::new(None),
|
||||
})
|
||||
Ok(MmapDirectory::new(PathBuf::from(directory_path), None)?)
|
||||
}
|
||||
}
|
||||
|
||||
/// Joins a relative_path to the directory `root_path`
|
||||
/// to create a proper complete `filepath`.
|
||||
fn resolve_path(&self, relative_path: &Path) -> PathBuf {
|
||||
self.root_path.join(relative_path)
|
||||
self.inner.root_path.join(relative_path)
|
||||
}
|
||||
|
||||
/// Sync the root directory.
|
||||
@@ -202,14 +320,14 @@ impl MmapDirectory {
|
||||
#[cfg(windows)]
|
||||
{
|
||||
use std::os::windows::fs::OpenOptionsExt;
|
||||
use winapi::winbase;
|
||||
use winapi::um::winbase;
|
||||
|
||||
open_opts
|
||||
.write(true)
|
||||
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
|
||||
}
|
||||
|
||||
let fd = open_opts.open(&self.root_path)?;
|
||||
let fd = open_opts.open(&self.inner.root_path)?;
|
||||
fd.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -219,9 +337,15 @@ impl MmapDirectory {
|
||||
///
|
||||
/// The `MmapDirectory` embeds a `MmapDirectory`
|
||||
/// to avoid multiplying the `mmap` system calls.
|
||||
pub fn get_cache_info(&mut self) -> CacheInfo {
|
||||
self.mmap_cache
|
||||
pub fn get_cache_info(&self) -> CacheInfo {
|
||||
self.inner
|
||||
.mmap_cache
|
||||
.write()
|
||||
.expect("mmap cache lock is poisoned")
|
||||
.remove_weak_ref();
|
||||
self.inner
|
||||
.mmap_cache
|
||||
.read()
|
||||
.expect("Mmap cache lock is poisoned.")
|
||||
.get_info()
|
||||
}
|
||||
@@ -274,7 +398,7 @@ impl Directory for MmapDirectory {
|
||||
debug!("Open Read {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
|
||||
let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquired write lock \
|
||||
on mmap cache while reading {:?}",
|
||||
@@ -282,11 +406,34 @@ impl Directory for MmapDirectory {
|
||||
);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
|
||||
Ok(mmap_cache
|
||||
.get_mmap(&full_path)?
|
||||
.map(ReadOnlySource::Mmap)
|
||||
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
|
||||
.map(ReadOnlySource::from)
|
||||
.unwrap_or_else(ReadOnlySource::empty))
|
||||
}
|
||||
|
||||
/// Any entry associated to the path in the mmap will be
|
||||
/// removed before the file is deleted.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
debug!("Deleting file {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self
|
||||
.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Err(DeleteError::FileDoesNotExist(path.to_owned()))
|
||||
} else {
|
||||
Err(IOError::with_path(path.to_owned(), e).into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
let full_path = self.resolve_path(path);
|
||||
full_path.exists()
|
||||
}
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
||||
@@ -319,44 +466,6 @@ impl Directory for MmapDirectory {
|
||||
Ok(BufWriter::new(Box::new(writer)))
|
||||
}
|
||||
|
||||
/// Any entry associated to the path in the mmap will be
|
||||
/// removed before the file is deleted.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
debug!("Deleting file {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquired write lock \
|
||||
on mmap cache while deleting {:?}",
|
||||
path
|
||||
);
|
||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||
})?;
|
||||
mmap_cache.discard_from_cache(path);
|
||||
|
||||
// Removing the entry in the MMap cache.
|
||||
// The munmap will appear on Drop,
|
||||
// when the last reference is gone.
|
||||
mmap_cache.cache.remove(&full_path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self
|
||||
.sync_directory()
|
||||
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
|
||||
Err(e) => {
|
||||
if e.kind() == io::ErrorKind::NotFound {
|
||||
Err(DeleteError::FileDoesNotExist(path.to_owned()))
|
||||
} else {
|
||||
Err(IOError::with_path(path.to_owned(), e).into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
let full_path = self.resolve_path(path);
|
||||
full_path.exists()
|
||||
}
|
||||
|
||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
||||
let full_path = self.resolve_path(path);
|
||||
let mut buffer = Vec::new();
|
||||
@@ -403,6 +512,10 @@ impl Directory for MmapDirectory {
|
||||
_file: file,
|
||||
})))
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
self.inner.watch(watch_callback)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -412,6 +525,13 @@ mod tests {
|
||||
// The following tests are specific to the MmapDirectory
|
||||
|
||||
use super::*;
|
||||
use schema::{Schema, SchemaBuilder, TEXT};
|
||||
use std::fs;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use Index;
|
||||
use ReloadPolicy;
|
||||
|
||||
#[test]
|
||||
fn test_open_non_existant_path() {
|
||||
@@ -436,7 +556,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_cache() {
|
||||
let content = "abc".as_bytes();
|
||||
let content = b"abc";
|
||||
|
||||
// here we test if the cache releases
|
||||
// mmaps correctly.
|
||||
@@ -452,26 +572,104 @@ mod tests {
|
||||
w.flush().unwrap();
|
||||
}
|
||||
}
|
||||
{
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
|
||||
}
|
||||
for path in paths.iter() {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
|
||||
}
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
mmap_directory.delete(path).unwrap();
|
||||
assert_eq!(
|
||||
mmap_directory.get_cache_info().mmapped.len(),
|
||||
num_paths - i - 1
|
||||
);
|
||||
}
|
||||
|
||||
let mut keep = vec![];
|
||||
for (i, path) in paths.iter().enumerate() {
|
||||
keep.push(mmap_directory.open_read(path).unwrap());
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 0);
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
|
||||
for path in paths.iter() {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
|
||||
|
||||
for path in paths.iter() {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
|
||||
}
|
||||
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
|
||||
drop(keep);
|
||||
for path in paths.iter() {
|
||||
let _r = mmap_directory.open_read(path).unwrap();
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 1);
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
||||
|
||||
for path in &paths {
|
||||
mmap_directory.delete(path).unwrap();
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
||||
for path in paths.iter() {
|
||||
assert!(mmap_directory.open_read(path).is_err());
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
|
||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 30);
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_watch_wrapper() {
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let counter_clone = counter.clone();
|
||||
let tmp_dir: TempDir = tempdir::TempDir::new("test_watch_wrapper").unwrap();
|
||||
let tmp_dirpath = tmp_dir.path().to_owned();
|
||||
let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath).unwrap();
|
||||
let tmp_file = tmp_dirpath.join("coucou");
|
||||
let _handle = watch_wrapper.watch(Box::new(move || {
|
||||
counter_clone.fetch_add(1, Ordering::SeqCst);
|
||||
}));
|
||||
assert_eq!(counter.load(Ordering::SeqCst), 0);
|
||||
fs::write(&tmp_file, b"whateverwilldo").unwrap();
|
||||
thread::sleep(Duration::new(0, 1_000u32));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mmap_released() {
|
||||
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
|
||||
let mut schema_builder: SchemaBuilder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
{
|
||||
let index = Index::create(mmap_directory.clone(), schema).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for _num_commits in 0..16 {
|
||||
for _ in 0..10 {
|
||||
index_writer.add_document(doc!(text_field=>"abc"));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
for _ in 0..30 {
|
||||
index_writer.add_document(doc!(text_field=>"abc"));
|
||||
index_writer.commit().unwrap();
|
||||
reader.reload().unwrap();
|
||||
}
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let num_segments = reader.searcher().segment_readers().len();
|
||||
assert_eq!(num_segments, 4);
|
||||
assert_eq!(
|
||||
num_segments * 7,
|
||||
mmap_directory.get_cache_info().mmapped.len()
|
||||
);
|
||||
}
|
||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ mod directory_lock;
|
||||
mod managed_directory;
|
||||
mod ram_directory;
|
||||
mod read_only_source;
|
||||
mod shared_vec_slice;
|
||||
mod watch_event_router;
|
||||
|
||||
/// Errors specific to the directory module.
|
||||
pub mod error;
|
||||
@@ -22,6 +22,8 @@ pub use self::directory::{Directory, DirectoryClone};
|
||||
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
|
||||
pub use self::ram_directory::RAMDirectory;
|
||||
pub use self::read_only_source::ReadOnlySource;
|
||||
pub(crate) use self::watch_event_router::WatchCallbackList;
|
||||
pub use self::watch_event_router::{WatchCallback, WatchHandle};
|
||||
use std::io::{BufWriter, Seek, Write};
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
use common::make_io_err;
|
||||
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||
use core::META_FILEPATH;
|
||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
||||
use directory::WatchCallbackList;
|
||||
use directory::WritePtr;
|
||||
use directory::{Directory, ReadOnlySource};
|
||||
use directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
|
||||
@@ -22,13 +22,13 @@ use std::sync::{Arc, RwLock};
|
||||
///
|
||||
struct VecWriter {
|
||||
path: PathBuf,
|
||||
shared_directory: InnerDirectory,
|
||||
shared_directory: RAMDirectory,
|
||||
data: Cursor<Vec<u8>>,
|
||||
is_flushed: bool,
|
||||
}
|
||||
|
||||
impl VecWriter {
|
||||
fn new(path_buf: PathBuf, shared_directory: InnerDirectory) -> VecWriter {
|
||||
fn new(path_buf: PathBuf, shared_directory: RAMDirectory) -> VecWriter {
|
||||
VecWriter {
|
||||
path: path_buf,
|
||||
data: Cursor::new(Vec::new()),
|
||||
@@ -64,75 +64,48 @@ impl Write for VecWriter {
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.is_flushed = true;
|
||||
self.shared_directory
|
||||
.write(self.path.clone(), self.data.get_ref())?;
|
||||
let mut fs = self.shared_directory.fs.write().unwrap();
|
||||
fs.write(self.path.clone(), self.data.get_ref());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, Arc<Vec<u8>>>>>);
|
||||
#[derive(Default)]
|
||||
struct InnerDirectory {
|
||||
fs: HashMap<PathBuf, ReadOnlySource>,
|
||||
watch_router: WatchCallbackList,
|
||||
}
|
||||
|
||||
impl InnerDirectory {
|
||||
fn new() -> InnerDirectory {
|
||||
InnerDirectory(Arc::new(RwLock::new(HashMap::new())))
|
||||
}
|
||||
|
||||
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
|
||||
let mut map = self.0.write().map_err(|_| {
|
||||
make_io_err(format!(
|
||||
"Failed to lock the directory, when trying to write {:?}",
|
||||
path
|
||||
))
|
||||
})?;
|
||||
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
|
||||
Ok(prev_value.is_some())
|
||||
fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
|
||||
let data = ReadOnlySource::new(Vec::from(data));
|
||||
self.fs.insert(path, data).is_some()
|
||||
}
|
||||
|
||||
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
|
||||
self.0
|
||||
.read()
|
||||
.map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquire read lock for the \
|
||||
directory when trying to read {:?}",
|
||||
path
|
||||
);
|
||||
let io_err = make_io_err(msg);
|
||||
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
.and_then(|readable_map| {
|
||||
readable_map
|
||||
.get(path)
|
||||
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
||||
.map(Arc::clone)
|
||||
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data)))
|
||||
})
|
||||
self.fs
|
||||
.get(path)
|
||||
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
||||
.map(Clone::clone)
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
self.0
|
||||
.write()
|
||||
.map_err(|_| {
|
||||
let msg = format!(
|
||||
"Failed to acquire write lock for the \
|
||||
directory when trying to delete {:?}",
|
||||
path
|
||||
);
|
||||
let io_err = make_io_err(msg);
|
||||
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||
})
|
||||
.and_then(|mut writable_map| match writable_map.remove(path) {
|
||||
Some(_) => Ok(()),
|
||||
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
|
||||
})
|
||||
fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
match self.fs.remove(path) {
|
||||
Some(_) => Ok(()),
|
||||
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
|
||||
}
|
||||
}
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
self.0
|
||||
.read()
|
||||
.expect("Failed to get read lock directory.")
|
||||
.contains_key(path)
|
||||
self.fs.contains_key(path)
|
||||
}
|
||||
|
||||
fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
|
||||
self.watch_router.subscribe(watch_handle)
|
||||
}
|
||||
|
||||
fn total_mem_usage(&self) -> usize {
|
||||
self.fs.values().map(|f| f.len()).sum()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,33 +120,42 @@ impl fmt::Debug for RAMDirectory {
|
||||
/// It is mainly meant for unit testing.
|
||||
/// Writes are only made visible upon flushing.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Default)]
|
||||
pub struct RAMDirectory {
|
||||
fs: InnerDirectory,
|
||||
fs: Arc<RwLock<InnerDirectory>>,
|
||||
}
|
||||
|
||||
impl RAMDirectory {
|
||||
/// Constructor
|
||||
pub fn create() -> RAMDirectory {
|
||||
RAMDirectory {
|
||||
fs: InnerDirectory::new(),
|
||||
}
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Returns the sum of the size of the different files
|
||||
/// in the RAMDirectory.
|
||||
pub fn total_mem_usage(&self) -> usize {
|
||||
self.fs.read().unwrap().total_mem_usage()
|
||||
}
|
||||
}
|
||||
|
||||
impl Directory for RAMDirectory {
|
||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||
self.fs.open_read(path)
|
||||
self.fs.read().unwrap().open_read(path)
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
self.fs.write().unwrap().delete(path)
|
||||
}
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
self.fs.read().unwrap().exists(path)
|
||||
}
|
||||
|
||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
||||
let mut fs = self.fs.write().unwrap();
|
||||
let path_buf = PathBuf::from(path);
|
||||
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
|
||||
let exists = self
|
||||
.fs
|
||||
.write(path_buf.clone(), &Vec::new())
|
||||
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
|
||||
let vec_writer = VecWriter::new(path_buf.clone(), self.clone());
|
||||
let exists = fs.write(path_buf.clone(), &[]);
|
||||
// force the creation of the file to mimic the MMap directory.
|
||||
if exists {
|
||||
Err(OpenWriteError::FileAlreadyExists(path_buf))
|
||||
@@ -182,17 +164,8 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
self.fs.delete(path)
|
||||
}
|
||||
|
||||
fn exists(&self, path: &Path) -> bool {
|
||||
self.fs.exists(path)
|
||||
}
|
||||
|
||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
||||
let read = self.open_read(path)?;
|
||||
Ok(read.as_slice().to_owned())
|
||||
Ok(self.open_read(path)?.as_slice().to_owned())
|
||||
}
|
||||
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
@@ -201,10 +174,20 @@ impl Directory for RAMDirectory {
|
||||
msg.unwrap_or("Undefined".to_string())
|
||||
)));
|
||||
let path_buf = PathBuf::from(path);
|
||||
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||
self.fs.write(path_buf, &Vec::new())?;
|
||||
|
||||
// Reserve the path to prevent calls to .write() to succeed.
|
||||
self.fs.write().unwrap().write(path_buf.clone(), &[]);
|
||||
|
||||
let mut vec_writer = VecWriter::new(path_buf.clone(), self.clone());
|
||||
vec_writer.write_all(data)?;
|
||||
vec_writer.flush()?;
|
||||
if path == Path::new(&*META_FILEPATH) {
|
||||
self.fs.write().unwrap().watch_router.broadcast();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
self.fs.write().unwrap().watch(watch_callback)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use super::shared_vec_slice::SharedVecSlice;
|
||||
use common::HasLen;
|
||||
#[cfg(feature = "mmap")]
|
||||
use fst::raw::MmapReadOnly;
|
||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub type BoxedData = Box<Deref<Target = [u8]> + Send + Sync + 'static>;
|
||||
|
||||
/// Read object that represents files in tantivy.
|
||||
///
|
||||
@@ -11,12 +11,10 @@ use std::ops::Deref;
|
||||
/// the data in the form of a constant read-only `&[u8]`.
|
||||
/// Whatever happens to the directory file, the data
|
||||
/// hold by this object should never be altered or destroyed.
|
||||
pub enum ReadOnlySource {
|
||||
/// Mmap source of data
|
||||
#[cfg(feature = "mmap")]
|
||||
Mmap(MmapReadOnly),
|
||||
/// Wrapping a `Vec<u8>`
|
||||
Anonymous(SharedVecSlice),
|
||||
pub struct ReadOnlySource {
|
||||
data: Arc<BoxedData>,
|
||||
start: usize,
|
||||
stop: usize,
|
||||
}
|
||||
|
||||
unsafe impl StableDeref for ReadOnlySource {}
|
||||
@@ -30,19 +28,38 @@ impl Deref for ReadOnlySource {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Arc<BoxedData>> for ReadOnlySource {
|
||||
fn from(data: Arc<BoxedData>) -> Self {
|
||||
let len = data.len();
|
||||
ReadOnlySource {
|
||||
data,
|
||||
start: 0,
|
||||
stop: len,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ReadOnlySource {
|
||||
pub(crate) fn new<D>(data: D) -> ReadOnlySource
|
||||
where
|
||||
D: Deref<Target = [u8]> + Send + Sync + 'static,
|
||||
{
|
||||
let len = data.len();
|
||||
ReadOnlySource {
|
||||
data: Arc::new(Box::new(data)),
|
||||
start: 0,
|
||||
stop: len,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates an empty ReadOnlySource
|
||||
pub fn empty() -> ReadOnlySource {
|
||||
ReadOnlySource::Anonymous(SharedVecSlice::empty())
|
||||
ReadOnlySource::new(&[][..])
|
||||
}
|
||||
|
||||
/// Returns the data underlying the ReadOnlySource object.
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
match *self {
|
||||
#[cfg(feature = "mmap")]
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
|
||||
}
|
||||
&self.data[self.start..self.stop]
|
||||
}
|
||||
|
||||
/// Splits into 2 `ReadOnlySource`, at the offset given
|
||||
@@ -63,22 +80,18 @@ impl ReadOnlySource {
|
||||
/// worth of data in anonymous memory, and only a
|
||||
/// 1KB slice is remaining, the whole `500MBs`
|
||||
/// are retained in memory.
|
||||
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
|
||||
pub fn slice(&self, start: usize, stop: usize) -> ReadOnlySource {
|
||||
assert!(
|
||||
from_offset <= to_offset,
|
||||
start <= stop,
|
||||
"Requested negative slice [{}..{}]",
|
||||
from_offset,
|
||||
to_offset
|
||||
start,
|
||||
stop
|
||||
);
|
||||
match *self {
|
||||
#[cfg(feature = "mmap")]
|
||||
ReadOnlySource::Mmap(ref mmap_read_only) => {
|
||||
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
|
||||
ReadOnlySource::Mmap(sliced_mmap)
|
||||
}
|
||||
ReadOnlySource::Anonymous(ref shared_vec) => {
|
||||
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
|
||||
}
|
||||
assert!(stop <= self.len());
|
||||
ReadOnlySource {
|
||||
data: self.data.clone(),
|
||||
start: self.start + start,
|
||||
stop: self.start + stop,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,8 +100,7 @@ impl ReadOnlySource {
|
||||
///
|
||||
/// Equivalent to `.slice(from_offset, self.len())`
|
||||
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
|
||||
let len = self.len();
|
||||
self.slice(from_offset, len)
|
||||
self.slice(from_offset, self.len())
|
||||
}
|
||||
|
||||
/// Like `.slice(...)` but enforcing only the `to`
|
||||
@@ -102,19 +114,18 @@ impl ReadOnlySource {
|
||||
|
||||
impl HasLen for ReadOnlySource {
|
||||
fn len(&self) -> usize {
|
||||
self.as_slice().len()
|
||||
self.stop - self.start
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for ReadOnlySource {
|
||||
fn clone(&self) -> Self {
|
||||
self.slice(0, self.len())
|
||||
self.slice_from(0)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<u8>> for ReadOnlySource {
|
||||
fn from(data: Vec<u8>) -> ReadOnlySource {
|
||||
let shared_data = SharedVecSlice::from(data);
|
||||
ReadOnlySource::Anonymous(shared_data)
|
||||
ReadOnlySource::new(data)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,41 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SharedVecSlice {
|
||||
pub data: Arc<Vec<u8>>,
|
||||
pub start: usize,
|
||||
pub len: usize,
|
||||
}
|
||||
|
||||
impl SharedVecSlice {
|
||||
pub fn empty() -> SharedVecSlice {
|
||||
SharedVecSlice::new(Arc::new(Vec::new()))
|
||||
}
|
||||
|
||||
pub fn new(data: Arc<Vec<u8>>) -> SharedVecSlice {
|
||||
let data_len = data.len();
|
||||
SharedVecSlice {
|
||||
data,
|
||||
start: 0,
|
||||
len: data_len,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_slice(&self) -> &[u8] {
|
||||
&self.data[self.start..self.start + self.len]
|
||||
}
|
||||
|
||||
pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice {
|
||||
SharedVecSlice {
|
||||
data: Arc::clone(&self.data),
|
||||
start: self.start + from_offset,
|
||||
len: to_offset - from_offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Vec<u8>> for SharedVecSlice {
|
||||
fn from(data: Vec<u8>) -> SharedVecSlice {
|
||||
SharedVecSlice::new(Arc::new(data))
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,13 @@
|
||||
use super::*;
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::mem;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time;
|
||||
use std::time::Duration;
|
||||
|
||||
lazy_static! {
|
||||
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
|
||||
@@ -30,19 +36,18 @@ fn ram_directory_panics_if_flush_forgotten() {
|
||||
|
||||
fn test_simple(directory: &mut Directory) {
|
||||
{
|
||||
{
|
||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
write_file.write_all(&[4]).unwrap();
|
||||
write_file.write_all(&[3]).unwrap();
|
||||
write_file.write_all(&[7, 3, 5]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
write_file.write_all(&[4]).unwrap();
|
||||
write_file.write_all(&[3]).unwrap();
|
||||
write_file.write_all(&[7, 3, 5]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
{
|
||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||
let data: &[u8] = &*read_file;
|
||||
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
|
||||
}
|
||||
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
assert!(!directory.exists(*TEST_PATH));
|
||||
}
|
||||
@@ -121,6 +126,41 @@ fn test_directory(directory: &mut Directory) {
|
||||
test_directory_delete(directory);
|
||||
test_lock_non_blocking(directory);
|
||||
test_lock_blocking(directory);
|
||||
test_watch(directory);
|
||||
}
|
||||
|
||||
fn test_watch(directory: &mut Directory) {
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let counter_clone = counter.clone();
|
||||
let watch_callback = Box::new(move || {
|
||||
counter_clone.fetch_add(1, Ordering::SeqCst);
|
||||
});
|
||||
assert!(directory
|
||||
.atomic_write(Path::new("meta.json"), b"random_test_data")
|
||||
.is_ok());
|
||||
thread::sleep(Duration::new(0, 10_000));
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
|
||||
let watch_handle = directory.watch(watch_callback);
|
||||
for i in 0..10 {
|
||||
assert_eq!(i, counter.load(Ordering::SeqCst));
|
||||
assert!(directory
|
||||
.atomic_write(Path::new("meta.json"), b"random_test_data_2")
|
||||
.is_ok());
|
||||
for _ in 0..100 {
|
||||
if counter.load(Ordering::SeqCst) > i {
|
||||
break;
|
||||
}
|
||||
thread::sleep(Duration::from_millis(10));
|
||||
}
|
||||
assert_eq!(i + 1, counter.load(Ordering::SeqCst));
|
||||
}
|
||||
mem::drop(watch_handle);
|
||||
assert!(directory
|
||||
.atomic_write(Path::new("meta.json"), b"random_test_data")
|
||||
.is_ok());
|
||||
thread::sleep(Duration::from_millis(200));
|
||||
assert_eq!(10, counter.load(Ordering::SeqCst));
|
||||
}
|
||||
|
||||
fn test_lock_non_blocking(directory: &mut Directory) {
|
||||
|
||||
156
src/directory/watch_event_router.rs
Normal file
156
src/directory/watch_event_router.rs
Normal file
@@ -0,0 +1,156 @@
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::Weak;
|
||||
|
||||
/// Type alias for callbacks registered when watching files of a `Directory`.
|
||||
pub type WatchCallback = Box<Fn() -> () + Sync + Send>;
|
||||
|
||||
/// Helper struct to implement the watch method in `Directory` implementations.
|
||||
///
|
||||
/// It registers callbacks (See `.subscribe(...)`) and
|
||||
/// calls them upon calls to `.broadcast(...)`.
|
||||
#[derive(Default)]
|
||||
pub struct WatchCallbackList {
|
||||
router: RwLock<Vec<Weak<WatchCallback>>>,
|
||||
}
|
||||
|
||||
/// Controls how long a directory should watch for a file change.
|
||||
///
|
||||
/// After all the clones of `WatchHandle` are dropped, the associated will not be called when a
|
||||
/// file change is detected.
|
||||
#[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
|
||||
#[derive(Clone)]
|
||||
pub struct WatchHandle(Arc<WatchCallback>);
|
||||
|
||||
impl WatchCallbackList {
|
||||
/// Suscribes a new callback and returns a handle that controls the lifetime of the callback.
|
||||
pub fn subscribe(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
let watch_callback_arc = Arc::new(watch_callback);
|
||||
let watch_callback_weak = Arc::downgrade(&watch_callback_arc);
|
||||
self.router.write().unwrap().push(watch_callback_weak);
|
||||
WatchHandle(watch_callback_arc)
|
||||
}
|
||||
|
||||
fn list_callback(&self) -> Vec<Arc<WatchCallback>> {
|
||||
let mut callbacks = vec![];
|
||||
let mut router_wlock = self.router.write().unwrap();
|
||||
let mut i = 0;
|
||||
while i < router_wlock.len() {
|
||||
if let Some(watch) = router_wlock[i].upgrade() {
|
||||
callbacks.push(watch);
|
||||
i += 1;
|
||||
} else {
|
||||
router_wlock.swap_remove(i);
|
||||
}
|
||||
}
|
||||
callbacks
|
||||
}
|
||||
|
||||
/// Triggers all callbacks
|
||||
pub fn broadcast(&self) {
|
||||
let callbacks = self.list_callback();
|
||||
let spawn_res = std::thread::Builder::new()
|
||||
.name("watch-callbacks".to_string())
|
||||
.spawn(move || {
|
||||
for callback in callbacks {
|
||||
callback();
|
||||
}
|
||||
});
|
||||
if let Err(err) = spawn_res {
|
||||
error!(
|
||||
"Failed to spawn thread to call watch callbacks. Cause: {:?}",
|
||||
err
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use directory::WatchCallbackList;
|
||||
use std::mem;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
const WAIT_TIME: u64 = 20;
|
||||
|
||||
#[test]
|
||||
fn test_watch_event_router_simple() {
|
||||
let watch_event_router = WatchCallbackList::default();
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let counter_clone = counter.clone();
|
||||
let inc_callback = Box::new(move || {
|
||||
counter_clone.fetch_add(1, Ordering::SeqCst);
|
||||
});
|
||||
watch_event_router.broadcast();
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
let handle_a = watch_event_router.subscribe(inc_callback);
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
watch_event_router.broadcast();
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
assert_eq!(1, counter.load(Ordering::SeqCst));
|
||||
watch_event_router.broadcast();
|
||||
watch_event_router.broadcast();
|
||||
watch_event_router.broadcast();
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
assert_eq!(4, counter.load(Ordering::SeqCst));
|
||||
mem::drop(handle_a);
|
||||
watch_event_router.broadcast();
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
assert_eq!(4, counter.load(Ordering::SeqCst));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_watch_event_router_multiple_callback_same_key() {
|
||||
let watch_event_router = WatchCallbackList::default();
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let inc_callback = |inc: usize| {
|
||||
let counter_clone = counter.clone();
|
||||
Box::new(move || {
|
||||
counter_clone.fetch_add(inc, Ordering::SeqCst);
|
||||
})
|
||||
};
|
||||
let handle_a = watch_event_router.subscribe(inc_callback(1));
|
||||
let handle_a2 = watch_event_router.subscribe(inc_callback(10));
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
watch_event_router.broadcast();
|
||||
watch_event_router.broadcast();
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
assert_eq!(22, counter.load(Ordering::SeqCst));
|
||||
mem::drop(handle_a);
|
||||
watch_event_router.broadcast();
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
assert_eq!(32, counter.load(Ordering::SeqCst));
|
||||
mem::drop(handle_a2);
|
||||
watch_event_router.broadcast();
|
||||
watch_event_router.broadcast();
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
assert_eq!(32, counter.load(Ordering::SeqCst));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_watch_event_router_multiple_callback_different_key() {
|
||||
let watch_event_router = WatchCallbackList::default();
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let counter_clone = counter.clone();
|
||||
let inc_callback = Box::new(move || {
|
||||
counter_clone.fetch_add(1, Ordering::SeqCst);
|
||||
});
|
||||
let handle_a = watch_event_router.subscribe(inc_callback);
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
watch_event_router.broadcast();
|
||||
watch_event_router.broadcast();
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
assert_eq!(2, counter.load(Ordering::SeqCst));
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
mem::drop(handle_a);
|
||||
watch_event_router.broadcast();
|
||||
thread::sleep(Duration::from_millis(WAIT_TIME));
|
||||
assert_eq!(2, counter.load(Ordering::SeqCst));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,4 +1,5 @@
|
||||
use common::BitSet;
|
||||
use fastfield::DeleteBitSet;
|
||||
use std::borrow::Borrow;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::cmp::Ordering;
|
||||
@@ -95,9 +96,23 @@ pub trait DocSet {
|
||||
}
|
||||
|
||||
/// Returns the number documents matching.
|
||||
///
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
fn count(&mut self) -> u32 {
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
let mut count = 0u32;
|
||||
while self.advance() {
|
||||
if !delete_bitset.is_deleted(self.doc()) {
|
||||
count += 1u32;
|
||||
}
|
||||
}
|
||||
count
|
||||
}
|
||||
|
||||
/// Returns the count of documents, deleted or not.
|
||||
/// Calling this method consumes the `DocSet`.
|
||||
///
|
||||
/// Of course, the result is an upper bound of the result
|
||||
/// given by `count()`.
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
let mut count = 0u32;
|
||||
while self.advance() {
|
||||
count += 1u32;
|
||||
@@ -127,9 +142,14 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.size_hint()
|
||||
}
|
||||
|
||||
fn count(&mut self) -> u32 {
|
||||
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count()
|
||||
unboxed.count(delete_bitset)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.count_including_deleted()
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
|
||||
@@ -162,6 +162,7 @@ impl From<OpenDirectoryError> for TantivyError {
|
||||
OpenDirectoryError::NotADirectory(directory_path) => {
|
||||
TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
|
||||
}
|
||||
OpenDirectoryError::IoError(err) => TantivyError::IOError(IOError::from(err)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,17 +22,15 @@ mod tests {
|
||||
index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9]));
|
||||
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let bytes_reader = segment_reader.fast_fields().bytes(field).unwrap();
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();
|
||||
|
||||
assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
|
||||
assert!(bytes_reader.get_val(1).is_empty());
|
||||
assert_eq!(bytes_reader.get_val(2), &[255u8]);
|
||||
assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
|
||||
assert_eq!(bytes_reader.get_bytes(0), &[0u8, 1, 2, 3]);
|
||||
assert!(bytes_reader.get_bytes(1).is_empty());
|
||||
assert_eq!(bytes_reader.get_bytes(2), &[255u8]);
|
||||
assert_eq!(bytes_reader.get_bytes(3), &[1u8, 3, 5, 7, 9]);
|
||||
let long = vec![0u8; 1000];
|
||||
assert_eq!(bytes_reader.get_val(4), long.as_slice());
|
||||
assert_eq!(bytes_reader.get_bytes(4), long.as_slice());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ use DocId;
|
||||
///
|
||||
/// Reading the value for a document is done by reading the start index for it,
|
||||
/// and the start index for the next document, and keeping the bytes in between.
|
||||
#[derive(Clone)]
|
||||
pub struct BytesFastFieldReader {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
values: OwningRef<ReadOnlySource, [u8]>,
|
||||
@@ -28,10 +29,20 @@ impl BytesFastFieldReader {
|
||||
BytesFastFieldReader { idx_reader, values }
|
||||
}
|
||||
|
||||
/// Returns the bytes associated to the given `doc`
|
||||
pub fn get_val(&self, doc: DocId) -> &[u8] {
|
||||
fn range(&self, doc: DocId) -> (usize, usize) {
|
||||
let start = self.idx_reader.get(doc) as usize;
|
||||
let stop = self.idx_reader.get(doc + 1) as usize;
|
||||
(start, stop)
|
||||
}
|
||||
|
||||
/// Returns the bytes associated to the given `doc`
|
||||
pub fn get_bytes(&self, doc: DocId) -> &[u8] {
|
||||
let (start, stop) = self.range(doc);
|
||||
&self.values[start..stop]
|
||||
}
|
||||
|
||||
/// Returns the overall number of bytes in this bytes fast field.
|
||||
pub fn total_num_bytes(&self) -> usize {
|
||||
self.values.len()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,16 +53,18 @@ impl DeleteBitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns whether the document has been marked as deleted.
|
||||
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
|
||||
pub fn is_alive(&self, doc: DocId) -> bool {
|
||||
!self.is_deleted(doc)
|
||||
}
|
||||
|
||||
/// Returns true iff the document has been marked as deleted.
|
||||
#[inline(always)]
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
if self.len == 0 {
|
||||
false
|
||||
} else {
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = (*self.data)[byte_offset as usize];
|
||||
let shift = (doc & 7u32) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
let byte_offset = doc / 8u32;
|
||||
let b: u8 = (*self.data)[byte_offset as usize];
|
||||
let shift = (doc & 7u32) as u8;
|
||||
b & (1u8 << shift) != 0
|
||||
}
|
||||
|
||||
/// Summarize total space usage of this bitset.
|
||||
|
||||
@@ -30,6 +30,7 @@ pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use common;
|
||||
@@ -43,6 +44,7 @@ mod error;
|
||||
mod facet_reader;
|
||||
mod multivalued;
|
||||
mod reader;
|
||||
mod readers;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
@@ -78,10 +80,6 @@ impl FastValue for u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
@@ -89,6 +87,10 @@ impl FastValue for u64 {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for i64 {
|
||||
|
||||
@@ -7,7 +7,13 @@ pub use self::writer::MultiValueIntFastFieldWriter;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
extern crate time;
|
||||
|
||||
use self::time::Duration;
|
||||
use collector::TopDocs;
|
||||
use query::QueryParser;
|
||||
use schema::Cardinality;
|
||||
use schema::Facet;
|
||||
use schema::IntOptions;
|
||||
use schema::Schema;
|
||||
use Index;
|
||||
@@ -28,11 +34,10 @@ mod tests {
|
||||
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = reader.multi_fast_field_reader::<u64>(field).unwrap();
|
||||
let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
|
||||
{
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[4u64]);
|
||||
@@ -47,6 +52,133 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_date() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let date_field = schema_builder.add_date_field(
|
||||
"multi_date_field",
|
||||
IntOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed()
|
||||
.set_stored(),
|
||||
);
|
||||
let time_i =
|
||||
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let first_time_stamp = chrono::Utc::now();
|
||||
index_writer.add_document(
|
||||
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
|
||||
);
|
||||
index_writer.add_document(doc!(time_i=>0i64));
|
||||
// add one second
|
||||
index_writer
|
||||
.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
|
||||
// add another second
|
||||
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
|
||||
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
assert_eq!(reader.num_docs(), 4);
|
||||
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let query = parser
|
||||
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()).to_string())
|
||||
.expect("could not parse query");
|
||||
let results = searcher
|
||||
.search(&query, &TopDocs::with_limit(5))
|
||||
.expect("could not query index");
|
||||
|
||||
assert_eq!(results.len(), 1);
|
||||
for (_score, doc_address) in results {
|
||||
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
.get_first(date_field)
|
||||
.expect("cannot find value")
|
||||
.date_value()
|
||||
.timestamp(),
|
||||
first_time_stamp.timestamp()
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
.get_first(time_i)
|
||||
.expect("cannot find value")
|
||||
.i64_value(),
|
||||
1i64
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
let query = parser
|
||||
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()).to_string())
|
||||
.expect("could not parse query");
|
||||
let results = searcher
|
||||
.search(&query, &TopDocs::with_limit(5))
|
||||
.expect("could not query index");
|
||||
|
||||
assert_eq!(results.len(), 1);
|
||||
|
||||
for (_score, doc_address) in results {
|
||||
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
.get_first(date_field)
|
||||
.expect("cannot find value")
|
||||
.date_value()
|
||||
.timestamp(),
|
||||
two_secs_ahead.timestamp()
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
.get_first(time_i)
|
||||
.expect("cannot find value")
|
||||
.i64_value(),
|
||||
3i64
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: support Date range queries
|
||||
// {
|
||||
// let parser = QueryParser::for_index(&index, vec![date_field]);
|
||||
// let range_q = format!("\"{}\"..\"{}\"",
|
||||
// (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
|
||||
// (first_time_stamp + Duration::seconds(3)).to_rfc3339()
|
||||
// );
|
||||
// let query = parser.parse_query(&range_q)
|
||||
// .expect("could not parse query");
|
||||
// let results = searcher.search(&query, &TopDocs::with_limit(5))
|
||||
// .expect("could not query index");
|
||||
//
|
||||
//
|
||||
// assert_eq!(results.len(), 2);
|
||||
// for (i, doc_pair) in results.iter().enumerate() {
|
||||
// let retrieved_doc = searcher.doc(doc_pair.1).expect("cannot fetch doc");
|
||||
// let offset_sec = match i {
|
||||
// 0 => 1,
|
||||
// 1 => 3,
|
||||
// _ => panic!("should not have more than 2 docs")
|
||||
// };
|
||||
// let time_i_val = match i {
|
||||
// 0 => 2,
|
||||
// 1 => 3,
|
||||
// _ => panic!("should not have more than 2 docs")
|
||||
// };
|
||||
// assert_eq!(retrieved_doc.get_first(date_field).expect("cannot find value").date_value().timestamp(),
|
||||
// (first_time_stamp + Duration::seconds(offset_sec)).timestamp());
|
||||
// assert_eq!(retrieved_doc.get_first(time_i).expect("cannot find value").i64_value(), time_i_val);
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_i64() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -63,11 +195,10 @@ mod tests {
|
||||
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
|
||||
let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
|
||||
{
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[-4i64]);
|
||||
@@ -85,4 +216,17 @@ mod tests {
|
||||
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_many_facets() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_facet_field("facetfield");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for i in 0..100_000 {
|
||||
index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,6 +26,13 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn into_u64s_reader(self) -> MultiValueIntFastFieldReader<u64> {
|
||||
MultiValueIntFastFieldReader {
|
||||
idx_reader: self.idx_reader,
|
||||
vals_reader: self.vals_reader.into_u64_reader(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `(start, stop)`, such that the values associated
|
||||
/// to the given document are `start..stop`.
|
||||
fn range(&self, doc: DocId) -> (u64, u64) {
|
||||
@@ -39,7 +46,18 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
let (start, stop) = self.range(doc);
|
||||
let len = (stop - start) as usize;
|
||||
vals.resize(len, Item::default());
|
||||
self.vals_reader.get_range(start as u32, &mut vals[..]);
|
||||
self.vals_reader.get_range_u64(start, &mut vals[..]);
|
||||
}
|
||||
|
||||
/// Returns the number of values associated with the document `DocId`.
|
||||
pub fn num_vals(&self, doc: DocId) -> usize {
|
||||
let (start, stop) = self.range(doc);
|
||||
(stop - start) as usize
|
||||
}
|
||||
|
||||
/// Returns the overall number of values in this field .
|
||||
pub fn total_num_vals(&self) -> u64 {
|
||||
self.idx_reader.max_value()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,7 +65,7 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
||||
mod tests {
|
||||
|
||||
use core::Index;
|
||||
use schema::{Document, Facet, Schema};
|
||||
use schema::{Facet, Schema};
|
||||
|
||||
#[test]
|
||||
fn test_multifastfield_reader() {
|
||||
@@ -58,25 +76,14 @@ mod tests {
|
||||
let mut index_writer = index
|
||||
.writer_with_num_threads(1, 30_000_000)
|
||||
.expect("Failed to create index writer.");
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat2");
|
||||
doc.add_facet(facet_field, "/category/cat1");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat2");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let mut doc = Document::new();
|
||||
doc.add_facet(facet_field, "/category/cat3");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from("/category/cat2"),
|
||||
facet_field => Facet::from("/category/cat1"),
|
||||
));
|
||||
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
|
||||
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
|
||||
index_writer.commit().expect("Commit failed");
|
||||
index.load_searchers().expect("Reloading searchers");
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ use DocId;
|
||||
/// term ids when the segment is getting serialized.
|
||||
pub struct MultiValueIntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u64>,
|
||||
vals: Vec<UnorderedTermId>,
|
||||
doc_index: Vec<u64>,
|
||||
is_facet: bool,
|
||||
}
|
||||
|
||||
@@ -50,6 +50,15 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn into_u64_reader(self) -> FastFieldReader<u64> {
|
||||
FastFieldReader {
|
||||
bit_unpacker: self.bit_unpacker,
|
||||
min_value_u64: self.min_value_u64,
|
||||
max_value_u64: self.max_value_u64,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
@@ -59,7 +68,29 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// May panic if `doc` is greater than the segment
|
||||
// `maxdoc`.
|
||||
pub fn get(&self, doc: DocId) -> Item {
|
||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize))
|
||||
self.get_u64(u64::from(doc))
|
||||
}
|
||||
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
|
||||
}
|
||||
|
||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
||||
/// It works as follows... A first column contains the list of start index
|
||||
/// for each document, a second column contains the actual values.
|
||||
///
|
||||
/// The values associated to a given doc, are then
|
||||
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
|
||||
///
|
||||
/// Which means single value fast field reader can be indexed internally with
|
||||
/// something different from a `DocId`. For this use case, we want to use `u64`
|
||||
/// values.
|
||||
///
|
||||
/// See `get_range` for an actual documentation about this method.
|
||||
pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
|
||||
for (i, out) in output.iter_mut().enumerate() {
|
||||
*out = self.get_u64(start + (i as u64));
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
@@ -75,13 +106,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
///
|
||||
// TODO change start to `u64`.
|
||||
// For multifastfield, start is an index in a second fastfield, not a `DocId`
|
||||
pub fn get_range(&self, start: u32, output: &mut [Item]) {
|
||||
for (i, out) in output.iter_mut().enumerate() {
|
||||
*out = self.get(start + i as u32);
|
||||
}
|
||||
pub fn get_range(&self, start: DocId, output: &mut [Item]) {
|
||||
self.get_range_u64(u64::from(start), output);
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
|
||||
191
src/fastfield/readers.rs
Normal file
191
src/fastfield/readers.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
use common::CompositeFile;
|
||||
use fastfield::BytesFastFieldReader;
|
||||
use fastfield::MultiValueIntFastFieldReader;
|
||||
use fastfield::{FastFieldNotAvailableError, FastFieldReader};
|
||||
use schema::{Cardinality, Field, FieldType, Schema};
|
||||
use space_usage::PerFieldSpaceUsage;
|
||||
use std::collections::HashMap;
|
||||
use Result;
|
||||
|
||||
/// Provides access to all of the FastFieldReader.
|
||||
///
|
||||
/// Internally, `FastFieldReaders` have preloaded fast field readers,
|
||||
/// and just wraps several `HashMap`.
|
||||
pub struct FastFieldReaders {
|
||||
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
|
||||
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
|
||||
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
|
||||
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
|
||||
fast_bytes: HashMap<Field, BytesFastFieldReader>,
|
||||
fast_fields_composite: CompositeFile,
|
||||
}
|
||||
|
||||
enum FastType {
|
||||
I64,
|
||||
U64,
|
||||
}
|
||||
|
||||
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
|
||||
match field_type {
|
||||
FieldType::U64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::U64, cardinality)),
|
||||
FieldType::I64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::I64, cardinality)),
|
||||
FieldType::HierarchicalFacet => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldReaders {
|
||||
pub(crate) fn load_all(
|
||||
schema: &Schema,
|
||||
fast_fields_composite: &CompositeFile,
|
||||
) -> Result<FastFieldReaders> {
|
||||
let mut fast_field_readers = FastFieldReaders {
|
||||
fast_field_i64: Default::default(),
|
||||
fast_field_u64: Default::default(),
|
||||
fast_field_i64s: Default::default(),
|
||||
fast_field_u64s: Default::default(),
|
||||
fast_bytes: Default::default(),
|
||||
fast_fields_composite: fast_fields_composite.clone(),
|
||||
};
|
||||
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||
let field = Field(field_id as u32);
|
||||
let field_type = field_entry.field_type();
|
||||
if field_type == &FieldType::Bytes {
|
||||
let idx_reader = fast_fields_composite
|
||||
.open_read_with_idx(field, 0)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
|
||||
.map(FastFieldReader::open)?;
|
||||
let data = fast_fields_composite
|
||||
.open_read_with_idx(field, 1)
|
||||
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
|
||||
fast_field_readers
|
||||
.fast_bytes
|
||||
.insert(field, BytesFastFieldReader::open(idx_reader, data));
|
||||
} else if let Some((fast_type, cardinality)) = type_and_cardinality(field_type) {
|
||||
match cardinality {
|
||||
Cardinality::SingleValue => {
|
||||
if let Some(fast_field_data) = fast_fields_composite.open_read(field) {
|
||||
match fast_type {
|
||||
FastType::U64 => {
|
||||
let fast_field_reader = FastFieldReader::open(fast_field_data);
|
||||
fast_field_readers
|
||||
.fast_field_u64
|
||||
.insert(field, fast_field_reader);
|
||||
}
|
||||
FastType::I64 => {
|
||||
fast_field_readers.fast_field_i64.insert(
|
||||
field,
|
||||
FastFieldReader::open(fast_field_data.clone()),
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||
}
|
||||
}
|
||||
Cardinality::MultiValues => {
|
||||
let idx_opt = fast_fields_composite.open_read_with_idx(field, 0);
|
||||
let data_opt = fast_fields_composite.open_read_with_idx(field, 1);
|
||||
if let (Some(fast_field_idx), Some(fast_field_data)) = (idx_opt, data_opt) {
|
||||
let idx_reader = FastFieldReader::open(fast_field_idx);
|
||||
match fast_type {
|
||||
FastType::I64 => {
|
||||
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||
let multivalued_int_fast_field =
|
||||
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||
fast_field_readers
|
||||
.fast_field_i64s
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
FastType::U64 => {
|
||||
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||
let multivalued_int_fast_field =
|
||||
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||
fast_field_readers
|
||||
.fast_field_u64s
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(fast_field_readers)
|
||||
}
|
||||
|
||||
pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
|
||||
self.fast_fields_composite.space_usage()
|
||||
}
|
||||
|
||||
/// Returns the `u64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 fast field, this method returns `None`.
|
||||
pub fn u64(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||
self.fast_field_u64.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// If the field is a u64-fast field return the associated reader.
|
||||
/// If the field is a i64-fast field, return the associated u64 reader. Values are
|
||||
/// mapped from i64 to u64 using a (well the, it is unique) monotonic mapping. ///
|
||||
///
|
||||
/// This method is useful when merging segment reader.
|
||||
pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
|
||||
if let Some(u64_ff_reader) = self.u64(field) {
|
||||
return Some(u64_ff_reader);
|
||||
}
|
||||
if let Some(i64_ff_reader) = self.i64(field) {
|
||||
return Some(i64_ff_reader.into_u64_reader());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns `None`.
|
||||
pub fn i64(&self, field: Field) -> Option<FastFieldReader<i64>> {
|
||||
self.fast_field_i64.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns a `u64s` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 multi-valued fast field, this method returns `None`.
|
||||
pub fn u64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
|
||||
self.fast_field_u64s.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// If the field is a u64s-fast field return the associated reader.
|
||||
/// If the field is a i64s-fast field, return the associated u64s reader. Values are
|
||||
/// mapped from i64 to u64 using a (well the, it is unique) monotonic mapping.
|
||||
///
|
||||
/// This method is useful when merging segment reader.
|
||||
pub(crate) fn u64s_lenient(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
|
||||
if let Some(u64s_ff_reader) = self.u64s(field) {
|
||||
return Some(u64s_ff_reader);
|
||||
}
|
||||
if let Some(i64s_ff_reader) = self.i64s(field) {
|
||||
return Some(i64s_ff_reader.into_u64s_reader());
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns a `i64s` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 multi-valued fast field, this method returns `None`.
|
||||
pub fn i64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<i64>> {
|
||||
self.fast_field_i64s.get(&field).cloned()
|
||||
}
|
||||
|
||||
/// Returns the `bytes` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a bytes fast field, returns `None`.
|
||||
pub fn bytes(&self, field: Field) -> Option<BytesFastFieldReader> {
|
||||
self.fast_bytes.get(&field).cloned()
|
||||
}
|
||||
}
|
||||
@@ -13,15 +13,15 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
#[cfg(feature = "mmap")]
|
||||
fn test_indexing() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
|
||||
let id_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_from_tempdir(schema).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
let mut rng = thread_rng();
|
||||
|
||||
@@ -36,8 +36,8 @@ fn test_indexing() {
|
||||
index_writer.commit().expect("Commit failed");
|
||||
committed_docs.extend(&uncommitted_docs);
|
||||
uncommitted_docs.clear();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
// check that everything is correct.
|
||||
check_index_content(&searcher, &committed_docs);
|
||||
} else {
|
||||
|
||||
@@ -2,6 +2,7 @@ use super::operation::DeleteOperation;
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use Opstamp;
|
||||
|
||||
// The DeleteQueue is similar in conceptually to a multiple
|
||||
// consumer single producer broadcast channel.
|
||||
@@ -184,7 +185,7 @@ impl DeleteCursor {
|
||||
/// queue are consume and the next get will return None.
|
||||
/// - the next get will return the first operation with an
|
||||
/// `opstamp >= target_opstamp`.
|
||||
pub fn skip_to(&mut self, target_opstamp: u64) {
|
||||
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
|
||||
// TODO Can be optimize as we work with block.
|
||||
while self.is_behind_opstamp(target_opstamp) {
|
||||
self.advance();
|
||||
@@ -192,7 +193,7 @@ impl DeleteCursor {
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
|
||||
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
|
||||
fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
|
||||
self.get()
|
||||
.map(|operation| operation.opstamp < target_opstamp)
|
||||
.unwrap_or(false)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::sync::Arc;
|
||||
use DocId;
|
||||
use Opstamp;
|
||||
|
||||
// Doc to opstamp is used to identify which
|
||||
// document should be deleted.
|
||||
@@ -23,7 +24,7 @@ pub enum DocToOpstampMapping {
|
||||
}
|
||||
|
||||
impl From<Vec<u64>> for DocToOpstampMapping {
|
||||
fn from(opstamps: Vec<u64>) -> DocToOpstampMapping {
|
||||
fn from(opstamps: Vec<Opstamp>) -> DocToOpstampMapping {
|
||||
DocToOpstampMapping::WithMap(Arc::new(opstamps))
|
||||
}
|
||||
}
|
||||
@@ -35,7 +36,7 @@ impl DocToOpstampMapping {
|
||||
//
|
||||
// The edge case opstamp = some doc opstamp is in practise
|
||||
// never called.
|
||||
pub fn compute_doc_limit(&self, target_opstamp: u64) -> DocId {
|
||||
pub fn compute_doc_limit(&self, target_opstamp: Opstamp) -> DocId {
|
||||
match *self {
|
||||
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
|
||||
match doc_opstamps.binary_search(&target_opstamp) {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::operation::AddOperation;
|
||||
use super::operation::{AddOperation, UserOperation};
|
||||
use super::segment_updater::SegmentUpdater;
|
||||
use super::PreparedCommit;
|
||||
use bit_set::BitSet;
|
||||
@@ -26,9 +26,11 @@ use schema::Document;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use std::mem;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use Opstamp;
|
||||
use Result;
|
||||
|
||||
// Size of the margin for the heap. A segment is closed when the remaining memory
|
||||
@@ -43,8 +45,8 @@ pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
|
||||
type DocumentSender = channel::Sender<AddOperation>;
|
||||
type DocumentReceiver = channel::Receiver<AddOperation>;
|
||||
type OperationSender = channel::Sender<Vec<AddOperation>>;
|
||||
type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
@@ -84,8 +86,8 @@ pub struct IndexWriter {
|
||||
|
||||
workers_join_handle: Vec<JoinHandle<Result<()>>>,
|
||||
|
||||
document_receiver: DocumentReceiver,
|
||||
document_sender: DocumentSender,
|
||||
operation_receiver: OperationReceiver,
|
||||
operation_sender: OperationSender,
|
||||
|
||||
segment_updater: SegmentUpdater,
|
||||
|
||||
@@ -98,7 +100,7 @@ pub struct IndexWriter {
|
||||
delete_queue: DeleteQueue,
|
||||
|
||||
stamper: Stamper,
|
||||
committed_opstamp: u64,
|
||||
committed_opstamp: Opstamp,
|
||||
}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
@@ -132,7 +134,7 @@ pub fn open_index_writer(
|
||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
@@ -150,8 +152,8 @@ pub fn open_index_writer(
|
||||
heap_size_in_bytes_per_thread,
|
||||
index: index.clone(),
|
||||
|
||||
document_receiver,
|
||||
document_sender,
|
||||
operation_receiver: document_receiver,
|
||||
operation_sender: document_sender,
|
||||
|
||||
segment_updater,
|
||||
|
||||
@@ -176,7 +178,7 @@ pub fn compute_deleted_bitset(
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &DocToOpstampMapping,
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
) -> Result<bool> {
|
||||
let mut might_have_changed = false;
|
||||
|
||||
@@ -218,7 +220,7 @@ pub fn compute_deleted_bitset(
|
||||
pub fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
) -> Result<()> {
|
||||
{
|
||||
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
|
||||
@@ -258,7 +260,7 @@ pub fn advance_deletes(
|
||||
write_delete_bitset(&delete_bitset, &mut delete_file)?;
|
||||
}
|
||||
}
|
||||
segment_entry.set_meta((*segment.meta()).clone());
|
||||
segment_entry.set_meta(segment.meta().clone());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -266,7 +268,7 @@ fn index_documents(
|
||||
memory_budget: usize,
|
||||
segment: &Segment,
|
||||
generation: usize,
|
||||
document_iterator: &mut Iterator<Item = AddOperation>,
|
||||
document_iterator: &mut Iterator<Item = Vec<AddOperation>>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
@@ -274,11 +276,11 @@ fn index_documents(
|
||||
let segment_id = segment.id();
|
||||
let table_size = initial_table_size(memory_budget);
|
||||
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
|
||||
for doc in document_iterator {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
|
||||
for documents in document_iterator {
|
||||
for doc in documents {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
}
|
||||
let mem_usage = segment_writer.mem_usage();
|
||||
|
||||
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
|
||||
info!(
|
||||
"Buffer limit reached, flushing segment with maxdoc={}.",
|
||||
@@ -298,11 +300,11 @@ fn index_documents(
|
||||
// the worker thread.
|
||||
assert!(num_docs > 0);
|
||||
|
||||
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
|
||||
let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
|
||||
|
||||
let segment_meta = SegmentMeta::new(segment_id, num_docs);
|
||||
|
||||
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
|
||||
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
|
||||
|
||||
let delete_bitset_opt = if delete_cursor.get().is_some() {
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
@@ -330,11 +332,12 @@ fn index_documents(
|
||||
}
|
||||
|
||||
impl IndexWriter {
|
||||
/// The index writer
|
||||
/// If there are some merging threads, blocks until they all finish their work and
|
||||
/// then drop the `IndexWriter`.
|
||||
pub fn wait_merging_threads(mut self) -> Result<()> {
|
||||
// this will stop the indexing thread,
|
||||
// dropping the last reference to the segment_updater.
|
||||
drop(self.document_sender);
|
||||
drop(self.operation_sender);
|
||||
|
||||
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
|
||||
for join_handle in former_workers_handles {
|
||||
@@ -381,9 +384,8 @@ impl IndexWriter {
|
||||
|
||||
/// Spawns a new worker thread for indexing.
|
||||
/// The thread consumes documents from the pipeline.
|
||||
///
|
||||
fn add_indexing_worker(&mut self) -> Result<()> {
|
||||
let document_receiver_clone = self.document_receiver.clone();
|
||||
let document_receiver_clone = self.operation_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
|
||||
let generation = self.generation;
|
||||
@@ -409,8 +411,12 @@ impl IndexWriter {
|
||||
// this is a valid guarantee as the
|
||||
// peeked document now belongs to
|
||||
// our local iterator.
|
||||
if let Some(operation) = document_iterator.peek() {
|
||||
delete_cursor.skip_to(operation.opstamp);
|
||||
if let Some(operations) = document_iterator.peek() {
|
||||
if let Some(first) = operations.first() {
|
||||
delete_cursor.skip_to(first.opstamp);
|
||||
} else {
|
||||
return Ok(());
|
||||
}
|
||||
} else {
|
||||
// No more documents.
|
||||
// Happens when there is a commit, or if the `IndexWriter`
|
||||
@@ -456,6 +462,52 @@ impl IndexWriter {
|
||||
self.segment_updater.garbage_collect_files()
|
||||
}
|
||||
|
||||
/// Deletes all documents from the index
|
||||
///
|
||||
/// Requires `commit`ing
|
||||
/// Enables users to rebuild the index,
|
||||
/// by clearing and resubmitting necessary documents
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::schema::*;
|
||||
/// use tantivy::Index;
|
||||
///
|
||||
/// fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
/// let title = schema_builder.add_text_field("title", TEXT | STORED);
|
||||
/// let schema = schema_builder.build();
|
||||
///
|
||||
/// let index = Index::create_in_ram(schema.clone());
|
||||
///
|
||||
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
|
||||
/// index_writer.add_document(doc!(title => "The modern Promotheus"));
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let clear_res = index_writer.delete_all_documents().unwrap();
|
||||
/// // have to commit, otherwise deleted terms remain available
|
||||
/// index_writer.commit()?;
|
||||
///
|
||||
/// let searcher = index.reader()?.searcher();
|
||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||
/// let query_promo = query_parser.parse_query("Promotheus")?;
|
||||
/// let top_docs_promo = searcher.search(&query_promo, &TopDocs::with_limit(1))?;
|
||||
///
|
||||
/// assert!(top_docs_promo.is_empty());
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
pub fn delete_all_documents(&mut self) -> Result<Opstamp> {
|
||||
// Delete segments
|
||||
self.segment_updater.remove_all_segments();
|
||||
// Return new stamp - reverted stamp
|
||||
self.stamper.revert(self.committed_opstamp);
|
||||
Ok(self.committed_opstamp)
|
||||
}
|
||||
|
||||
/// Merges a given list of segments
|
||||
///
|
||||
/// `segment_ids` is required to be non-empty.
|
||||
@@ -474,29 +526,32 @@ impl IndexWriter {
|
||||
/// when no documents are remaining.
|
||||
///
|
||||
/// Returns the former segment_ready channel.
|
||||
fn recreate_document_channel(&mut self) -> DocumentReceiver {
|
||||
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||
fn recreate_document_channel(&mut self) -> OperationReceiver {
|
||||
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
mem::replace(&mut self.document_sender, document_sender);
|
||||
mem::replace(&mut self.document_receiver, document_receiver)
|
||||
mem::replace(&mut self.operation_sender, document_sender);
|
||||
mem::replace(&mut self.operation_receiver, document_receiver)
|
||||
}
|
||||
|
||||
/// Rollback to the last commit
|
||||
///
|
||||
/// This cancels all of the update that
|
||||
/// happened before after the last commit.
|
||||
/// This cancels all of the updates that
|
||||
/// happened after the last commit.
|
||||
/// After calling rollback, the index is in the same
|
||||
/// state as it was after the last commit.
|
||||
///
|
||||
/// The opstamp at the last commit is returned.
|
||||
pub fn rollback(&mut self) -> Result<()> {
|
||||
pub fn rollback(&mut self) -> Result<Opstamp> {
|
||||
info!("Rolling back to opstamp {}", self.committed_opstamp);
|
||||
self.rollback_impl()
|
||||
}
|
||||
|
||||
/// Private, implementation of rollback
|
||||
fn rollback_impl(&mut self) -> Result<Opstamp> {
|
||||
// marks the segment updater as killed. From now on, all
|
||||
// segment updates will be ignored.
|
||||
self.segment_updater.kill();
|
||||
|
||||
let document_receiver = self.document_receiver.clone();
|
||||
let document_receiver = self.operation_receiver.clone();
|
||||
|
||||
// take the directory lock to create a new index_writer.
|
||||
let directory_lock = self
|
||||
@@ -524,7 +579,7 @@ impl IndexWriter {
|
||||
// was dropped with the index_writer.
|
||||
for _ in document_receiver.clone() {}
|
||||
|
||||
Ok(())
|
||||
Ok(self.committed_opstamp)
|
||||
}
|
||||
|
||||
/// Prepares a commit.
|
||||
@@ -562,7 +617,7 @@ impl IndexWriter {
|
||||
info!("Preparing commit");
|
||||
|
||||
// this will drop the current document channel
|
||||
// and recreate a new one channels.
|
||||
// and recreate a new one.
|
||||
self.recreate_document_channel();
|
||||
|
||||
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
|
||||
@@ -596,7 +651,7 @@ impl IndexWriter {
|
||||
/// Commit returns the `opstamp` of the last document
|
||||
/// that made it in the commit.
|
||||
///
|
||||
pub fn commit(&mut self) -> Result<u64> {
|
||||
pub fn commit(&mut self) -> Result<Opstamp> {
|
||||
self.prepare_commit()?.commit()
|
||||
}
|
||||
|
||||
@@ -612,7 +667,7 @@ impl IndexWriter {
|
||||
///
|
||||
/// Like adds, the deletion itself will be visible
|
||||
/// only after calling `commit()`.
|
||||
pub fn delete_term(&mut self, term: Term) -> u64 {
|
||||
pub fn delete_term(&self, term: Term) -> Opstamp {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let delete_operation = DeleteOperation { opstamp, term };
|
||||
self.delete_queue.push(delete_operation);
|
||||
@@ -626,7 +681,7 @@ impl IndexWriter {
|
||||
///
|
||||
/// This is also the opstamp of the commit that is currently
|
||||
/// available for searchers.
|
||||
pub fn commit_opstamp(&self) -> u64 {
|
||||
pub fn commit_opstamp(&self) -> Opstamp {
|
||||
self.committed_opstamp
|
||||
}
|
||||
|
||||
@@ -640,28 +695,171 @@ impl IndexWriter {
|
||||
///
|
||||
/// Currently it represents the number of documents that
|
||||
/// have been added since the creation of the index.
|
||||
pub fn add_document(&mut self, document: Document) -> u64 {
|
||||
pub fn add_document(&self, document: Document) -> Opstamp {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let add_operation = AddOperation { opstamp, document };
|
||||
let send_result = self.document_sender.send(add_operation);
|
||||
let send_result = self.operation_sender.send(vec![add_operation]);
|
||||
if let Err(e) = send_result {
|
||||
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
|
||||
}
|
||||
opstamp
|
||||
}
|
||||
|
||||
/// Gets a range of stamps from the stamper and "pops" the last stamp
|
||||
/// from the range returning a tuple of the last optstamp and the popped
|
||||
/// range.
|
||||
///
|
||||
/// The total number of stamps generated by this method is `count + 1`;
|
||||
/// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
|
||||
/// is for the batch itself.
|
||||
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
|
||||
let Range { start, end } = self.stamper.stamps(count + 1u64);
|
||||
let last_opstamp = end - 1;
|
||||
let stamps = Range {
|
||||
start,
|
||||
end: last_opstamp,
|
||||
};
|
||||
(last_opstamp, stamps)
|
||||
}
|
||||
|
||||
/// Runs a group of document operations ensuring that the operations are
|
||||
/// assigned contigous u64 opstamps and that add operations of the same
|
||||
/// group are flushed into the same segment.
|
||||
///
|
||||
/// If the indexing pipeline is full, this call may block.
|
||||
///
|
||||
/// Each operation of the given `user_operations` will receive an in-order,
|
||||
/// contiguous u64 opstamp. The entire batch itself is also given an
|
||||
/// opstamp that is 1 greater than the last given operation. This
|
||||
/// `batch_opstamp` is the return value of `run`. An empty group of
|
||||
/// `user_operations`, an empty `Vec<UserOperation>`, still receives
|
||||
/// a valid opstamp even though no changes were _actually_ made to the index.
|
||||
///
|
||||
/// Like adds and deletes (see `IndexWriter.add_document` and
|
||||
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
|
||||
/// visible to readers only after calling `commit()`.
|
||||
pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
|
||||
let count = user_operations.len() as u64;
|
||||
if count == 0 {
|
||||
return self.stamper.stamp();
|
||||
}
|
||||
let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
|
||||
|
||||
let mut adds: Vec<AddOperation> = Vec::new();
|
||||
|
||||
for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
|
||||
match user_op {
|
||||
UserOperation::Delete(term) => {
|
||||
let delete_operation = DeleteOperation { opstamp, term };
|
||||
self.delete_queue.push(delete_operation);
|
||||
}
|
||||
UserOperation::Add(document) => {
|
||||
let add_operation = AddOperation { opstamp, document };
|
||||
adds.push(add_operation);
|
||||
}
|
||||
}
|
||||
}
|
||||
let send_result = self.operation_sender.send(adds);
|
||||
if let Err(e) = send_result {
|
||||
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
|
||||
};
|
||||
|
||||
batch_opstamp
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::super::operation::UserOperation;
|
||||
use super::initial_table_size;
|
||||
use collector::TopDocs;
|
||||
use directory::error::LockError;
|
||||
use error::*;
|
||||
use indexer::NoMergePolicy;
|
||||
use schema::{self, Document};
|
||||
use query::TermQuery;
|
||||
use schema::{self, IndexRecordOption};
|
||||
use Index;
|
||||
use ReloadPolicy;
|
||||
use Term;
|
||||
|
||||
#[test]
|
||||
fn test_operations_group() {
|
||||
// an operations group with 2 items should cause 3 opstamps 0, 1, and 2.
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let operations = vec![
|
||||
UserOperation::Add(doc!(text_field=>"a")),
|
||||
UserOperation::Add(doc!(text_field=>"b")),
|
||||
];
|
||||
let batch_opstamp1 = index_writer.run(operations);
|
||||
assert_eq!(batch_opstamp1, 2u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ordered_batched_operations() {
|
||||
// * one delete for `doc!(field=>"a")`
|
||||
// * one add for `doc!(field=>"a")`
|
||||
// * one add for `doc!(field=>"b")`
|
||||
// * one delete for `doc!(field=>"b")`
|
||||
// after commit there is one doc with "a" and 0 doc with "b"
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let a_term = Term::from_field_text(text_field, "a");
|
||||
let b_term = Term::from_field_text(text_field, "b");
|
||||
let operations = vec![
|
||||
UserOperation::Delete(a_term),
|
||||
UserOperation::Add(doc!(text_field=>"a")),
|
||||
UserOperation::Add(doc!(text_field=>"b")),
|
||||
UserOperation::Delete(b_term),
|
||||
];
|
||||
|
||||
index_writer.run(operations);
|
||||
index_writer.commit().expect("failed to commit");
|
||||
reader.reload().expect("failed to load searchers");
|
||||
|
||||
let a_term = Term::from_field_text(text_field, "a");
|
||||
let b_term = Term::from_field_text(text_field, "b");
|
||||
|
||||
let a_query = TermQuery::new(a_term, IndexRecordOption::Basic);
|
||||
let b_query = TermQuery::new(b_term, IndexRecordOption::Basic);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let a_docs = searcher
|
||||
.search(&a_query, &TopDocs::with_limit(1))
|
||||
.expect("search for a failed");
|
||||
|
||||
let b_docs = searcher
|
||||
.search(&b_query, &TopDocs::with_limit(1))
|
||||
.expect("search for b failed");
|
||||
|
||||
assert_eq!(a_docs.len(), 1);
|
||||
assert_eq!(b_docs.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_operations_group() {
|
||||
let schema_builder = schema::Schema::builder();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let index_writer = index.writer(3_000_000).unwrap();
|
||||
let operations1 = vec![];
|
||||
let batch_opstamp1 = index_writer.run(operations1);
|
||||
assert_eq!(batch_opstamp1, 0u64);
|
||||
let operations2 = vec![];
|
||||
let batch_opstamp2 = index_writer.run(operations2);
|
||||
assert_eq!(batch_opstamp2, 1u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lockfile_stops_duplicates() {
|
||||
let schema_builder = schema::Schema::builder();
|
||||
@@ -722,9 +920,13 @@ mod tests {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let num_docs_containing = |s: &str| {
|
||||
let searcher = index.searcher();
|
||||
let searcher = reader.searcher();
|
||||
let term = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term)
|
||||
};
|
||||
@@ -734,7 +936,6 @@ mod tests {
|
||||
let mut index_writer = index.writer(3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.rollback().unwrap();
|
||||
|
||||
assert_eq!(index_writer.commit_opstamp(), 0u64);
|
||||
assert_eq!(num_docs_containing("a"), 0);
|
||||
{
|
||||
@@ -742,13 +943,13 @@ mod tests {
|
||||
index_writer.add_document(doc!(text_field=>"c"));
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index.load_searchers().unwrap();
|
||||
reader.reload().unwrap();
|
||||
assert_eq!(num_docs_containing("a"), 0);
|
||||
assert_eq!(num_docs_containing("b"), 1);
|
||||
assert_eq!(num_docs_containing("c"), 1);
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
index.searcher();
|
||||
reader.reload().unwrap();
|
||||
reader.searcher();
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -756,32 +957,33 @@ mod tests {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let num_docs_containing = |s: &str| {
|
||||
let searcher = index.searcher();
|
||||
let term_a = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term_a)
|
||||
reader.searcher().doc_freq(&term_a)
|
||||
};
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer(12_000_000).unwrap();
|
||||
// create 8 segments with 100 tiny docs
|
||||
for _doc in 0..100 {
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a");
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
}
|
||||
index_writer.commit().expect("commit failed");
|
||||
for _doc in 0..100 {
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "a");
|
||||
index_writer.add_document(doc);
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
}
|
||||
// this should create 8 segments and trigger a merge.
|
||||
// this should create 8 segments and trigger a merge.
|
||||
index_writer.commit().expect("commit failed");
|
||||
index_writer
|
||||
.wait_merging_threads()
|
||||
.expect("waiting merging thread failed");
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
reader.reload().unwrap();
|
||||
|
||||
assert_eq!(num_docs_containing("a"), 200);
|
||||
assert!(index.searchable_segments().unwrap().len() < 8);
|
||||
@@ -848,11 +1050,15 @@ mod tests {
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let num_docs_containing = |s: &str| {
|
||||
let searcher = index.searcher();
|
||||
let term_a = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term_a)
|
||||
index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap()
|
||||
.searcher()
|
||||
.doc_freq(&term_a)
|
||||
};
|
||||
assert_eq!(num_docs_containing("a"), 0);
|
||||
assert_eq!(num_docs_containing("b"), 100);
|
||||
@@ -860,9 +1066,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(initial_table_size(100_000), 12);
|
||||
assert_eq!(initial_table_size(1_000_000), 15);
|
||||
assert_eq!(initial_table_size(10_000_000), 18);
|
||||
assert_eq!(initial_table_size(100_000), 11);
|
||||
assert_eq!(initial_table_size(1_000_000), 14);
|
||||
assert_eq!(initial_table_size(10_000_000), 17);
|
||||
assert_eq!(initial_table_size(1_000_000_000), 19);
|
||||
}
|
||||
|
||||
@@ -884,14 +1090,153 @@ mod tests {
|
||||
index_writer.add_document(doc!(text_field => "b"));
|
||||
}
|
||||
assert!(index_writer.commit().is_err());
|
||||
index.load_searchers().unwrap();
|
||||
let num_docs_containing = |s: &str| {
|
||||
let searcher = index.searcher();
|
||||
let term_a = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term_a)
|
||||
index.reader().unwrap().searcher().doc_freq(&term_a)
|
||||
};
|
||||
assert_eq!(num_docs_containing("a"), 100);
|
||||
assert_eq!(num_docs_containing("b"), 0);
|
||||
fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_add_then_delete_all_documents() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let num_docs_containing = |s: &str| {
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let term = Term::from_field_text(text_field, s);
|
||||
searcher.doc_freq(&term)
|
||||
};
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
|
||||
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
|
||||
let commit_tstamp = index_writer.commit().unwrap();
|
||||
assert!(commit_tstamp > add_tstamp);
|
||||
index_writer.delete_all_documents().unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
// Search for documents with the same term that we added
|
||||
assert_eq!(num_docs_containing("a"), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_all_documents_rollback_correct_stamp() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
|
||||
let add_tstamp = index_writer.add_document(doc!(text_field => "a"));
|
||||
|
||||
// commit documents - they are now available
|
||||
let first_commit = index_writer.commit();
|
||||
assert!(first_commit.is_ok());
|
||||
let first_commit_tstamp = first_commit.unwrap();
|
||||
assert!(first_commit_tstamp > add_tstamp);
|
||||
|
||||
// delete_all_documents the index
|
||||
let clear_tstamp = index_writer.delete_all_documents().unwrap();
|
||||
assert_eq!(clear_tstamp, add_tstamp);
|
||||
|
||||
// commit the clear command - now documents aren't available
|
||||
let second_commit = index_writer.commit();
|
||||
assert!(second_commit.is_ok());
|
||||
let second_commit_tstamp = second_commit.unwrap();
|
||||
|
||||
// add new documents again
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "b"));
|
||||
}
|
||||
|
||||
// rollback to last commit, when index was empty
|
||||
let rollback = index_writer.rollback();
|
||||
assert!(rollback.is_ok());
|
||||
let rollback_tstamp = rollback.unwrap();
|
||||
assert_eq!(rollback_tstamp, second_commit_tstamp);
|
||||
|
||||
// working with an empty index == no documents
|
||||
let term_b = Term::from_field_text(text_field, "b");
|
||||
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_b), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_all_documents_then_add() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
let res = index_writer.delete_all_documents();
|
||||
assert!(res.is_ok());
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
// add one simple doc
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
// expect the document with that term to be in the index
|
||||
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_all_documents_and_rollback() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
|
||||
// add one simple doc
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
let comm = index_writer.commit();
|
||||
assert!(comm.is_ok());
|
||||
let commit_tstamp = comm.unwrap();
|
||||
|
||||
// clear but don't commit!
|
||||
let clear_tstamp = index_writer.delete_all_documents().unwrap();
|
||||
// clear_tstamp should reset to before the last commit
|
||||
assert!(clear_tstamp < commit_tstamp);
|
||||
|
||||
// rollback
|
||||
let _rollback_tstamp = index_writer.rollback().unwrap();
|
||||
// Find original docs in the index
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
// expect the document with that term to be in the index
|
||||
assert_eq!(index.reader().unwrap().searcher().doc_freq(&term_a), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_all_documents_empty_index() {
|
||||
let schema_builder = schema::Schema::builder();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
let clear = index_writer.delete_all_documents();
|
||||
let commit = index_writer.commit();
|
||||
assert!(clear.is_ok());
|
||||
assert!(commit.is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delete_all_documents_index_twice() {
|
||||
let schema_builder = schema::Schema::builder();
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
|
||||
let clear = index_writer.delete_all_documents();
|
||||
let commit = index_writer.commit();
|
||||
assert!(clear.is_ok());
|
||||
assert!(commit.is_ok());
|
||||
let clear_again = index_writer.delete_all_documents();
|
||||
let commit_again = index_writer.commit();
|
||||
assert!(clear_again.is_ok());
|
||||
assert!(commit_again.is_ok());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ impl MergePolicy for LogMergePolicy {
|
||||
|
||||
let mut size_sorted_tuples = segments
|
||||
.iter()
|
||||
.map(|x| x.num_docs())
|
||||
.map(SegmentMeta::num_docs)
|
||||
.enumerate()
|
||||
.collect::<Vec<(usize, u32)>>();
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use census::{Inventory, TrackedObject};
|
||||
use std::collections::HashSet;
|
||||
use Opstamp;
|
||||
use SegmentId;
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -17,8 +18,8 @@ impl MergeOperationInventory {
|
||||
}
|
||||
}
|
||||
|
||||
/// A `MergeOperation` has two role.
|
||||
/// It carries all of the information required to describe a merge :
|
||||
/// A `MergeOperation` has two roles.
|
||||
/// It carries all of the information required to describe a merge:
|
||||
/// - `target_opstamp` is the opstamp up to which we want to consume the
|
||||
/// delete queue and reflect their deletes.
|
||||
/// - `segment_ids` is the list of segment to be merged.
|
||||
@@ -35,14 +36,14 @@ pub struct MergeOperation {
|
||||
}
|
||||
|
||||
struct InnerMergeOperation {
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
segment_ids: Vec<SegmentId>,
|
||||
}
|
||||
|
||||
impl MergeOperation {
|
||||
pub fn new(
|
||||
inventory: &MergeOperationInventory,
|
||||
target_opstamp: u64,
|
||||
target_opstamp: Opstamp,
|
||||
segment_ids: Vec<SegmentId>,
|
||||
) -> MergeOperation {
|
||||
let inner_merge_operation = InnerMergeOperation {
|
||||
@@ -54,7 +55,7 @@ impl MergeOperation {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn target_opstamp(&self) -> u64 {
|
||||
pub fn target_opstamp(&self) -> Opstamp {
|
||||
self.inner.target_opstamp
|
||||
}
|
||||
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use common::MAX_DOC_LIMIT;
|
||||
use core::Segment;
|
||||
use core::SegmentReader;
|
||||
use core::SerializableSegment;
|
||||
use docset::DocSet;
|
||||
use fastfield::BytesFastFieldReader;
|
||||
use fastfield::DeleteBitSet;
|
||||
use fastfield::FastFieldReader;
|
||||
use fastfield::FastFieldSerializer;
|
||||
@@ -23,6 +25,7 @@ use termdict::TermMerger;
|
||||
use termdict::TermOrdinal;
|
||||
use DocId;
|
||||
use Result;
|
||||
use TantivyError;
|
||||
|
||||
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
|
||||
let mut total_tokens = 0u64;
|
||||
@@ -70,7 +73,7 @@ fn compute_min_max_val(
|
||||
// some deleted documents,
|
||||
// we need to recompute the max / min
|
||||
(0..max_doc)
|
||||
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
|
||||
.filter(|doc_id| delete_bitset.is_alive(*doc_id))
|
||||
.map(|doc_id| u64_reader.get(doc_id))
|
||||
.minmax()
|
||||
.into_option()
|
||||
@@ -150,6 +153,14 @@ impl IndexMerger {
|
||||
readers.push(reader);
|
||||
}
|
||||
}
|
||||
if max_doc >= MAX_DOC_LIMIT {
|
||||
let err_msg = format!(
|
||||
"The segment resulting from this merge would have {} docs,\
|
||||
which exceeds the limit {}.",
|
||||
max_doc, MAX_DOC_LIMIT
|
||||
);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
Ok(IndexMerger {
|
||||
schema,
|
||||
readers,
|
||||
@@ -194,17 +205,17 @@ impl IndexMerger {
|
||||
fast_field_serializer,
|
||||
)?;
|
||||
}
|
||||
FieldType::U64(ref options) | FieldType::I64(ref options) => {
|
||||
match options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
self.write_single_fast_field(field, fast_field_serializer)?;
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
self.write_multi_fast_field(field, fast_field_serializer)?;
|
||||
}
|
||||
None => {}
|
||||
FieldType::U64(ref options)
|
||||
| FieldType::I64(ref options)
|
||||
| FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
self.write_single_fast_field(field, fast_field_serializer)?;
|
||||
}
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
self.write_multi_fast_field(field, fast_field_serializer)?;
|
||||
}
|
||||
None => {}
|
||||
},
|
||||
FieldType::Str(_) => {
|
||||
// We don't handle str fast field for the moment
|
||||
// They can be implemented using what is done
|
||||
@@ -229,7 +240,10 @@ impl IndexMerger {
|
||||
let mut max_value = u64::min_value();
|
||||
|
||||
for reader in &self.readers {
|
||||
let u64_reader: FastFieldReader<u64> = reader.fast_field_reader(field)?;
|
||||
let u64_reader: FastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.u64_lenient(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
if let Some((seg_min_val, seg_max_val)) =
|
||||
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
|
||||
{
|
||||
@@ -272,24 +286,28 @@ impl IndexMerger {
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
) -> Result<()> {
|
||||
let mut total_num_vals = 0u64;
|
||||
let mut u64s_readers: Vec<MultiValueIntFastFieldReader<u64>> = Vec::new();
|
||||
|
||||
// In the first pass, we compute the total number of vals.
|
||||
//
|
||||
// This is required by the bitpacker, as it needs to know
|
||||
// what should be the bit length use for bitpacking.
|
||||
for reader in &self.readers {
|
||||
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
|
||||
let u64s_reader = reader.fast_fields()
|
||||
.u64s_lenient(field)
|
||||
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
|
||||
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
for doc in 0u32..reader.max_doc() {
|
||||
if !delete_bitset.is_deleted(doc) {
|
||||
let start = idx_reader.get(doc);
|
||||
let end = idx_reader.get(doc + 1);
|
||||
total_num_vals += end - start;
|
||||
if delete_bitset.is_alive(doc) {
|
||||
let num_vals = u64s_reader.num_vals(doc) as u64;
|
||||
total_num_vals += num_vals;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
total_num_vals += idx_reader.max_value();
|
||||
total_num_vals += u64s_reader.total_num_vals();
|
||||
}
|
||||
u64s_readers.push(u64s_reader);
|
||||
}
|
||||
|
||||
// We can now create our `idx` serializer, and in a second pass,
|
||||
@@ -297,13 +315,10 @@ impl IndexMerger {
|
||||
let mut serialize_idx =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||
let mut idx = 0;
|
||||
for reader in &self.readers {
|
||||
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
|
||||
for doc in reader.doc_ids_alive() {
|
||||
for (segment_reader, u64s_reader) in self.readers.iter().zip(&u64s_readers) {
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
serialize_idx.add_val(idx)?;
|
||||
let start = idx_reader.get(doc);
|
||||
let end = idx_reader.get(doc + 1);
|
||||
idx += end - start;
|
||||
idx += u64s_reader.num_vals(doc) as u64;
|
||||
}
|
||||
}
|
||||
serialize_idx.add_val(idx)?;
|
||||
@@ -334,8 +349,10 @@ impl IndexMerger {
|
||||
for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
|
||||
let term_ordinal_mapping: &[TermOrdinal] =
|
||||
term_ordinal_mappings.get_segment(segment_ord);
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
||||
segment_reader.multi_fast_field_reader(field)?;
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> = segment_reader
|
||||
.fast_fields()
|
||||
.u64s(field)
|
||||
.expect("Could not find multivalued u64 fast value reader.");
|
||||
// TODO optimize if no deletes
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
@@ -367,6 +384,8 @@ impl IndexMerger {
|
||||
|
||||
let mut vals = Vec::with_capacity(100);
|
||||
|
||||
let mut ff_readers = Vec::new();
|
||||
|
||||
// Our values are bitpacked and we need to know what should be
|
||||
// our bitwidth and our minimum value before serializing any values.
|
||||
//
|
||||
@@ -375,7 +394,10 @@ impl IndexMerger {
|
||||
// maximum value and initialize our Serializer.
|
||||
for reader in &self.readers {
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
||||
reader.multi_fast_field_reader(field)?;
|
||||
reader.fast_fields().u64s_lenient(field).expect(
|
||||
"Failed to find multivalued fast field reader. This is a bug in \
|
||||
tantivy. Please report.",
|
||||
);
|
||||
for doc in reader.doc_ids_alive() {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
for &val in &vals {
|
||||
@@ -383,6 +405,7 @@ impl IndexMerger {
|
||||
max_value = cmp::max(val, max_value);
|
||||
}
|
||||
}
|
||||
ff_readers.push(ff_reader);
|
||||
// TODO optimize when no deletes
|
||||
}
|
||||
|
||||
@@ -395,9 +418,7 @@ impl IndexMerger {
|
||||
{
|
||||
let mut serialize_vals = fast_field_serializer
|
||||
.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
|
||||
for reader in &self.readers {
|
||||
let ff_reader: MultiValueIntFastFieldReader<u64> =
|
||||
reader.multi_fast_field_reader(field)?;
|
||||
for (reader, ff_reader) in self.readers.iter().zip(ff_readers) {
|
||||
// TODO optimize if no deletes
|
||||
for doc in reader.doc_ids_alive() {
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
@@ -416,19 +437,53 @@ impl IndexMerger {
|
||||
field: Field,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
) -> Result<()> {
|
||||
self.write_fast_field_idx(field, fast_field_serializer)?;
|
||||
let mut total_num_vals = 0u64;
|
||||
let mut bytes_readers: Vec<BytesFastFieldReader> = Vec::new();
|
||||
|
||||
for reader in &self.readers {
|
||||
let bytes_reader = reader.fast_fields().bytes(field).expect(
|
||||
"Failed to find bytes fast field reader. This is a bug in tantivy, please report.",
|
||||
);
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
for doc in 0u32..reader.max_doc() {
|
||||
if delete_bitset.is_alive(doc) {
|
||||
let num_vals = bytes_reader.get_bytes(doc).len() as u64;
|
||||
total_num_vals += num_vals;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
total_num_vals += bytes_reader.total_num_bytes() as u64;
|
||||
}
|
||||
bytes_readers.push(bytes_reader);
|
||||
}
|
||||
|
||||
{
|
||||
// We can now create our `idx` serializer, and in a second pass,
|
||||
// can effectively push the different indexes.
|
||||
let mut serialize_idx =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||
let mut idx = 0;
|
||||
for (segment_reader, bytes_reader) in self.readers.iter().zip(&bytes_readers) {
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
serialize_idx.add_val(idx)?;
|
||||
idx += bytes_reader.get_bytes(doc).len() as u64;
|
||||
}
|
||||
}
|
||||
serialize_idx.add_val(idx)?;
|
||||
serialize_idx.close_field()?;
|
||||
}
|
||||
|
||||
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1)?;
|
||||
for reader in &self.readers {
|
||||
let bytes_reader = reader.bytes_fast_field_reader(field)?;
|
||||
for segment_reader in &self.readers {
|
||||
let bytes_reader = segment_reader.fast_fields().bytes(field)
|
||||
.expect("Failed to find bytes field in fast field reader. This is a bug in tantivy. Please report.");
|
||||
// TODO: optimize if no deletes
|
||||
for doc in reader.doc_ids_alive() {
|
||||
let val = bytes_reader.get_val(doc);
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
let val = bytes_reader.get_bytes(doc);
|
||||
serialize_vals.write_all(val)?;
|
||||
}
|
||||
}
|
||||
serialize_vals.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -654,7 +709,7 @@ mod tests {
|
||||
use schema::IntOptions;
|
||||
use schema::Term;
|
||||
use schema::TextFieldIndexing;
|
||||
use schema::INT_INDEXED;
|
||||
use schema::INDEXED;
|
||||
use std::io::Cursor;
|
||||
use DocAddress;
|
||||
use IndexWriter;
|
||||
@@ -671,11 +726,13 @@ mod tests {
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let date_field = schema_builder.add_date_field("date", INDEXED);
|
||||
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
||||
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let curr_time = chrono::Utc::now();
|
||||
let add_score_bytes = |doc: &mut Document, score: u32| {
|
||||
let mut bytes = Vec::new();
|
||||
bytes
|
||||
@@ -692,6 +749,7 @@ mod tests {
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "af b");
|
||||
doc.add_u64(score_field, 3);
|
||||
doc.add_date(date_field, &curr_time);
|
||||
add_score_bytes(&mut doc, 3);
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
@@ -717,6 +775,7 @@ mod tests {
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
doc.add_text(text_field, "af b");
|
||||
doc.add_date(date_field, &curr_time);
|
||||
doc.add_u64(score_field, 11);
|
||||
add_score_bytes(&mut doc, 11);
|
||||
index_writer.add_document(doc);
|
||||
@@ -744,8 +803,8 @@ mod tests {
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
}
|
||||
{
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let get_doc_ids = |terms: Vec<Term>| {
|
||||
let query = BooleanQuery::new_multiterms_query(terms);
|
||||
let top_docs = searcher.search(&query, &TestCollector).unwrap();
|
||||
@@ -774,6 +833,10 @@ mod tests {
|
||||
DocAddress(0, 4)
|
||||
]
|
||||
);
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)]),
|
||||
vec![DocAddress(0, 0), DocAddress(0, 3)]
|
||||
);
|
||||
}
|
||||
{
|
||||
let doc = searcher.doc(DocAddress(0, 0)).unwrap();
|
||||
@@ -837,7 +900,7 @@ mod tests {
|
||||
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let search_term = |searcher: &Searcher, term: Term| {
|
||||
let collector = FastFieldTestCollector::for_field(score_field);
|
||||
let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
|
||||
@@ -874,8 +937,8 @@ mod tests {
|
||||
bytes_score_field => vec![0u8, 0, 0, 3],
|
||||
));
|
||||
index_writer.commit().expect("committed");
|
||||
index.load_searchers().unwrap();
|
||||
let ref searcher = *index.searcher();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||
@@ -921,8 +984,8 @@ mod tests {
|
||||
bytes_score_field => vec![0u8, 0, 27, 88],
|
||||
));
|
||||
index_writer.commit().expect("committed");
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
assert_eq!(searcher.segment_readers().len(), 2);
|
||||
assert_eq!(searcher.num_docs(), 3);
|
||||
@@ -961,14 +1024,16 @@ mod tests {
|
||||
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 4000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(1)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 1);
|
||||
assert_eq!(score_field_reader.max_value(), 3);
|
||||
@@ -983,8 +1048,8 @@ mod tests {
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.num_docs(), 3);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
|
||||
@@ -1019,7 +1084,8 @@ mod tests {
|
||||
);
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
@@ -1029,8 +1095,8 @@ mod tests {
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
@@ -1065,7 +1131,8 @@ mod tests {
|
||||
);
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 3);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
@@ -1080,9 +1147,9 @@ mod tests {
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index.load_searchers().unwrap();
|
||||
reader.reload().unwrap();
|
||||
|
||||
let ref searcher = *index.searcher();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
||||
@@ -1117,7 +1184,8 @@ mod tests {
|
||||
);
|
||||
let score_field_reader = searcher
|
||||
.segment_reader(0)
|
||||
.fast_field_reader::<u64>(score_field)
|
||||
.fast_fields()
|
||||
.u64(score_field)
|
||||
.unwrap();
|
||||
assert_eq!(score_field_reader.min_value(), 6000);
|
||||
assert_eq!(score_field_reader.max_value(), 7000);
|
||||
@@ -1130,9 +1198,9 @@ mod tests {
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
index.load_searchers().unwrap();
|
||||
reader.reload().unwrap();
|
||||
|
||||
let ref searcher = *index.searcher();
|
||||
let searcher = reader.searcher();
|
||||
assert!(segment_ids.is_empty());
|
||||
assert!(searcher.segment_readers().is_empty());
|
||||
assert_eq!(searcher.num_docs(), 0);
|
||||
@@ -1144,6 +1212,7 @@ mod tests {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet");
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let reader = index.reader().unwrap();
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
||||
@@ -1173,9 +1242,9 @@ mod tests {
|
||||
index_writer.commit().expect("committed");
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
|
||||
let searcher = index.searcher();
|
||||
let searcher = reader.searcher();
|
||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||
facet_collector.add_facet(Facet::from("/top"));
|
||||
let (count, facet_counts) = searcher
|
||||
@@ -1217,7 +1286,7 @@ mod tests {
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
reader.reload().unwrap();
|
||||
test_searcher(
|
||||
11,
|
||||
&[
|
||||
@@ -1238,7 +1307,7 @@ mod tests {
|
||||
let facet_term = Term::from_facet(facet_field, &facet);
|
||||
index_writer.delete_term(facet_term);
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
reader.reload().unwrap();
|
||||
test_searcher(
|
||||
9,
|
||||
&[
|
||||
@@ -1256,15 +1325,15 @@ mod tests {
|
||||
#[test]
|
||||
fn test_bug_merge() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("intvals", INT_INDEXED);
|
||||
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(int_field => 1u64));
|
||||
index_writer.commit().expect("commit failed");
|
||||
index_writer.add_document(doc!(int_field => 1u64));
|
||||
index_writer.commit().expect("commit failed");
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
index_writer.delete_term(Term::from_field_u64(int_field, 1));
|
||||
let segment_ids = index
|
||||
@@ -1275,10 +1344,10 @@ mod tests {
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index.load_searchers().unwrap();
|
||||
reader.reload().unwrap();
|
||||
// commit has not been called yet. The document should still be
|
||||
// there.
|
||||
assert_eq!(index.searcher().num_docs(), 2);
|
||||
assert_eq!(reader.searcher().num_docs(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1289,7 +1358,7 @@ mod tests {
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intvals", int_options);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let mut doc = Document::default();
|
||||
@@ -1310,8 +1379,8 @@ mod tests {
|
||||
.expect("Merging failed");
|
||||
|
||||
// assert delete has not been committed
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
reader.reload().expect("failed to load searcher 1");
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 2);
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
@@ -1319,13 +1388,13 @@ mod tests {
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_multivalued_int_fields() {
|
||||
fn test_merge_multivalued_int_fields_simple() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = IntOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
@@ -1342,7 +1411,6 @@ mod tests {
|
||||
}
|
||||
index_writer.add_document(doc);
|
||||
};
|
||||
|
||||
index_doc(&mut index_writer, &[1, 2]);
|
||||
index_doc(&mut index_writer, &[1, 2, 3]);
|
||||
index_doc(&mut index_writer, &[4, 5]);
|
||||
@@ -1351,24 +1419,19 @@ mod tests {
|
||||
index_doc(&mut index_writer, &[3]);
|
||||
index_doc(&mut index_writer, &[17]);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &[20]);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &[28, 27]);
|
||||
index_doc(&mut index_writer, &[1_000]);
|
||||
|
||||
index_writer.commit().expect("committed");
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let searcher = index.searcher();
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let mut vals: Vec<u64> = Vec::new();
|
||||
|
||||
{
|
||||
let segment = searcher.segment_reader(0u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[1, 2]);
|
||||
@@ -1403,7 +1466,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let segment = searcher.segment_reader(1u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[28, 27]);
|
||||
|
||||
@@ -1413,7 +1476,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let segment = searcher.segment_reader(2u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[20]);
|
||||
}
|
||||
@@ -1429,13 +1492,14 @@ mod tests {
|
||||
.expect("Failed to initiate merge")
|
||||
.wait()
|
||||
.expect("Merging failed");
|
||||
index_writer.wait_merging_threads().unwrap();
|
||||
index_writer
|
||||
.wait_merging_threads()
|
||||
.expect("Wait for merging threads");
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
reader.reload().expect("Load searcher");
|
||||
|
||||
{
|
||||
let searcher = index.searcher();
|
||||
let searcher = reader.searcher();
|
||||
println!(
|
||||
"{:?}",
|
||||
searcher
|
||||
@@ -1445,7 +1509,7 @@ mod tests {
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
let segment = searcher.segment_reader(0u32);
|
||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
|
||||
|
||||
ff_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[1, 2]);
|
||||
|
||||
@@ -1,16 +1,24 @@
|
||||
use schema::Document;
|
||||
use schema::Term;
|
||||
use Opstamp;
|
||||
|
||||
/// Timestamped Delete operation.
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
pub struct DeleteOperation {
|
||||
pub opstamp: u64,
|
||||
pub opstamp: Opstamp,
|
||||
pub term: Term,
|
||||
}
|
||||
|
||||
/// Timestamped Add operation.
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub struct AddOperation {
|
||||
pub opstamp: u64,
|
||||
pub opstamp: Opstamp,
|
||||
pub document: Document,
|
||||
}
|
||||
|
||||
/// UserOperation is an enum type that encapsulates other operation types.
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub enum UserOperation {
|
||||
Add(Document),
|
||||
Delete(Term),
|
||||
}
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
use super::IndexWriter;
|
||||
use Opstamp;
|
||||
use Result;
|
||||
|
||||
/// A prepared commit
|
||||
pub struct PreparedCommit<'a> {
|
||||
index_writer: &'a mut IndexWriter,
|
||||
payload: Option<String>,
|
||||
opstamp: u64,
|
||||
opstamp: Opstamp,
|
||||
}
|
||||
|
||||
impl<'a> PreparedCommit<'a> {
|
||||
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
|
||||
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit {
|
||||
PreparedCommit {
|
||||
index_writer,
|
||||
payload: None,
|
||||
@@ -17,7 +18,7 @@ impl<'a> PreparedCommit<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn opstamp(&self) -> u64 {
|
||||
pub fn opstamp(&self) -> Opstamp {
|
||||
self.opstamp
|
||||
}
|
||||
|
||||
@@ -25,11 +26,11 @@ impl<'a> PreparedCommit<'a> {
|
||||
self.payload = Some(payload.to_string())
|
||||
}
|
||||
|
||||
pub fn abort(self) -> Result<()> {
|
||||
pub fn abort(self) -> Result<Opstamp> {
|
||||
self.index_writer.rollback()
|
||||
}
|
||||
|
||||
pub fn commit(self) -> Result<u64> {
|
||||
pub fn commit(self) -> Result<Opstamp> {
|
||||
info!("committing {}", self.opstamp);
|
||||
self.index_writer
|
||||
.segment_updater()
|
||||
|
||||
@@ -118,6 +118,12 @@ impl SegmentManager {
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) fn remove_all_segments(&self) {
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock.committed.clear();
|
||||
registers_lock.uncommitted.clear();
|
||||
}
|
||||
|
||||
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
|
||||
let mut registers_lock = self.write();
|
||||
registers_lock.committed.clear();
|
||||
|
||||
@@ -56,7 +56,7 @@ impl SegmentRegister {
|
||||
.values()
|
||||
.map(|segment_entry| segment_entry.meta().clone())
|
||||
.collect();
|
||||
segment_ids.sort_by_key(|meta| meta.id());
|
||||
segment_ids.sort_by_key(SegmentMeta::id);
|
||||
segment_ids
|
||||
}
|
||||
|
||||
|
||||
@@ -36,14 +36,15 @@ use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use Opstamp;
|
||||
use Result;
|
||||
|
||||
/// Save the index meta file.
|
||||
/// This operation is atomic :
|
||||
/// Either
|
||||
// - it fails, in which case an error is returned,
|
||||
/// - it fails, in which case an error is returned,
|
||||
/// and the `meta.json` remains untouched,
|
||||
/// - it success, and `meta.json` is written
|
||||
/// - it succeeds, and `meta.json` is written
|
||||
/// and flushed.
|
||||
///
|
||||
/// This method is not part of tantivy's public API
|
||||
@@ -62,13 +63,14 @@ pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
|
||||
/// Save the index meta file.
|
||||
/// This operation is atomic:
|
||||
/// Either
|
||||
// - it fails, in which case an error is returned,
|
||||
/// - it fails, in which case an error is returned,
|
||||
/// and the `meta.json` remains untouched,
|
||||
/// - it success, and `meta.json` is written
|
||||
/// and flushed.
|
||||
///
|
||||
/// This method is not part of tantivy's public API
|
||||
fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
|
||||
info!("save metas");
|
||||
let mut buffer = serde_json::to_vec_pretty(metas)?;
|
||||
// Just adding a new line at the end of the buffer.
|
||||
writeln!(&mut buffer)?;
|
||||
@@ -212,6 +214,11 @@ impl SegmentUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
/// Orders `SegmentManager` to remove all segments
|
||||
pub(crate) fn remove_all_segments(&self) {
|
||||
self.0.segment_manager.remove_all_segments();
|
||||
}
|
||||
|
||||
pub fn kill(&mut self) {
|
||||
self.0.killed.store(true, Ordering::Release);
|
||||
}
|
||||
@@ -222,9 +229,9 @@ impl SegmentUpdater {
|
||||
|
||||
/// Apply deletes up to the target opstamp to all segments.
|
||||
///
|
||||
/// Tne method returns copies of the segment entries,
|
||||
/// The method returns copies of the segment entries,
|
||||
/// updated with the delete information.
|
||||
fn purge_deletes(&self, target_opstamp: u64) -> Result<Vec<SegmentEntry>> {
|
||||
fn purge_deletes(&self, target_opstamp: Opstamp) -> Result<Vec<SegmentEntry>> {
|
||||
let mut segment_entries = self.0.segment_manager.segment_entries();
|
||||
for segment_entry in &mut segment_entries {
|
||||
let segment = self.0.index.segment(segment_entry.meta().clone());
|
||||
@@ -233,7 +240,7 @@ impl SegmentUpdater {
|
||||
Ok(segment_entries)
|
||||
}
|
||||
|
||||
pub fn save_metas(&self, opstamp: u64, commit_message: Option<String>) {
|
||||
pub fn save_metas(&self, opstamp: Opstamp, commit_message: Option<String>) {
|
||||
if self.is_alive() {
|
||||
let index = &self.0.index;
|
||||
let directory = index.directory();
|
||||
@@ -280,7 +287,7 @@ impl SegmentUpdater {
|
||||
.garbage_collect(|| self.0.segment_manager.list_files());
|
||||
}
|
||||
|
||||
pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> {
|
||||
pub fn commit(&self, opstamp: Opstamp, payload: Option<String>) -> Result<()> {
|
||||
self.run_async(move |segment_updater| {
|
||||
if segment_updater.is_alive() {
|
||||
let segment_entries = segment_updater
|
||||
@@ -420,6 +427,7 @@ impl SegmentUpdater {
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
merge_candidates.extend(committed_merge_candidates.into_iter());
|
||||
|
||||
for merge_operation in merge_candidates {
|
||||
match self.start_merge_impl(merge_operation) {
|
||||
Ok(merge_future) => {
|
||||
@@ -444,38 +452,41 @@ impl SegmentUpdater {
|
||||
) -> Result<()> {
|
||||
self.run_async(move |segment_updater| {
|
||||
info!("End merge {:?}", after_merge_segment_entry.meta());
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
if let Some(delete_operation) = delete_cursor.get() {
|
||||
let committed_opstamp = segment_updater.load_metas().opstamp;
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.0.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
if let Err(e) =
|
||||
advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp)
|
||||
{
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
merge_operation.segment_ids(),
|
||||
e
|
||||
);
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
{
|
||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||
if let Some(delete_operation) = delete_cursor.get() {
|
||||
let committed_opstamp = segment_updater.load_metas().opstamp;
|
||||
if delete_operation.opstamp < committed_opstamp {
|
||||
let index = &segment_updater.0.index;
|
||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||
if let Err(e) = advance_deletes(
|
||||
segment,
|
||||
&mut after_merge_segment_entry,
|
||||
committed_opstamp,
|
||||
) {
|
||||
error!(
|
||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||
merge_operation.segment_ids(),
|
||||
e
|
||||
);
|
||||
if cfg!(test) {
|
||||
panic!("Merge failed.");
|
||||
}
|
||||
// ... cancel merge
|
||||
// `merge_operations` are tracked. As it is dropped, the
|
||||
// the segment_ids will be available again for merge.
|
||||
return;
|
||||
}
|
||||
// ... cancel merge
|
||||
// `merge_operations` are tracked. As it is dropped, the
|
||||
// the segment_ids will be available again for merge.
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
segment_updater
|
||||
.0
|
||||
.segment_manager
|
||||
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
info!("save metas");
|
||||
let previous_metas = segment_updater.load_metas();
|
||||
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
|
||||
let previous_metas = segment_updater.load_metas();
|
||||
segment_updater
|
||||
.0
|
||||
.segment_manager
|
||||
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
|
||||
} // we drop all possible handle to a now useless `SegmentMeta`.
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
})
|
||||
.wait()
|
||||
@@ -565,9 +576,8 @@ mod tests {
|
||||
index_writer.delete_term(term);
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
assert_eq!(index.searcher().num_docs(), 302);
|
||||
let reader = index.reader().unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 302);
|
||||
|
||||
{
|
||||
index_writer
|
||||
@@ -575,9 +585,9 @@ mod tests {
|
||||
.expect("waiting for merging threads");
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
assert_eq!(index.searcher().segment_readers().len(), 1);
|
||||
assert_eq!(index.searcher().num_docs(), 302);
|
||||
reader.reload().unwrap();
|
||||
assert_eq!(reader.searcher().segment_readers().len(), 1);
|
||||
assert_eq!(reader.searcher().num_docs(), 302);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -636,18 +646,45 @@ mod tests {
|
||||
.expect("waiting for merging threads");
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
assert_eq!(index.searcher().num_docs(), 0);
|
||||
let reader = index.reader().unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
|
||||
let seg_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
assert!(seg_ids.is_empty());
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
assert_eq!(index.searcher().num_docs(), 0);
|
||||
reader.reload().unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
// empty segments should be erased
|
||||
assert!(index.searchable_segment_metas().unwrap().is_empty());
|
||||
assert!(index.searcher().segment_readers().is_empty());
|
||||
assert!(reader.searcher().segment_readers().is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_remove_all_segments() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
|
||||
{
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
index_writer.add_document(doc!(text_field=>"b"));
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index_writer.segment_updater().remove_all_segments();
|
||||
let seg_vec = index_writer
|
||||
.segment_updater()
|
||||
.0
|
||||
.segment_manager
|
||||
.segment_entries();
|
||||
assert!(seg_vec.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ use fastfield::FastFieldsWriter;
|
||||
use fieldnorm::FieldNormsWriter;
|
||||
use indexer::segment_serializer::SegmentSerializer;
|
||||
use postings::MultiFieldPostingsWriter;
|
||||
use schema::FieldEntry;
|
||||
use schema::FieldType;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
@@ -15,10 +16,11 @@ use tokenizer::BoxedTokenizer;
|
||||
use tokenizer::FacetTokenizer;
|
||||
use tokenizer::{TokenStream, Tokenizer};
|
||||
use DocId;
|
||||
use Opstamp;
|
||||
use Result;
|
||||
|
||||
/// A `SegmentWriter` is in charge of creating segment index from a
|
||||
/// documents.
|
||||
/// set of documents.
|
||||
///
|
||||
/// They creates the postings list in anonymous memory.
|
||||
/// The segment is layed on disk when the segment gets `finalized`.
|
||||
@@ -28,7 +30,7 @@ pub struct SegmentWriter {
|
||||
segment_serializer: SegmentSerializer,
|
||||
fast_field_writers: FastFieldsWriter,
|
||||
fieldnorms_writer: FieldNormsWriter,
|
||||
doc_opstamps: Vec<u64>,
|
||||
doc_opstamps: Vec<Opstamp>,
|
||||
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
|
||||
}
|
||||
|
||||
@@ -53,7 +55,7 @@ impl SegmentWriter {
|
||||
schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|field_entry| field_entry.field_type())
|
||||
.map(FieldEntry::field_type)
|
||||
.map(|field_type| match *field_type {
|
||||
FieldType::Str(ref text_options) => text_options
|
||||
.get_indexing_options()
|
||||
@@ -171,6 +173,17 @@ impl SegmentWriter {
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::Date(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
let term = Term::from_field_i64(
|
||||
field_value.field(),
|
||||
field_value.value().date_value().timestamp(),
|
||||
);
|
||||
self.multifield_postings.subscribe(doc_id, &term);
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::I64(ref int_option) => {
|
||||
if int_option.is_indexed() {
|
||||
for field_value in field_values {
|
||||
|
||||
@@ -1,65 +1,39 @@
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
use Opstamp;
|
||||
|
||||
// AtomicU64 have not landed in stable.
|
||||
// For the moment let's just use AtomicUsize on
|
||||
// x86/64 bit platform, and a mutex on other platform.
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AtomicU64Ersatz(AtomicUsize);
|
||||
|
||||
impl AtomicU64Ersatz {
|
||||
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
||||
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
|
||||
}
|
||||
|
||||
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
|
||||
self.0.fetch_add(val as usize, order) as u64
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(target_arch = "x86_64"))]
|
||||
mod archicture_impl {
|
||||
|
||||
use std::sync::atomic::Ordering;
|
||||
/// Under other architecture, we rely on a mutex.
|
||||
use std::sync::RwLock;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AtomicU64Ersatz(RwLock<u64>);
|
||||
|
||||
impl AtomicU64Ersatz {
|
||||
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
|
||||
AtomicU64Ersatz(RwLock::new(first_opstamp))
|
||||
}
|
||||
|
||||
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
|
||||
let mut lock = self.0.write().unwrap();
|
||||
let previous_val = *lock;
|
||||
*lock = previous_val + incr;
|
||||
previous_val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
use self::archicture_impl::AtomicU64Ersatz;
|
||||
|
||||
/// Stamper provides Opstamps, which is just an auto-increment id to label
|
||||
/// an operation.
|
||||
///
|
||||
/// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
|
||||
#[derive(Clone, Default)]
|
||||
pub struct Stamper(Arc<AtomicU64Ersatz>);
|
||||
pub struct Stamper(Arc<AtomicU64>);
|
||||
|
||||
impl Stamper {
|
||||
pub fn new(first_opstamp: u64) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
|
||||
pub fn new(first_opstamp: Opstamp) -> Stamper {
|
||||
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
|
||||
}
|
||||
|
||||
pub fn stamp(&self) -> u64 {
|
||||
pub fn stamp(&self) -> Opstamp {
|
||||
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
|
||||
}
|
||||
|
||||
/// Given a desired count `n`, `stamps` returns an iterator that
|
||||
/// will supply `n` number of u64 stamps.
|
||||
pub fn stamps(&self, n: u64) -> Range<Opstamp> {
|
||||
let start = self.0.fetch_add(n, Ordering::SeqCst);
|
||||
Range {
|
||||
start,
|
||||
end: start + n,
|
||||
}
|
||||
}
|
||||
|
||||
/// Reverts the stamper to a given `Opstamp` value and returns it
|
||||
pub fn revert(&self, to_opstamp: Opstamp) -> Opstamp {
|
||||
self.0.store(to_opstamp, Ordering::SeqCst);
|
||||
to_opstamp
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -78,5 +52,21 @@ mod test {
|
||||
|
||||
assert_eq!(stamper.stamp(), 10u64);
|
||||
assert_eq!(stamper_clone.stamp(), 11u64);
|
||||
assert_eq!(stamper.stamps(3u64), (12..15));
|
||||
assert_eq!(stamper.stamp(), 15u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stamper_revert() {
|
||||
let stamper = Stamper::new(7u64);
|
||||
assert_eq!(stamper.stamp(), 7u64);
|
||||
assert_eq!(stamper.stamp(), 8u64);
|
||||
|
||||
let stamper_clone = stamper.clone();
|
||||
assert_eq!(stamper_clone.stamp(), 9u64);
|
||||
|
||||
stamper.revert(6);
|
||||
assert_eq!(stamper.stamp(), 6);
|
||||
assert_eq!(stamper_clone.stamp(), 7);
|
||||
}
|
||||
}
|
||||
|
||||
196
src/lib.rs
196
src/lib.rs
@@ -75,9 +75,9 @@
|
||||
//!
|
||||
//! // # Searching
|
||||
//!
|
||||
//! index.load_searchers()?;
|
||||
//! let reader = index.reader()?;
|
||||
//!
|
||||
//! let searcher = index.searcher();
|
||||
//! let searcher = reader.searcher();
|
||||
//!
|
||||
//! let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
//!
|
||||
@@ -132,13 +132,13 @@ extern crate byteorder;
|
||||
extern crate combine;
|
||||
extern crate crossbeam;
|
||||
extern crate fnv;
|
||||
extern crate fst;
|
||||
extern crate fst_regex;
|
||||
extern crate futures;
|
||||
extern crate futures_cpupool;
|
||||
extern crate htmlescape;
|
||||
extern crate itertools;
|
||||
extern crate levenshtein_automata;
|
||||
#[cfg(feature = "mmap")]
|
||||
extern crate memmap;
|
||||
extern crate num_cpus;
|
||||
extern crate owning_ref;
|
||||
extern crate regex;
|
||||
@@ -146,6 +146,7 @@ extern crate rust_stemmers;
|
||||
extern crate scoped_pool;
|
||||
extern crate serde;
|
||||
extern crate stable_deref_trait;
|
||||
extern crate tantivy_fst;
|
||||
extern crate tempdir;
|
||||
extern crate tempfile;
|
||||
extern crate uuid;
|
||||
@@ -168,11 +169,12 @@ extern crate maplit;
|
||||
extern crate test;
|
||||
|
||||
#[macro_use]
|
||||
extern crate downcast;
|
||||
extern crate downcast_rs;
|
||||
|
||||
#[macro_use]
|
||||
extern crate fail;
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
#[cfg(test)]
|
||||
mod functional_test;
|
||||
|
||||
@@ -185,11 +187,15 @@ pub use error::TantivyError;
|
||||
pub use error::TantivyError as Error;
|
||||
|
||||
extern crate census;
|
||||
pub extern crate chrono;
|
||||
extern crate owned_read;
|
||||
|
||||
/// Tantivy result.
|
||||
pub type Result<T> = std::result::Result<T, error::TantivyError>;
|
||||
|
||||
/// Tantivy DateTime
|
||||
pub type DateTime = chrono::DateTime<chrono::Utc>;
|
||||
|
||||
mod common;
|
||||
mod core;
|
||||
mod indexer;
|
||||
@@ -210,6 +216,9 @@ pub mod space_usage;
|
||||
pub mod store;
|
||||
pub mod termdict;
|
||||
|
||||
mod reader;
|
||||
|
||||
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy};
|
||||
mod snippet;
|
||||
pub use self::snippet::{Snippet, SnippetGenerator};
|
||||
|
||||
@@ -217,7 +226,7 @@ mod docset;
|
||||
pub use self::docset::{DocSet, SkipResult};
|
||||
|
||||
pub use core::SegmentComponent;
|
||||
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
|
||||
pub use core::{InvertedIndexReader, SegmentReader};
|
||||
pub use directory::Directory;
|
||||
pub use indexer::IndexWriter;
|
||||
@@ -245,6 +254,16 @@ pub mod merge_policy {
|
||||
/// as they are added in the segment.
|
||||
pub type DocId = u32;
|
||||
|
||||
/// A u64 assigned to every operation incrementally
|
||||
///
|
||||
/// All operations modifying the index receives an monotonic Opstamp.
|
||||
/// The resulting state of the index is consistent with the opstamp ordering.
|
||||
///
|
||||
/// For instance, a commit with opstamp `32_423` will reflect all Add and Delete operations
|
||||
/// with an opstamp `<= 32_423`. A delete operation with opstamp n will no affect a document added
|
||||
/// with opstamp `n+1`.
|
||||
pub type Opstamp = u64;
|
||||
|
||||
/// A f32 that represents the relevance of the document to the query
|
||||
///
|
||||
/// This is modelled internally as a `f32`. The
|
||||
@@ -298,6 +317,7 @@ mod tests {
|
||||
use Index;
|
||||
use IndexWriter;
|
||||
use Postings;
|
||||
use ReloadPolicy;
|
||||
|
||||
pub fn assert_nearly_equals(expected: f32, val: f32) {
|
||||
assert!(
|
||||
@@ -386,8 +406,8 @@ mod tests {
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
assert_eq!(searcher.doc_freq(&term_a), 3);
|
||||
let term_b = Term::from_field_text(text_field, "b");
|
||||
@@ -414,8 +434,8 @@ mod tests {
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let index_reader = index.reader().unwrap();
|
||||
let searcher = index_reader.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
{
|
||||
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field);
|
||||
@@ -450,8 +470,8 @@ mod tests {
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
|
||||
assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
|
||||
@@ -479,6 +499,11 @@ mod tests {
|
||||
let term_c = Term::from_field_text(text_field, "c");
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
@@ -500,10 +525,10 @@ mod tests {
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(text_field);
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let inverted_index = segment_reader.inverted_index(text_field);
|
||||
assert!(inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none());
|
||||
@@ -511,19 +536,19 @@ mod tests {
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert!(advance_undeleted(&mut postings, reader));
|
||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
||||
assert_eq!(postings.doc(), 5);
|
||||
assert!(!advance_undeleted(&mut postings, reader));
|
||||
assert!(!advance_undeleted(&mut postings, segment_reader));
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert!(advance_undeleted(&mut postings, reader));
|
||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(advance_undeleted(&mut postings, reader));
|
||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
||||
assert_eq!(postings.doc(), 4);
|
||||
assert!(!advance_undeleted(&mut postings, reader));
|
||||
assert!(!advance_undeleted(&mut postings, segment_reader));
|
||||
}
|
||||
}
|
||||
{
|
||||
@@ -536,10 +561,10 @@ mod tests {
|
||||
index_writer.rollback().unwrap();
|
||||
}
|
||||
{
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(term_abcd.field());
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let seg_reader = searcher.segment_reader(0);
|
||||
let inverted_index = seg_reader.inverted_index(term_abcd.field());
|
||||
|
||||
assert!(inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
@@ -548,19 +573,19 @@ mod tests {
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert!(advance_undeleted(&mut postings, reader));
|
||||
assert!(advance_undeleted(&mut postings, seg_reader));
|
||||
assert_eq!(postings.doc(), 5);
|
||||
assert!(!advance_undeleted(&mut postings, reader));
|
||||
assert!(!advance_undeleted(&mut postings, seg_reader));
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert!(advance_undeleted(&mut postings, reader));
|
||||
assert!(advance_undeleted(&mut postings, seg_reader));
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(advance_undeleted(&mut postings, reader));
|
||||
assert!(advance_undeleted(&mut postings, seg_reader));
|
||||
assert_eq!(postings.doc(), 4);
|
||||
assert!(!advance_undeleted(&mut postings, reader));
|
||||
assert!(!advance_undeleted(&mut postings, seg_reader));
|
||||
}
|
||||
}
|
||||
{
|
||||
@@ -573,10 +598,10 @@ mod tests {
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(term_abcd.field());
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let inverted_index = segment_reader.inverted_index(term_abcd.field());
|
||||
assert!(inverted_index
|
||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||
.is_none());
|
||||
@@ -584,25 +609,25 @@ mod tests {
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert!(!advance_undeleted(&mut postings, reader));
|
||||
assert!(!advance_undeleted(&mut postings, segment_reader));
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert!(advance_undeleted(&mut postings, reader));
|
||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
||||
assert_eq!(postings.doc(), 3);
|
||||
assert!(advance_undeleted(&mut postings, reader));
|
||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
||||
assert_eq!(postings.doc(), 4);
|
||||
assert!(!advance_undeleted(&mut postings, reader));
|
||||
assert!(!advance_undeleted(&mut postings, segment_reader));
|
||||
}
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)
|
||||
.unwrap();
|
||||
assert!(advance_undeleted(&mut postings, reader));
|
||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
||||
assert_eq!(postings.doc(), 4);
|
||||
assert!(!advance_undeleted(&mut postings, reader));
|
||||
assert!(!advance_undeleted(&mut postings, segment_reader));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -610,15 +635,15 @@ mod tests {
|
||||
#[test]
|
||||
fn test_indexed_u64() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field("value", INT_INDEXED);
|
||||
let field = schema_builder.add_u64_field("value", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(field=>1u64));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let term = Term::from_field_u64(field, 1u64);
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
@@ -633,7 +658,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_indexed_i64() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let value_field = schema_builder.add_i64_field("value", INT_INDEXED);
|
||||
let value_field = schema_builder.add_i64_field("value", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -641,8 +666,8 @@ mod tests {
|
||||
let negative_val = -1i64;
|
||||
index_writer.add_document(doc!(value_field => negative_val));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let term = Term::from_field_i64(value_field, negative_val);
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
@@ -664,8 +689,8 @@ mod tests {
|
||||
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
|
||||
index_writer.add_document(doc!(text_field=>"a"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
assert!(index.load_searchers().is_ok());
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
segment_reader.inverted_index(absent_field); //< should not panic
|
||||
}
|
||||
@@ -676,6 +701,11 @@ mod tests {
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
|
||||
@@ -701,8 +731,8 @@ mod tests {
|
||||
remove_document(&mut index_writer, "38");
|
||||
remove_document(&mut index_writer, "34");
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.num_docs(), 6);
|
||||
}
|
||||
|
||||
@@ -722,8 +752,8 @@ mod tests {
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let index_reader = index.reader().unwrap();
|
||||
let searcher = index_reader.searcher();
|
||||
let reader = searcher.segment_reader(0);
|
||||
let inverted_index = reader.inverted_index(text_field);
|
||||
let term_abcd = Term::from_field_text(text_field, "abcd");
|
||||
@@ -747,7 +777,7 @@ mod tests {
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
@@ -757,8 +787,8 @@ mod tests {
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
{
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let get_doc_ids = |terms: Vec<Term>| {
|
||||
let query = BooleanQuery::new_multiterms_query(terms);
|
||||
let topdocs = searcher.search(&query, &TestCollector).unwrap();
|
||||
@@ -800,25 +830,22 @@ mod tests {
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let reader = index
|
||||
.reader_builder()
|
||||
.reload_policy(ReloadPolicy::Manual)
|
||||
.try_into()
|
||||
.unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 0u64);
|
||||
{
|
||||
// writing the segment
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
{
|
||||
let doc = doc!(text_field=>"af b");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let doc = doc!(text_field=>"a b c");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{
|
||||
let doc = doc!(text_field=>"a b c d");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.add_document(doc!(text_field=>"af b"));
|
||||
index_writer.add_document(doc!(text_field=>"a b c"));
|
||||
index_writer.add_document(doc!(text_field=>"a b c d"));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.searcher();
|
||||
reader.reload().unwrap();
|
||||
assert_eq!(reader.searcher().num_docs(), 3u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -845,7 +872,7 @@ mod tests {
|
||||
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
|
||||
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let stored_int_field = schema_builder.add_u64_field("text", INT_STORED);
|
||||
let stored_int_field = schema_builder.add_u64_field("text", STORED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -855,33 +882,32 @@ mod tests {
|
||||
index_writer.add_document(document);
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(text_field);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(stored_int_field);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(stored_int_field);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_err());
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_signed);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
|
||||
assert!(fast_field_reader_opt.is_some());
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
}
|
||||
|
||||
{
|
||||
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
|
||||
assert!(fast_field_reader_res.is_ok());
|
||||
let fast_field_reader = fast_field_reader_res.unwrap();
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
|
||||
assert!(fast_field_reader_opt.is_some());
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,7 +61,7 @@ macro_rules! doc(
|
||||
};
|
||||
// if there is a trailing comma retry with the trailing comma stripped.
|
||||
($($field:expr => $value:expr),+ ,) => {
|
||||
doc!( $( $field => $value ), *);
|
||||
doc!( $( $field => $value ), *)
|
||||
};
|
||||
);
|
||||
|
||||
|
||||
@@ -34,10 +34,6 @@ const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
|
||||
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
|
||||
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;
|
||||
|
||||
lazy_static! {
|
||||
static ref BIT_PACKER: BitPacker4x = BitPacker4x::new();
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
|
||||
@@ -1,4 +1,23 @@
|
||||
use super::BIT_PACKER;
|
||||
/// Positions works as a long sequence of compressed block.
|
||||
/// All terms are chained one after the other.
|
||||
///
|
||||
/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
|
||||
/// This means we need to skip to the `nth` positions efficiently.
|
||||
///
|
||||
/// This is done thanks to two levels of skiping that we refer to in the code
|
||||
/// as `long_skip` and `short_skip`.
|
||||
///
|
||||
/// The `long_skip` makes it possible to skip every 1_024 compression blocks (= 131_072 positions).
|
||||
/// Skipping offset are simply stored one after as an offset stored over 8 bytes.
|
||||
///
|
||||
/// We find the number of long skips, as `n / long_skip`.
|
||||
///
|
||||
/// Blocks are compressed using bitpacking, so `skip_read` contains the number of bytes
|
||||
/// (values can go from 0bit to 32 bits) required to decompressed every block.
|
||||
///
|
||||
/// A given block obviously takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)`,
|
||||
/// so skipping a block without decompressing it is just a matter of advancing that many
|
||||
/// bytes.
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use directory::ReadOnlySource;
|
||||
@@ -8,9 +27,65 @@ use positions::LONG_SKIP_INTERVAL;
|
||||
use positions::LONG_SKIP_IN_BLOCKS;
|
||||
use postings::compression::compressed_block_size;
|
||||
|
||||
struct Positions {
|
||||
bit_packer: BitPacker4x,
|
||||
skip_source: ReadOnlySource,
|
||||
position_source: ReadOnlySource,
|
||||
long_skip_source: ReadOnlySource,
|
||||
}
|
||||
|
||||
impl Positions {
|
||||
pub fn new(position_source: ReadOnlySource, skip_source: ReadOnlySource) -> Positions {
|
||||
let skip_len = skip_source.len();
|
||||
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
|
||||
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
|
||||
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
|
||||
let (skip_source, long_skip_source) = body.split(body_split);
|
||||
Positions {
|
||||
bit_packer: BitPacker4x::new(),
|
||||
skip_source,
|
||||
long_skip_source,
|
||||
position_source,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the offset of the block associated to the given `long_skip_id`.
|
||||
///
|
||||
/// One `long_skip_id` means `LONG_SKIP_IN_BLOCKS` blocks.
|
||||
fn long_skip(&self, long_skip_id: usize) -> u64 {
|
||||
if long_skip_id == 0 {
|
||||
return 0;
|
||||
}
|
||||
let long_skip_slice = self.long_skip_source.as_slice();
|
||||
let mut long_skip_blocks: &[u8] = &long_skip_slice[(long_skip_id - 1) * 8..][..8];
|
||||
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted")
|
||||
}
|
||||
|
||||
fn reader(&self, offset: u64) -> PositionReader {
|
||||
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
|
||||
let small_skip = (offset % LONG_SKIP_INTERVAL) as usize;
|
||||
let offset_num_bytes: u64 = self.long_skip(long_skip_id);
|
||||
let mut position_read = OwnedRead::new(self.position_source.clone());
|
||||
position_read.advance(offset_num_bytes as usize);
|
||||
let mut skip_read = OwnedRead::new(self.skip_source.clone());
|
||||
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
|
||||
let mut position_reader = PositionReader {
|
||||
bit_packer: self.bit_packer,
|
||||
skip_read,
|
||||
position_read,
|
||||
inner_offset: 0,
|
||||
buffer: Box::new([0u32; 128]),
|
||||
ahead: None,
|
||||
};
|
||||
position_reader.skip(small_skip);
|
||||
position_reader
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PositionReader {
|
||||
skip_read: OwnedRead,
|
||||
position_read: OwnedRead,
|
||||
bit_packer: BitPacker4x,
|
||||
inner_offset: usize,
|
||||
buffer: Box<[u32; 128]>,
|
||||
ahead: Option<usize>, // if None, no block is loaded.
|
||||
@@ -27,6 +102,7 @@ pub struct PositionReader {
|
||||
// If the requested number of els ends exactly at a given block, the next
|
||||
// block is not decompressed.
|
||||
fn read_impl(
|
||||
bit_packer: BitPacker4x,
|
||||
mut position: &[u8],
|
||||
buffer: &mut [u32; 128],
|
||||
mut inner_offset: usize,
|
||||
@@ -37,21 +113,23 @@ fn read_impl(
|
||||
let mut output_len = output.len();
|
||||
let mut ahead = 0;
|
||||
loop {
|
||||
let available_len = 128 - inner_offset;
|
||||
let available_len = COMPRESSION_BLOCK_SIZE - inner_offset;
|
||||
// We have enough elements in the current block.
|
||||
// Let's copy the requested elements in the output buffer,
|
||||
// and return.
|
||||
if output_len <= available_len {
|
||||
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
|
||||
return ahead;
|
||||
} else {
|
||||
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
|
||||
output_len -= available_len;
|
||||
output_start += available_len;
|
||||
inner_offset = 0;
|
||||
let num_bits = num_bits[ahead];
|
||||
BitPacker4x::new().decompress(position, &mut buffer[..], num_bits);
|
||||
let block_len = compressed_block_size(num_bits);
|
||||
position = &position[block_len..];
|
||||
ahead += 1;
|
||||
}
|
||||
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
|
||||
output_len -= available_len;
|
||||
output_start += available_len;
|
||||
inner_offset = 0;
|
||||
let num_bits = num_bits[ahead];
|
||||
bit_packer.decompress(position, &mut buffer[..], num_bits);
|
||||
let block_len = compressed_block_size(num_bits);
|
||||
position = &position[block_len..];
|
||||
ahead += 1;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -61,35 +139,7 @@ impl PositionReader {
|
||||
skip_source: ReadOnlySource,
|
||||
offset: u64,
|
||||
) -> PositionReader {
|
||||
let skip_len = skip_source.len();
|
||||
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
|
||||
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
|
||||
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
|
||||
let (skip_body, long_skips) = body.split(body_split);
|
||||
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
|
||||
let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize;
|
||||
let offset_num_bytes: u64 = {
|
||||
if long_skip_id > 0 {
|
||||
let mut long_skip_blocks: &[u8] =
|
||||
&long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
|
||||
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16
|
||||
} else {
|
||||
0
|
||||
}
|
||||
};
|
||||
let mut position_read = OwnedRead::new(position_source);
|
||||
position_read.advance(offset_num_bytes as usize);
|
||||
let mut skip_read = OwnedRead::new(skip_body);
|
||||
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
|
||||
let mut position_reader = PositionReader {
|
||||
skip_read,
|
||||
position_read,
|
||||
inner_offset: 0,
|
||||
buffer: Box::new([0u32; 128]),
|
||||
ahead: None,
|
||||
};
|
||||
position_reader.skip(small_skip);
|
||||
position_reader
|
||||
Positions::new(position_source, skip_source).reader(offset)
|
||||
}
|
||||
|
||||
/// Fills a buffer with the next `output.len()` integers.
|
||||
@@ -101,10 +151,13 @@ impl PositionReader {
|
||||
if self.ahead != Some(0) {
|
||||
// the block currently available is not the block
|
||||
// for the current position
|
||||
BIT_PACKER.decompress(position_data, self.buffer.as_mut(), num_bits);
|
||||
self.bit_packer
|
||||
.decompress(position_data, self.buffer.as_mut(), num_bits);
|
||||
self.ahead = Some(0);
|
||||
}
|
||||
let block_len = compressed_block_size(num_bits);
|
||||
self.ahead = Some(read_impl(
|
||||
self.bit_packer,
|
||||
&position_data[block_len..],
|
||||
self.buffer.as_mut(),
|
||||
self.inner_offset,
|
||||
@@ -133,14 +186,13 @@ impl PositionReader {
|
||||
}
|
||||
});
|
||||
|
||||
let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance]
|
||||
let skip_len_in_bits = self.skip_read.as_ref()[..num_blocks_to_advance]
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|num_bit| num_bit as usize)
|
||||
.map(|num_bits| *num_bits as usize)
|
||||
.sum::<usize>()
|
||||
* (COMPRESSION_BLOCK_SIZE / 8);
|
||||
|
||||
* COMPRESSION_BLOCK_SIZE;
|
||||
let skip_len_in_bytes = skip_len_in_bits / 8;
|
||||
self.skip_read.advance(num_blocks_to_advance);
|
||||
self.position_read.advance(skip_len);
|
||||
self.position_read.advance(skip_len_in_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,29 +1,30 @@
|
||||
use super::BIT_PACKER;
|
||||
use bitpacking::BitPacker;
|
||||
use bitpacking::BitPacker4x;
|
||||
use common::BinarySerializable;
|
||||
use common::CountingWriter;
|
||||
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
|
||||
use std::io;
|
||||
use std::io::{self, Write};
|
||||
|
||||
pub struct PositionSerializer<W: io::Write> {
|
||||
write_stream: W,
|
||||
bit_packer: BitPacker4x,
|
||||
write_stream: CountingWriter<W>,
|
||||
write_skiplist: W,
|
||||
block: Vec<u32>,
|
||||
buffer: Vec<u8>,
|
||||
num_ints: u64,
|
||||
long_skips: Vec<u64>,
|
||||
cumulated_num_bits: u64,
|
||||
}
|
||||
|
||||
impl<W: io::Write> PositionSerializer<W> {
|
||||
pub fn new(write_stream: W, write_skiplist: W) -> PositionSerializer<W> {
|
||||
PositionSerializer {
|
||||
write_stream,
|
||||
bit_packer: BitPacker4x::new(),
|
||||
write_stream: CountingWriter::wrap(write_stream),
|
||||
write_skiplist,
|
||||
block: Vec::with_capacity(128),
|
||||
buffer: vec![0u8; 128 * 4],
|
||||
num_ints: 0u64,
|
||||
long_skips: Vec::new(),
|
||||
cumulated_num_bits: 0u64,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,14 +51,15 @@ impl<W: io::Write> PositionSerializer<W> {
|
||||
}
|
||||
|
||||
fn flush_block(&mut self) -> io::Result<()> {
|
||||
let num_bits = BIT_PACKER.num_bits(&self.block[..]);
|
||||
self.cumulated_num_bits += u64::from(num_bits);
|
||||
let num_bits = self.bit_packer.num_bits(&self.block[..]);
|
||||
self.write_skiplist.write_all(&[num_bits])?;
|
||||
let written_len = BIT_PACKER.compress(&self.block[..], &mut self.buffer, num_bits);
|
||||
let written_len = self
|
||||
.bit_packer
|
||||
.compress(&self.block[..], &mut self.buffer, num_bits);
|
||||
self.write_stream.write_all(&self.buffer[..written_len])?;
|
||||
self.block.clear();
|
||||
if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 {
|
||||
self.long_skips.push(self.cumulated_num_bits);
|
||||
self.long_skips.push(self.write_stream.written_bytes());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
249
src/postings/block_search.rs
Normal file
249
src/postings/block_search.rs
Normal file
@@ -0,0 +1,249 @@
|
||||
use postings::compression::AlignedBuffer;
|
||||
|
||||
/// This modules define the logic used to search for a doc in a given
|
||||
/// block. (at most 128 docs)
|
||||
///
|
||||
/// Searching within a block is a hotspot when running intersection.
|
||||
/// so it was worth defining it in its own module.
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
mod sse2 {
|
||||
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
|
||||
use std::arch::x86_64::__m128i as DataType;
|
||||
use std::arch::x86_64::_mm_add_epi32 as op_add;
|
||||
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
|
||||
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
|
||||
use std::arch::x86_64::_mm_set1_epi32 as set1;
|
||||
use std::arch::x86_64::_mm_setzero_si128 as set0;
|
||||
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
|
||||
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
|
||||
|
||||
const MASK1: i32 = 78;
|
||||
const MASK2: i32 = 177;
|
||||
|
||||
/// Performs an exhaustive linear search over the
|
||||
///
|
||||
/// There is no early exit here. We simply count the
|
||||
/// number of elements that are `< target`.
|
||||
pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
|
||||
unsafe {
|
||||
let ptr = arr as *const AlignedBuffer as *const DataType;
|
||||
let vkey = set1(target as i32);
|
||||
let mut cnt = set0();
|
||||
// We work over 4 `__m128i` at a time.
|
||||
// A single `__m128i` actual contains 4 `u32`.
|
||||
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
|
||||
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
|
||||
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
|
||||
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
|
||||
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
|
||||
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
|
||||
cnt = op_sub(cnt, sum);
|
||||
}
|
||||
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
|
||||
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
|
||||
_mm_cvtsi128_si32(cnt) as usize
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::linear_search_sse2_128;
|
||||
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
#[test]
|
||||
fn test_linear_search_sse2_128_u32() {
|
||||
let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
|
||||
for el in 0u32..128u32 {
|
||||
block[el as usize] = el * 2 + 1 << 18;
|
||||
}
|
||||
let target = block[64] + 1;
|
||||
assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This `linear search` browser exhaustively through the array.
|
||||
/// but the early exit is very difficult to predict.
|
||||
///
|
||||
/// Coupled with `exponential search` this function is likely
|
||||
/// to be called with the same `len`
|
||||
fn linear_search(arr: &[u32], target: u32) -> usize {
|
||||
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
|
||||
}
|
||||
|
||||
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
|
||||
let end = arr.len();
|
||||
let mut begin = 0;
|
||||
for &pivot in &[1, 3, 7, 15, 31, 63] {
|
||||
if pivot >= end {
|
||||
break;
|
||||
}
|
||||
if arr[pivot] > target {
|
||||
return (begin, pivot);
|
||||
}
|
||||
begin = pivot;
|
||||
}
|
||||
(begin, end)
|
||||
}
|
||||
|
||||
fn galloping(block_docs: &[u32], target: u32) -> usize {
|
||||
let (start, end) = exponential_search(&block_docs, target);
|
||||
start + linear_search(&block_docs[start..end], target)
|
||||
}
|
||||
|
||||
/// Tantivy may rely on SIMD instructions to search for a specific document within
|
||||
/// a given block.
|
||||
#[derive(Clone, Copy, PartialEq)]
|
||||
pub enum BlockSearcher {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
SSE2,
|
||||
Scalar,
|
||||
}
|
||||
|
||||
impl BlockSearcher {
|
||||
/// Search the first index containing an element greater or equal to
|
||||
/// the target.
|
||||
///
|
||||
/// The results should be equivalent to
|
||||
/// ```ignore
|
||||
/// block[..]
|
||||
// .iter()
|
||||
// .take_while(|&&val| val < target)
|
||||
// .count()
|
||||
/// ```
|
||||
///
|
||||
/// The `start` argument is just used to hint that the response is
|
||||
/// greater than beyond `start`. The implementation may or may not use
|
||||
/// it for optimization.
|
||||
///
|
||||
/// # Assumption
|
||||
///
|
||||
/// The array len is > start.
|
||||
/// The block is sorted
|
||||
/// The target is assumed greater or equal to the `arr[start]`.
|
||||
/// The target is assumed smaller or equal to the last element of the block.
|
||||
///
|
||||
/// Currently the scalar implementation starts by an exponential search, and
|
||||
/// then operates a linear search in the result subarray.
|
||||
///
|
||||
/// If SSE2 instructions are available in the `(platform, running CPU)`,
|
||||
/// then we use a different implementation that does an exhaustive linear search over
|
||||
/// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
|
||||
/// of branch.
|
||||
pub(crate) fn search_in_block(
|
||||
self,
|
||||
block_docs: &AlignedBuffer,
|
||||
len: usize,
|
||||
start: usize,
|
||||
target: u32,
|
||||
) -> usize {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
use postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
|
||||
return sse2::linear_search_sse2_128(block_docs, target);
|
||||
}
|
||||
}
|
||||
start + galloping(&block_docs.0[start..len], target)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BlockSearcher {
|
||||
fn default() -> BlockSearcher {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if is_x86_feature_detected!("sse2") {
|
||||
return BlockSearcher::SSE2;
|
||||
}
|
||||
}
|
||||
BlockSearcher::Scalar
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::exponential_search;
|
||||
use super::linear_search;
|
||||
use super::BlockSearcher;
|
||||
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
|
||||
|
||||
#[test]
|
||||
fn test_linear_search() {
|
||||
let len: usize = 50;
|
||||
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
|
||||
for target in 1..*arr.last().unwrap() {
|
||||
let res = linear_search(&arr[..], target);
|
||||
if res > 0 {
|
||||
assert!(arr[res - 1] < target);
|
||||
}
|
||||
if res < len {
|
||||
assert!(arr[res] >= target);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exponentiel_search() {
|
||||
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
|
||||
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
|
||||
assert_eq!(
|
||||
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
|
||||
(3, 7)
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
|
||||
let cursor = search_in_block_trivial_but_slow(block, target);
|
||||
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
|
||||
let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
|
||||
output_buffer[..block.len()].copy_from_slice(block);
|
||||
for i in 0..cursor {
|
||||
assert_eq!(
|
||||
block_searcher.search_in_block(
|
||||
&AlignedBuffer(output_buffer),
|
||||
block.len(),
|
||||
i,
|
||||
target
|
||||
),
|
||||
cursor
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
|
||||
use std::collections::HashSet;
|
||||
let mut targets = HashSet::new();
|
||||
for (i, val) in block.iter().cloned().enumerate() {
|
||||
if i > 0 {
|
||||
targets.insert(val - 1);
|
||||
}
|
||||
targets.insert(val);
|
||||
}
|
||||
for target in targets {
|
||||
util_test_search_in_block(block_searcher, block, target);
|
||||
}
|
||||
}
|
||||
|
||||
fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
|
||||
block.iter().take_while(|&&val| val < target).count()
|
||||
}
|
||||
|
||||
fn test_search_in_block_util(block_searcher: BlockSearcher) {
|
||||
for len in 1u32..128u32 {
|
||||
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
|
||||
util_test_search_in_block_all(block_searcher, &v[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_search_in_block_scalar() {
|
||||
test_search_in_block_util(BlockSearcher::Scalar);
|
||||
}
|
||||
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
#[test]
|
||||
fn test_search_in_block_sse2() {
|
||||
test_search_in_block_util(BlockSearcher::SSE2);
|
||||
}
|
||||
}
|
||||
@@ -43,9 +43,14 @@ impl BlockEncoder {
|
||||
}
|
||||
}
|
||||
|
||||
/// We ensure that the OutputBuffer is align on 128 bits
|
||||
/// in order to run SSE2 linear search on it.
|
||||
#[repr(align(128))]
|
||||
pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
|
||||
|
||||
pub struct BlockDecoder {
|
||||
bitpacker: BitPacker4x,
|
||||
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
|
||||
output: AlignedBuffer,
|
||||
pub output_len: usize,
|
||||
}
|
||||
|
||||
@@ -55,11 +60,9 @@ impl BlockDecoder {
|
||||
}
|
||||
|
||||
pub fn with_val(val: u32) -> BlockDecoder {
|
||||
let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
|
||||
output[COMPRESSION_BLOCK_SIZE] = 0u32;
|
||||
BlockDecoder {
|
||||
bitpacker: BitPacker4x::new(),
|
||||
output,
|
||||
output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
|
||||
output_len: 0,
|
||||
}
|
||||
}
|
||||
@@ -72,23 +75,28 @@ impl BlockDecoder {
|
||||
) -> usize {
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
self.bitpacker
|
||||
.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
|
||||
.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
|
||||
}
|
||||
|
||||
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
|
||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||
self.bitpacker
|
||||
.decompress(&compressed_data, &mut self.output, num_bits)
|
||||
.decompress(&compressed_data, &mut self.output.0, num_bits)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output_array(&self) -> &[u32] {
|
||||
&self.output[..self.output_len]
|
||||
&self.output.0[..self.output_len]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
|
||||
(&self.output, self.output_len)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn output(&self, idx: usize) -> u32 {
|
||||
self.output[idx]
|
||||
self.output.0[idx]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,12 +167,12 @@ impl VIntDecoder for BlockDecoder {
|
||||
num_els: usize,
|
||||
) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
|
||||
self.output_len = num_els;
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
Postings module (also called inverted index)
|
||||
*/
|
||||
|
||||
mod block_search;
|
||||
pub(crate) mod compression;
|
||||
/// Postings module
|
||||
///
|
||||
@@ -16,6 +17,8 @@ mod skip;
|
||||
mod stacker;
|
||||
mod term_info;
|
||||
|
||||
pub(crate) use self::block_search::BlockSearcher;
|
||||
|
||||
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
|
||||
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
|
||||
|
||||
@@ -31,7 +34,6 @@ pub(crate) use self::stacker::compute_table_size;
|
||||
pub use common::HasLen;
|
||||
|
||||
pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32;
|
||||
|
||||
pub(crate) type UnorderedTermId = u64;
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))]
|
||||
@@ -53,13 +55,15 @@ pub mod tests {
|
||||
use fieldnorm::FieldNormReader;
|
||||
use indexer::operation::AddOperation;
|
||||
use indexer::SegmentWriter;
|
||||
use merge_policy::NoMergePolicy;
|
||||
use query::Scorer;
|
||||
use rand::rngs::StdRng;
|
||||
use rand::{Rng, SeedableRng};
|
||||
use schema::Field;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Document, Schema, Term, INT_INDEXED, STRING, TEXT};
|
||||
use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
|
||||
use schema::{Field, TextOptions};
|
||||
use schema::{IndexRecordOption, TextFieldIndexing};
|
||||
use std::iter;
|
||||
use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
@@ -101,14 +105,11 @@ pub mod tests {
|
||||
}
|
||||
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
|
||||
let term = Term::from_field_text(title, "abc");
|
||||
|
||||
let mut positions = Vec::new();
|
||||
|
||||
{
|
||||
let mut postings = inverted_index
|
||||
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
|
||||
@@ -161,6 +162,52 @@ pub mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_drop_token_that_are_too_long() {
|
||||
let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
|
||||
let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
|
||||
exceeding_token_text.push_str(" hello");
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_options = TextOptions::default().set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
|
||||
.set_tokenizer("simple_no_truncation"),
|
||||
);
|
||||
let text_field = schema_builder.add_text_field("text", text_options);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
index
|
||||
.tokenizers()
|
||||
.register("simple_no_truncation", SimpleTokenizer);
|
||||
let reader = index.reader().unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
{
|
||||
index_writer.add_document(doc!(text_field=>exceeding_token_text));
|
||||
index_writer.commit().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field);
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
|
||||
assert_eq!(&bytes, b"hello");
|
||||
}
|
||||
{
|
||||
index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
|
||||
index_writer.commit().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(1u32);
|
||||
let inverted_index = segment_reader.inverted_index(text_field);
|
||||
assert_eq!(inverted_index.terms().num_terms(), 1);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
|
||||
assert_eq!(&bytes[..], ok_token_text.as_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_position_and_fieldnorm1() {
|
||||
let mut positions = Vec::new();
|
||||
@@ -293,9 +340,8 @@ pub mod tests {
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut postings = segment_reader
|
||||
.inverted_index(text_field)
|
||||
@@ -317,7 +363,7 @@ pub mod tests {
|
||||
|
||||
let index = {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let value_field = schema_builder.add_u64_field("value", INT_INDEXED);
|
||||
let value_field = schema_builder.add_u64_field("value", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -332,10 +378,9 @@ pub mod tests {
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
index
|
||||
};
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
// check that the basic usage works
|
||||
@@ -403,8 +448,7 @@ pub mod tests {
|
||||
index_writer.delete_term(term_0);
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
// make sure seeking still works
|
||||
@@ -451,12 +495,9 @@ pub mod tests {
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.delete_term(term_1);
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
|
||||
// finally, check that it's empty
|
||||
{
|
||||
@@ -512,7 +553,6 @@ pub mod tests {
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
index
|
||||
};
|
||||
}
|
||||
|
||||
@@ -12,8 +12,8 @@ use std::io;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use termdict::TermOrdinal;
|
||||
use tokenizer::Token;
|
||||
use tokenizer::TokenStream;
|
||||
use tokenizer::{Token, MAX_TOKEN_LEN};
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
@@ -33,9 +33,10 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
|
||||
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
|
||||
}
|
||||
FieldType::U64(_)
|
||||
| FieldType::I64(_)
|
||||
| FieldType::Date(_)
|
||||
| FieldType::HierarchicalFacet => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
|
||||
FieldType::Bytes => {
|
||||
// FieldType::Bytes cannot actually be indexed.
|
||||
// TODO fix during the indexer refactoring described in #276
|
||||
@@ -51,6 +52,31 @@ pub struct MultiFieldPostingsWriter {
|
||||
per_field_postings_writers: Vec<Box<PostingsWriter>>,
|
||||
}
|
||||
|
||||
fn make_field_partition(
|
||||
term_offsets: &[(&[u8], Addr, UnorderedTermId)],
|
||||
) -> Vec<(Field, usize, usize)> {
|
||||
let term_offsets_it = term_offsets
|
||||
.iter()
|
||||
.map(|(key, _, _)| Term::wrap(key).field())
|
||||
.enumerate();
|
||||
let mut prev_field = Field(u32::max_value());
|
||||
let mut fields = vec![];
|
||||
let mut offsets = vec![];
|
||||
for (offset, field) in term_offsets_it {
|
||||
if field != prev_field {
|
||||
prev_field = field;
|
||||
fields.push(field);
|
||||
offsets.push(offset);
|
||||
}
|
||||
}
|
||||
offsets.push(term_offsets.len());
|
||||
let mut field_offsets = vec![];
|
||||
for i in 0..fields.len() {
|
||||
field_offsets.push((fields[i], offsets[i], offsets[i + 1]));
|
||||
}
|
||||
field_offsets
|
||||
}
|
||||
|
||||
impl MultiFieldPostingsWriter {
|
||||
/// Create a new `MultiFieldPostingsWriter` given
|
||||
/// a schema and a heap.
|
||||
@@ -96,36 +122,16 @@ impl MultiFieldPostingsWriter {
|
||||
&self,
|
||||
serializer: &mut InvertedIndexSerializer,
|
||||
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
|
||||
.term_index
|
||||
.iter()
|
||||
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
|
||||
.collect();
|
||||
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
|
||||
self.term_index.iter().collect();
|
||||
term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
|
||||
|
||||
let mut offsets: Vec<(Field, usize)> = vec![];
|
||||
let term_offsets_it = term_offsets
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|(key, _, _)| Term::wrap(key).field())
|
||||
.enumerate();
|
||||
|
||||
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
|
||||
HashMap::new();
|
||||
|
||||
let mut prev_field = Field(u32::max_value());
|
||||
for (offset, field) in term_offsets_it {
|
||||
if field != prev_field {
|
||||
offsets.push((field, offset));
|
||||
prev_field = field;
|
||||
}
|
||||
}
|
||||
offsets.push((Field(0), term_offsets.len()));
|
||||
|
||||
for i in 0..(offsets.len() - 1) {
|
||||
let (field, start) = offsets[i];
|
||||
let (_, stop) = offsets[i + 1];
|
||||
let field_offsets = make_field_partition(&term_offsets);
|
||||
|
||||
for (field, start, stop) in field_offsets {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
|
||||
match *field_entry.field_type() {
|
||||
@@ -143,7 +149,7 @@ impl MultiFieldPostingsWriter {
|
||||
.collect();
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
}
|
||||
FieldType::U64(_) | FieldType::I64(_) => {}
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => {}
|
||||
FieldType::Bytes => {}
|
||||
}
|
||||
|
||||
@@ -204,8 +210,18 @@ pub trait PostingsWriter {
|
||||
) -> u32 {
|
||||
let mut term = Term::for_field(field);
|
||||
let mut sink = |token: &Token| {
|
||||
term.set_text(token.text.as_str());
|
||||
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
// We skip all tokens with a len greater than u16.
|
||||
if token.text.len() <= MAX_TOKEN_LEN {
|
||||
term.set_text(token.text.as_str());
|
||||
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
|
||||
} else {
|
||||
info!(
|
||||
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
|
||||
MAX_TOKEN_LEN in the documentation for more information.",
|
||||
token.text.len(),
|
||||
MAX_TOKEN_LEN
|
||||
);
|
||||
}
|
||||
};
|
||||
token_stream.process(&mut sink)
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ use postings::FieldSerializer;
|
||||
use std::io;
|
||||
use DocId;
|
||||
|
||||
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
|
||||
const POSITION_END: u32 = 0;
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -115,7 +114,7 @@ impl Recorder for NothingRecorder {
|
||||
let buffer = buffer_lender.lend_u8();
|
||||
self.stack.read_to_end(heap, buffer);
|
||||
for doc in VInt32Reader::new(&buffer[..]) {
|
||||
serializer.write_doc(doc as u32, 0u32, &EMPTY_ARRAY)?;
|
||||
serializer.write_doc(doc as u32, 0u32, &[][..])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -168,7 +167,7 @@ impl Recorder for TermFrequencyRecorder {
|
||||
let mut u32_it = VInt32Reader::new(&buffer[..]);
|
||||
while let Some(doc) = u32_it.next() {
|
||||
let term_freq = u32_it.next().unwrap_or(self.current_tf);
|
||||
serializer.write_doc(doc as u32, term_freq, &EMPTY_ARRAY)?;
|
||||
serializer.write_doc(doc as u32, term_freq, &[][..])?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -2,22 +2,21 @@ use common::BitSet;
|
||||
use common::HasLen;
|
||||
use common::{BinarySerializable, VInt};
|
||||
use docset::{DocSet, SkipResult};
|
||||
use fst::Streamer;
|
||||
use owned_read::OwnedRead;
|
||||
use positions::PositionReader;
|
||||
use postings::compression::compressed_block_size;
|
||||
use postings::compression::{compressed_block_size, AlignedBuffer};
|
||||
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
|
||||
use postings::serializer::PostingsSerializer;
|
||||
use postings::BlockSearcher;
|
||||
use postings::FreqReadingOption;
|
||||
use postings::Postings;
|
||||
use postings::SkipReader;
|
||||
use postings::USE_SKIP_INFO_LIMIT;
|
||||
use schema::IndexRecordOption;
|
||||
use std::cmp::Ordering;
|
||||
use tantivy_fst::Streamer;
|
||||
use DocId;
|
||||
|
||||
const EMPTY_ARR: [u8; 0] = [];
|
||||
|
||||
struct PositionComputer {
|
||||
// store the amount of position int
|
||||
// before reading positions.
|
||||
@@ -62,6 +61,7 @@ pub struct SegmentPostings {
|
||||
block_cursor: BlockSegmentPostings,
|
||||
cur: usize,
|
||||
position_computer: Option<PositionComputer>,
|
||||
block_searcher: BlockSearcher,
|
||||
}
|
||||
|
||||
impl SegmentPostings {
|
||||
@@ -72,6 +72,7 @@ impl SegmentPostings {
|
||||
block_cursor: empty_block_cursor,
|
||||
cur: COMPRESSION_BLOCK_SIZE,
|
||||
position_computer: None,
|
||||
block_searcher: BlockSearcher::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -119,41 +120,33 @@ impl SegmentPostings {
|
||||
block_cursor: segment_block_postings,
|
||||
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
|
||||
position_computer: positions_stream_opt.map(PositionComputer::new),
|
||||
block_searcher: BlockSearcher::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
|
||||
let end = arr.len();
|
||||
debug_assert!(arr.len() <= 128);
|
||||
debug_assert!(target <= arr[end - 1]);
|
||||
let mut begin = 0;
|
||||
for &pivot in [1,3,7,15,31,63].iter().take_while(|&&el| el < end) {
|
||||
if arr[pivot] > target {
|
||||
return (begin, pivot);
|
||||
}
|
||||
begin = pivot;
|
||||
}
|
||||
(begin, end)
|
||||
}
|
||||
|
||||
/// Search the first index containing an element greater or equal to the target.
|
||||
///
|
||||
/// # Assumption
|
||||
///
|
||||
/// The array is assumed non empty.
|
||||
/// The target is assumed greater or equal to the first element.
|
||||
/// The target is assumed smaller or equal to the last element.
|
||||
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
|
||||
let (start, end) = exponential_search(target, block_docs);
|
||||
start.wrapping_add(
|
||||
block_docs[start..end]
|
||||
.binary_search(&target)
|
||||
.unwrap_or_else(|e| e),
|
||||
)
|
||||
}
|
||||
|
||||
impl DocSet for SegmentPostings {
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
|
||||
let term_freq = self.term_freq() as usize;
|
||||
if let Some(position_computer) = self.position_computer.as_mut() {
|
||||
position_computer.add_skip(term_freq);
|
||||
}
|
||||
}
|
||||
self.cur += 1;
|
||||
if self.cur >= self.block_cursor.block_len() {
|
||||
self.cur = 0;
|
||||
if !self.block_cursor.advance() {
|
||||
self.cur = COMPRESSION_BLOCK_SIZE;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
if !self.advance() {
|
||||
return SkipResult::End;
|
||||
@@ -176,7 +169,6 @@ impl DocSet for SegmentPostings {
|
||||
|
||||
// skip blocks until one that might contain the target
|
||||
// check if we need to go to the next block
|
||||
let need_positions = self.position_computer.is_some();
|
||||
let mut sum_freqs_skipped: u32 = 0;
|
||||
if !self
|
||||
.block_cursor
|
||||
@@ -190,7 +182,7 @@ impl DocSet for SegmentPostings {
|
||||
// we are not in the right block.
|
||||
//
|
||||
// First compute all of the freqs skipped from the current block.
|
||||
if need_positions {
|
||||
if self.position_computer.is_some() {
|
||||
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
|
||||
match self.block_cursor.skip_to(target) {
|
||||
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
|
||||
@@ -209,25 +201,21 @@ impl DocSet for SegmentPostings {
|
||||
self.cur = 0;
|
||||
}
|
||||
|
||||
// we're in the right block now, start with an exponential search
|
||||
let block_docs = self.block_cursor.docs();
|
||||
let new_cur = self
|
||||
.cur
|
||||
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
|
||||
let cur = self.cur;
|
||||
|
||||
if need_positions {
|
||||
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
|
||||
.iter()
|
||||
.sum::<u32>();
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.add_skip(sum_freqs_skipped as usize);
|
||||
// we're in the right block now, start with an exponential search
|
||||
let (output, len) = self.block_cursor.docs_aligned();
|
||||
let new_cur = self
|
||||
.block_searcher
|
||||
.search_in_block(&output, len, cur, target);
|
||||
if let Some(position_computer) = self.position_computer.as_mut() {
|
||||
sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
|
||||
position_computer.add_skip(sum_freqs_skipped as usize);
|
||||
}
|
||||
self.cur = new_cur;
|
||||
|
||||
// `doc` is now the first element >= `target`
|
||||
let doc = block_docs[new_cur];
|
||||
let doc = output.0[new_cur];
|
||||
debug_assert!(doc >= target);
|
||||
if doc == target {
|
||||
SkipResult::Reached
|
||||
@@ -236,40 +224,25 @@ impl DocSet for SegmentPostings {
|
||||
}
|
||||
}
|
||||
|
||||
// goes to the next element.
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.position_computer.is_some() {
|
||||
let term_freq = self.term_freq() as usize;
|
||||
self.position_computer.as_mut().unwrap().add_skip(term_freq);
|
||||
}
|
||||
self.cur += 1;
|
||||
if self.cur >= self.block_cursor.block_len() {
|
||||
self.cur = 0;
|
||||
if !self.block_cursor.advance() {
|
||||
self.cur = COMPRESSION_BLOCK_SIZE;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.len() as u32
|
||||
}
|
||||
|
||||
/// Return the current document's `DocId`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Will panics if called without having called advance before.
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
let docs = self.block_cursor.docs();
|
||||
debug_assert!(
|
||||
self.cur < docs.len(),
|
||||
"Have you forgotten to call `.advance()` at least once before calling .doc()."
|
||||
"Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
|
||||
);
|
||||
docs[self.cur]
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.len() as u32
|
||||
}
|
||||
|
||||
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
|
||||
// finish the current block
|
||||
if self.advance() {
|
||||
@@ -293,17 +266,33 @@ impl HasLen for SegmentPostings {
|
||||
}
|
||||
|
||||
impl Postings for SegmentPostings {
|
||||
/// Returns the frequency associated to the current document.
|
||||
/// If the schema is set up so that no frequency have been encoded,
|
||||
/// this method should always return 1.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Will panics if called without having called advance before.
|
||||
fn term_freq(&self) -> u32 {
|
||||
debug_assert!(
|
||||
// Here we do not use the len of `freqs()`
|
||||
// because it is actually ok to request for the freq of doc
|
||||
// even if no frequency were encoded for the field.
|
||||
//
|
||||
// In that case we hit the block just as if the frequency had been
|
||||
// decoded. The block is simply prefilled by the value 1.
|
||||
self.cur < COMPRESSION_BLOCK_SIZE,
|
||||
"Have you forgotten to call `.advance()` at least once before calling \
|
||||
`.term_freq()`."
|
||||
);
|
||||
self.block_cursor.freq(self.cur)
|
||||
}
|
||||
|
||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
if self.position_computer.is_some() {
|
||||
output.resize(self.term_freq() as usize, 0u32);
|
||||
self.position_computer
|
||||
.as_mut()
|
||||
.unwrap()
|
||||
.positions_with_offset(offset, &mut output[..])
|
||||
let term_freq = self.term_freq() as usize;
|
||||
if let Some(position_comp) = self.position_computer.as_mut() {
|
||||
output.resize(term_freq, 0u32);
|
||||
position_comp.positions_with_offset(offset, &mut output[..]);
|
||||
} else {
|
||||
output.clear();
|
||||
}
|
||||
@@ -368,7 +357,7 @@ impl BlockSegmentPostings {
|
||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
|
||||
let skip_reader = match skip_data_opt {
|
||||
Some(skip_data) => SkipReader::new(skip_data, record_option),
|
||||
None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option),
|
||||
None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
|
||||
};
|
||||
let doc_freq = doc_freq as usize;
|
||||
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
|
||||
@@ -402,7 +391,7 @@ impl BlockSegmentPostings {
|
||||
if let Some(skip_data) = skip_data_opt {
|
||||
self.skip_reader.reset(skip_data);
|
||||
} else {
|
||||
self.skip_reader.reset(OwnedRead::new(&EMPTY_ARR[..]))
|
||||
self.skip_reader.reset(OwnedRead::new(&[][..]))
|
||||
}
|
||||
self.doc_offset = 0;
|
||||
self.doc_freq = doc_freq as usize;
|
||||
@@ -425,6 +414,10 @@ impl BlockSegmentPostings {
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
|
||||
self.doc_decoder.output_aligned()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
@@ -615,20 +608,18 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::exponential_search;
|
||||
use super::search_within_block;
|
||||
use super::BlockSegmentPostings;
|
||||
use super::BlockSegmentPostingsSkipResult;
|
||||
use super::SegmentPostings;
|
||||
use common::HasLen;
|
||||
use core::Index;
|
||||
use docset::DocSet;
|
||||
use fst::Streamer;
|
||||
use postings::postings::Postings;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Schema;
|
||||
use schema::Term;
|
||||
use schema::INT_INDEXED;
|
||||
use schema::INDEXED;
|
||||
use tantivy_fst::Streamer;
|
||||
use DocId;
|
||||
use SkipResult;
|
||||
|
||||
@@ -640,6 +631,18 @@ mod tests {
|
||||
assert_eq!(postings.len(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
|
||||
fn test_panic_if_doc_called_before_advance() {
|
||||
SegmentPostings::empty().doc();
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
|
||||
fn test_panic_if_freq_called_before_advance() {
|
||||
SegmentPostings::empty().term_freq();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_block_segment_postings() {
|
||||
let mut postings = BlockSegmentPostings::empty();
|
||||
@@ -647,56 +650,6 @@ mod tests {
|
||||
assert_eq!(postings.doc_freq(), 0);
|
||||
}
|
||||
|
||||
fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
|
||||
block
|
||||
.iter()
|
||||
.cloned()
|
||||
.enumerate()
|
||||
.filter(|&(_, ref val)| *val >= target)
|
||||
.next()
|
||||
.unwrap()
|
||||
.0
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_exponentiel_search() {
|
||||
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
|
||||
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
|
||||
assert_eq!(
|
||||
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
|
||||
(3, 7)
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_within_block(block: &[u32], target: u32) {
|
||||
assert_eq!(
|
||||
search_within_block(block, target),
|
||||
search_within_block_trivial_but_slow(block, target)
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_within_block_all(block: &[u32]) {
|
||||
use std::collections::HashSet;
|
||||
let mut targets = HashSet::new();
|
||||
for (i, val) in block.iter().cloned().enumerate() {
|
||||
if i > 0 {
|
||||
targets.insert(val - 1);
|
||||
}
|
||||
targets.insert(val);
|
||||
}
|
||||
for target in targets {
|
||||
util_test_search_within_block(block, target);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_search_within_block() {
|
||||
for len in 1u32..128u32 {
|
||||
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
|
||||
util_test_search_within_block_all(&v[..]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings() {
|
||||
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
|
||||
@@ -745,7 +698,7 @@ mod tests {
|
||||
|
||||
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let int_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
@@ -758,8 +711,7 @@ mod tests {
|
||||
last_doc = doc + 1;
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let inverted_index = segment_reader.inverted_index(int_field);
|
||||
let term = Term::from_field_u64(int_field, 0u64);
|
||||
@@ -816,7 +768,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_reset_block_segment_postings() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||
let int_field = schema_builder.add_u64_field("id", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
@@ -827,8 +779,7 @@ mod tests {
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
|
||||
let mut block_segments;
|
||||
|
||||
@@ -14,7 +14,7 @@ use termdict::{TermDictionaryBuilder, TermOrdinal};
|
||||
use DocId;
|
||||
use Result;
|
||||
|
||||
/// `PostingsSerializer` is in charge of serializing
|
||||
/// `InvertedIndexSerializer` is in charge of serializing
|
||||
/// postings on disk, in the
|
||||
/// * `.idx` (inverted index)
|
||||
/// * `.pos` (positions file)
|
||||
@@ -54,7 +54,7 @@ pub struct InvertedIndexSerializer {
|
||||
}
|
||||
|
||||
impl InvertedIndexSerializer {
|
||||
/// Open a new `PostingsSerializer` for the given segment
|
||||
/// Open a new `InvertedIndexSerializer` for the given segment
|
||||
fn create(
|
||||
terms_write: CompositeWrite<WritePtr>,
|
||||
postings_write: CompositeWrite<WritePtr>,
|
||||
@@ -175,7 +175,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
let positions_idx = self
|
||||
.positions_serializer_opt
|
||||
.as_ref()
|
||||
.map(|positions_serializer| positions_serializer.positions_idx())
|
||||
.map(PositionSerializer::positions_idx)
|
||||
.unwrap_or(0u64);
|
||||
TermInfo {
|
||||
doc_freq: 0,
|
||||
|
||||
@@ -5,12 +5,11 @@ use self::murmurhash32::murmurhash2;
|
||||
use super::{Addr, MemoryArena};
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
use postings::stacker::memory_arena::store;
|
||||
use postings::UnorderedTermId;
|
||||
use std::iter;
|
||||
use std::mem;
|
||||
use std::slice;
|
||||
|
||||
pub type BucketId = usize;
|
||||
|
||||
/// Returns the actual memory size in bytes
|
||||
/// required to create a table of size $2^num_bits$.
|
||||
pub fn compute_table_size(num_bits: usize) -> usize {
|
||||
@@ -28,6 +27,7 @@ pub fn compute_table_size(num_bits: usize) -> usize {
|
||||
struct KeyValue {
|
||||
key_value_addr: Addr,
|
||||
hash: u32,
|
||||
unordered_term_id: UnorderedTermId,
|
||||
}
|
||||
|
||||
impl Default for KeyValue {
|
||||
@@ -35,6 +35,7 @@ impl Default for KeyValue {
|
||||
KeyValue {
|
||||
key_value_addr: Addr::null_pointer(),
|
||||
hash: 0u32,
|
||||
unordered_term_id: UnorderedTermId::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -59,6 +60,7 @@ pub struct TermHashMap {
|
||||
pub heap: MemoryArena,
|
||||
mask: usize,
|
||||
occupied: Vec<usize>,
|
||||
len: usize,
|
||||
}
|
||||
|
||||
struct QuadraticProbing {
|
||||
@@ -85,13 +87,13 @@ pub struct Iter<'a> {
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = (&'a [u8], Addr, BucketId);
|
||||
type Item = (&'a [u8], Addr, UnorderedTermId);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.inner.next().cloned().map(move |bucket: usize| {
|
||||
let kv = self.hashmap.table[bucket];
|
||||
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
|
||||
(key, offset, bucket as BucketId)
|
||||
(key, offset, kv.unordered_term_id)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -106,6 +108,7 @@ impl TermHashMap {
|
||||
heap,
|
||||
mask: table_size - 1,
|
||||
occupied: Vec::with_capacity(table_size / 2),
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -139,12 +142,16 @@ impl TermHashMap {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
|
||||
fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) -> UnorderedTermId {
|
||||
self.occupied.push(bucket);
|
||||
let unordered_term_id = self.len as UnorderedTermId;
|
||||
self.len += 1;
|
||||
self.table[bucket] = KeyValue {
|
||||
key_value_addr,
|
||||
hash,
|
||||
unordered_term_id,
|
||||
};
|
||||
unordered_term_id
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter {
|
||||
@@ -184,7 +191,11 @@ impl TermHashMap {
|
||||
/// will be in charge of returning a default value.
|
||||
/// If the key already as an associated value, then it will be passed
|
||||
/// `Some(previous_value)`.
|
||||
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
|
||||
pub fn mutate_or_create<S, V, TMutator>(
|
||||
&mut self,
|
||||
key: S,
|
||||
mut updater: TMutator,
|
||||
) -> UnorderedTermId
|
||||
where
|
||||
S: AsRef<[u8]>,
|
||||
V: Copy + 'static,
|
||||
@@ -200,6 +211,7 @@ impl TermHashMap {
|
||||
let bucket = probe.next_probe();
|
||||
let kv: KeyValue = self.table[bucket];
|
||||
if kv.is_empty() {
|
||||
// The key does not exists yet.
|
||||
let val = updater(None);
|
||||
let num_bytes =
|
||||
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
|
||||
@@ -211,8 +223,7 @@ impl TermHashMap {
|
||||
data[2..stop].copy_from_slice(key_bytes);
|
||||
store(&mut data[stop..], val);
|
||||
}
|
||||
self.set_bucket(hash, key_addr, bucket);
|
||||
return bucket as BucketId;
|
||||
return self.set_bucket(hash, key_addr, bucket);
|
||||
} else if kv.hash == hash {
|
||||
if let Some(val_addr) =
|
||||
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
|
||||
@@ -220,7 +231,7 @@ impl TermHashMap {
|
||||
let v = self.heap.read(val_addr);
|
||||
let new_v = updater(Some(v));
|
||||
self.heap.write_at(val_addr, new_v);
|
||||
return bucket as BucketId;
|
||||
return kv.unordered_term_id;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use core::Searcher;
|
||||
use core::SegmentReader;
|
||||
use docset::DocSet;
|
||||
use query::{Query, Scorer, Weight};
|
||||
use query::explanation::does_not_match;
|
||||
use query::{Explanation, Query, Scorer, Weight};
|
||||
use DocId;
|
||||
use Result;
|
||||
use Score;
|
||||
@@ -29,6 +30,13 @@ impl Weight for AllWeight {
|
||||
max_doc: reader.max_doc(),
|
||||
}))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
if doc >= reader.max_doc() {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
Ok(Explanation::new("AllQuery", 1f32))
|
||||
}
|
||||
}
|
||||
|
||||
enum State {
|
||||
@@ -101,8 +109,9 @@ mod tests {
|
||||
index_writer.commit().unwrap();
|
||||
index_writer.add_document(doc!(field=>"ccc"));
|
||||
index_writer.commit().unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
reader.reload().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let weight = AllQuery.weight(&searcher, false).unwrap();
|
||||
{
|
||||
let reader = searcher.segment_reader(0);
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
use common::BitSet;
|
||||
use core::SegmentReader;
|
||||
use fst::Automaton;
|
||||
use query::BitSetDocSet;
|
||||
use query::ConstScorer;
|
||||
use query::{BitSetDocSet, Explanation};
|
||||
use query::{Scorer, Weight};
|
||||
use schema::{Field, IndexRecordOption};
|
||||
use tantivy_fst::Automaton;
|
||||
use termdict::{TermDictionary, TermStreamer};
|
||||
use Result;
|
||||
use DocId;
|
||||
use TantivyError;
|
||||
use {Result, SkipResult};
|
||||
|
||||
/// A weight struct for Fuzzy Term and Regex Queries
|
||||
pub struct AutomatonWeight<A>
|
||||
@@ -56,4 +58,15 @@ where
|
||||
let doc_bitset = BitSetDocSet::from(doc_bitset);
|
||||
Ok(Box::new(ConstScorer::new(doc_bitset)))
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader)?;
|
||||
if scorer.skip_next(doc) == SkipResult::Reached {
|
||||
Ok(Explanation::new("AutomatonScorer", 1.0f32))
|
||||
} else {
|
||||
Err(TantivyError::InvalidArgument(
|
||||
"Document does not exist".to_string(),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use fieldnorm::FieldNormReader;
|
||||
use query::Explanation;
|
||||
use Score;
|
||||
use Searcher;
|
||||
use Term;
|
||||
@@ -26,18 +27,13 @@ fn compute_tf_cache(average_fieldnorm: f32) -> [f32; 256] {
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BM25Weight {
|
||||
idf_explain: Explanation,
|
||||
weight: f32,
|
||||
cache: [f32; 256],
|
||||
average_fieldnorm: f32,
|
||||
}
|
||||
|
||||
impl BM25Weight {
|
||||
pub fn null() -> BM25Weight {
|
||||
BM25Weight {
|
||||
weight: 0f32,
|
||||
cache: [1f32; 256],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn for_terms(searcher: &Searcher, terms: &[Term]) -> BM25Weight {
|
||||
assert!(!terms.is_empty(), "BM25 requires at least one term");
|
||||
let field = terms[0].field();
|
||||
@@ -58,20 +54,37 @@ impl BM25Weight {
|
||||
}
|
||||
let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32;
|
||||
|
||||
let idf = terms
|
||||
.iter()
|
||||
.map(|term| {
|
||||
let term_doc_freq = searcher.doc_freq(term);
|
||||
idf(term_doc_freq, total_num_docs)
|
||||
})
|
||||
.sum::<f32>();
|
||||
BM25Weight::new(idf, average_fieldnorm)
|
||||
let mut idf_explain: Explanation;
|
||||
if terms.len() == 1 {
|
||||
let term_doc_freq = searcher.doc_freq(&terms[0]);
|
||||
let idf = idf(term_doc_freq, total_num_docs);
|
||||
idf_explain =
|
||||
Explanation::new("idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))", idf);
|
||||
idf_explain.add_const(
|
||||
"n, number of docs containing this term",
|
||||
term_doc_freq as f32,
|
||||
);
|
||||
idf_explain.add_const("N, total number of docs", total_num_docs as f32);
|
||||
} else {
|
||||
let idf = terms
|
||||
.iter()
|
||||
.map(|term| {
|
||||
let term_doc_freq = searcher.doc_freq(term);
|
||||
idf(term_doc_freq, total_num_docs)
|
||||
})
|
||||
.sum::<f32>();
|
||||
idf_explain = Explanation::new("idf", idf);
|
||||
}
|
||||
BM25Weight::new(idf_explain, average_fieldnorm)
|
||||
}
|
||||
|
||||
fn new(idf: f32, average_fieldnorm: f32) -> BM25Weight {
|
||||
fn new(idf_explain: Explanation, average_fieldnorm: f32) -> BM25Weight {
|
||||
let weight = idf_explain.value() * (1f32 + K1);
|
||||
BM25Weight {
|
||||
weight: idf * (1f32 + K1),
|
||||
idf_explain,
|
||||
weight,
|
||||
cache: compute_tf_cache(average_fieldnorm),
|
||||
average_fieldnorm,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,6 +94,37 @@ impl BM25Weight {
|
||||
let term_freq = term_freq as f32;
|
||||
self.weight * term_freq / (term_freq + norm)
|
||||
}
|
||||
|
||||
pub fn explain(&self, fieldnorm_id: u8, term_freq: u32) -> Explanation {
|
||||
// The explain format is directly copied from Lucene's.
|
||||
// (So, Kudos to Lucene)
|
||||
|
||||
let score = self.score(fieldnorm_id, term_freq);
|
||||
|
||||
let norm = self.cache[fieldnorm_id as usize];
|
||||
let term_freq = term_freq as f32;
|
||||
let right_factor = term_freq / (term_freq + norm);
|
||||
|
||||
let mut tf_explanation = Explanation::new(
|
||||
"freq / (freq + k1 * (1 - b + b * dl / avgdl))",
|
||||
right_factor,
|
||||
);
|
||||
|
||||
tf_explanation.add_const("freq, occurrences of term within document", term_freq);
|
||||
tf_explanation.add_const("k1, term saturation parameter", K1);
|
||||
tf_explanation.add_const("b, length normalization parameter", B);
|
||||
tf_explanation.add_const(
|
||||
"dl, length of field",
|
||||
FieldNormReader::id_to_fieldnorm(fieldnorm_id) as f32,
|
||||
);
|
||||
tf_explanation.add_const("avgdl, average length of field", self.average_fieldnorm);
|
||||
|
||||
let mut explanation = Explanation::new("TermQuery, product of...", score);
|
||||
explanation.add_detail(Explanation::new("(K1+1)", K1 + 1f32));
|
||||
explanation.add_detail(self.idf_explain.clone());
|
||||
explanation.add_detail(tf_explanation);
|
||||
explanation
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use core::SegmentReader;
|
||||
use downcast::Downcast;
|
||||
use query::intersect_scorers;
|
||||
use query::explanation::does_not_match;
|
||||
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
|
||||
use query::term_query::TermScorer;
|
||||
use query::EmptyScorer;
|
||||
@@ -10,9 +9,10 @@ use query::RequiredOptionalScorer;
|
||||
use query::Scorer;
|
||||
use query::Union;
|
||||
use query::Weight;
|
||||
use std::borrow::Borrow;
|
||||
use query::{intersect_scorers, Explanation};
|
||||
use std::collections::HashMap;
|
||||
use Result;
|
||||
use {DocId, SkipResult};
|
||||
|
||||
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<Scorer>>) -> Box<Scorer>
|
||||
where
|
||||
@@ -24,14 +24,11 @@ where
|
||||
}
|
||||
|
||||
{
|
||||
let is_all_term_queries = scorers.iter().all(|scorer| {
|
||||
let scorer_ref: &Scorer = scorer.borrow();
|
||||
Downcast::<TermScorer>::is_type(scorer_ref)
|
||||
});
|
||||
let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::<TermScorer>());
|
||||
if is_all_term_queries {
|
||||
let scorers: Vec<TermScorer> = scorers
|
||||
.into_iter()
|
||||
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
|
||||
.map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
|
||||
.collect();
|
||||
let scorer: Box<Scorer> = Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
|
||||
return scorer;
|
||||
@@ -55,10 +52,10 @@ impl BooleanWeight {
|
||||
}
|
||||
}
|
||||
|
||||
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
fn per_occur_scorers(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Box<Scorer>> {
|
||||
) -> Result<HashMap<Occur, Vec<Box<Scorer>>>> {
|
||||
let mut per_occur_scorers: HashMap<Occur, Vec<Box<Scorer>>> = HashMap::new();
|
||||
for &(ref occur, ref subweight) in &self.weights {
|
||||
let sub_scorer: Box<Scorer> = subweight.scorer(reader)?;
|
||||
@@ -67,6 +64,14 @@ impl BooleanWeight {
|
||||
.or_insert_with(Vec::new)
|
||||
.push(sub_scorer);
|
||||
}
|
||||
Ok(per_occur_scorers)
|
||||
}
|
||||
|
||||
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Box<Scorer>> {
|
||||
let mut per_occur_scorers = self.per_occur_scorers(reader)?;
|
||||
|
||||
let should_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
|
||||
.remove(&Occur::Should)
|
||||
@@ -123,4 +128,31 @@ impl Weight for BooleanWeight {
|
||||
self.complex_scorer::<DoNothingCombiner>(reader)
|
||||
}
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let mut scorer = self.scorer(reader)?;
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
if !self.scoring_enabled {
|
||||
return Ok(Explanation::new("BooleanQuery with no scoring", 1f32));
|
||||
}
|
||||
|
||||
let mut explanation = Explanation::new("BooleanClause. Sum of ...", scorer.score());
|
||||
for &(ref occur, ref subweight) in &self.weights {
|
||||
if is_positive_occur(*occur) {
|
||||
if let Ok(child_explanation) = subweight.explain(reader, doc) {
|
||||
explanation.add_detail(child_explanation);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(explanation)
|
||||
}
|
||||
}
|
||||
|
||||
fn is_positive_occur(occur: Occur) -> bool {
|
||||
match occur {
|
||||
Occur::Must | Occur::Should => true,
|
||||
Occur::MustNot => false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use collector::tests::TestCollector;
|
||||
use downcast::Downcast;
|
||||
use query::score_combiner::SumWithCoordsCombiner;
|
||||
use query::term_query::TermScorer;
|
||||
use query::Intersection;
|
||||
@@ -19,8 +18,8 @@ mod tests {
|
||||
use query::Scorer;
|
||||
use query::TermQuery;
|
||||
use schema::*;
|
||||
use DocId;
|
||||
use Index;
|
||||
use {DocAddress, DocId};
|
||||
|
||||
fn aux_test_helper() -> (Index, Field) {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -52,7 +51,6 @@ mod tests {
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
(index, text_field)
|
||||
}
|
||||
|
||||
@@ -61,7 +59,8 @@ mod tests {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
let query = query_parser.parse_query("(+a +b) d").unwrap();
|
||||
assert_eq!(query.count(&*index.searcher()).unwrap(), 3);
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
assert_eq!(query.count(&searcher).unwrap(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -69,28 +68,28 @@ mod tests {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
let query = query_parser.parse_query("+a").unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let weight = query.weight(&searcher, true).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<TermScorer>::is_type(&*scorer));
|
||||
assert!(scorer.is::<TermScorer>());
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_boolean_termonly_intersection() {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
{
|
||||
let query = query_parser.parse_query("+a +b +c").unwrap();
|
||||
let weight = query.weight(&searcher, true).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<Intersection<TermScorer>>::is_type(&*scorer));
|
||||
assert!(scorer.is::<Intersection<TermScorer>>());
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("+a +(b c)").unwrap();
|
||||
let weight = query.weight(&searcher, true).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<Intersection<Box<Scorer>>>::is_type(&*scorer));
|
||||
assert!(scorer.is::<Intersection<Box<Scorer>>>());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,21 +97,19 @@ mod tests {
|
||||
pub fn test_boolean_reqopt() {
|
||||
let (index, text_field) = aux_test_helper();
|
||||
let query_parser = QueryParser::for_index(&index, vec![text_field]);
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
{
|
||||
let query = query_parser.parse_query("+a b").unwrap();
|
||||
let weight = query.weight(&searcher, true).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
assert!(Downcast::<
|
||||
RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>,
|
||||
>::is_type(&*scorer));
|
||||
assert!(scorer
|
||||
.is::<RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>>());
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("+a b").unwrap();
|
||||
let weight = query.weight(&searcher, false).unwrap();
|
||||
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
println!("{:?}", scorer.type_name());
|
||||
assert!(Downcast::<TermScorer>::is_type(&*scorer));
|
||||
assert!(scorer.is::<TermScorer>());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,10 +126,13 @@ mod tests {
|
||||
query
|
||||
};
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
let matching_docs = |boolean_query: &Query| {
|
||||
let searcher = index.searcher();
|
||||
let test_docs = searcher.search(boolean_query, &TestCollector).unwrap();
|
||||
test_docs
|
||||
reader
|
||||
.searcher()
|
||||
.search(boolean_query, &TestCollector)
|
||||
.unwrap()
|
||||
.docs()
|
||||
.iter()
|
||||
.cloned()
|
||||
@@ -188,10 +188,12 @@ mod tests {
|
||||
let query: Box<Query> = Box::new(term_query);
|
||||
query
|
||||
};
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let score_docs = |boolean_query: &Query| {
|
||||
let searcher = index.searcher();
|
||||
let fruit = searcher.search(boolean_query, &TestCollector).unwrap();
|
||||
let fruit = reader
|
||||
.searcher()
|
||||
.search(boolean_query, &TestCollector)
|
||||
.unwrap();
|
||||
fruit.scores().to_vec()
|
||||
};
|
||||
|
||||
@@ -203,4 +205,167 @@ mod tests {
|
||||
assert_eq!(score_docs(&boolean_query), vec![0.977973, 0.84699446]);
|
||||
}
|
||||
}
|
||||
|
||||
// motivated by #554
|
||||
#[test]
|
||||
fn test_bm25_several_fields() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let text = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
index_writer.add_document(doc!(
|
||||
// tf = 1 0
|
||||
title => "Законы притяжения Оксана Кулакова",
|
||||
// tf = 1 0
|
||||
text => "Законы притяжения Оксана Кулакова] \n\nТема: Сексуальное искусство, Женственность\nТип товара: Запись вебинара (аудио)\nПродолжительность: 1,5 часа\n\nСсылка на вебинар:\n ",
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
// tf = 1 0
|
||||
title => "Любимые русские пироги (Оксана Путан)",
|
||||
// tf = 2 0
|
||||
text => "http://i95.fastpic.ru/big/2017/0628/9a/615b9c8504d94a3893d7f496ac53539a.jpg \n\nОт издателя\nОксана Путан профессиональный повар, автор кулинарных книг и известный кулинарный блогер. Ее рецепты отличаются практичностью, доступностью и пользуются огромной популярностью в русскоязычном интернете. Это третья книга автора о самом вкусном и ароматном настоящих русских пирогах и выпечке!\nДаже новички на кухне легко готовят по ее рецептам. Оксана описывает процесс приготовления настолько подробно и понятно, что вам остается только наслаждаться готовкой и не тратить время на лишние усилия. Готовьте легко и просто!\n\nhttps://www.ozon.ru/context/detail/id/139872462/"
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
// tf = 1 1
|
||||
title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)",
|
||||
// tf = 0 0
|
||||
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
|
||||
));
|
||||
for _ in 0..1_000 {
|
||||
index_writer.add_document(doc!(
|
||||
title => "a b d e f g",
|
||||
text => "maitre corbeau sur un arbre perche tenait dans son bec un fromage Maitre rnard par lodeur alleche lui tint a peu pres ce langage."
|
||||
));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, text]);
|
||||
let query = query_parser
|
||||
.parse_query("Оксана Лифенко")
|
||||
.unwrap();
|
||||
let weight = query.weight(&searcher, true).unwrap();
|
||||
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
|
||||
scorer.advance();
|
||||
|
||||
let explanation = query.explain(&searcher, DocAddress(0u32, 0u32)).unwrap();
|
||||
assert_eq!(
|
||||
explanation.to_pretty_json(),
|
||||
r#"{
|
||||
"value": 12.997711,
|
||||
"description": "BooleanClause. Sum of ...",
|
||||
"details": [
|
||||
{
|
||||
"value": 12.997711,
|
||||
"description": "BooleanClause. Sum of ...",
|
||||
"details": [
|
||||
{
|
||||
"value": 6.551476,
|
||||
"description": "TermQuery, product of...",
|
||||
"details": [
|
||||
{
|
||||
"value": 2.2,
|
||||
"description": "(K1+1)"
|
||||
},
|
||||
{
|
||||
"value": 5.658984,
|
||||
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
|
||||
"details": [
|
||||
{
|
||||
"value": 3.0,
|
||||
"description": "n, number of docs containing this term"
|
||||
},
|
||||
{
|
||||
"value": 1003.0,
|
||||
"description": "N, total number of docs"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"value": 0.5262329,
|
||||
"description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
|
||||
"details": [
|
||||
{
|
||||
"value": 1.0,
|
||||
"description": "freq, occurrences of term within document"
|
||||
},
|
||||
{
|
||||
"value": 1.2,
|
||||
"description": "k1, term saturation parameter"
|
||||
},
|
||||
{
|
||||
"value": 0.75,
|
||||
"description": "b, length normalization parameter"
|
||||
},
|
||||
{
|
||||
"value": 4.0,
|
||||
"description": "dl, length of field"
|
||||
},
|
||||
{
|
||||
"value": 5.997009,
|
||||
"description": "avgdl, average length of field"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"value": 6.446235,
|
||||
"description": "TermQuery, product of...",
|
||||
"details": [
|
||||
{
|
||||
"value": 2.2,
|
||||
"description": "(K1+1)"
|
||||
},
|
||||
{
|
||||
"value": 5.9954567,
|
||||
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5))",
|
||||
"details": [
|
||||
{
|
||||
"value": 2.0,
|
||||
"description": "n, number of docs containing this term"
|
||||
},
|
||||
{
|
||||
"value": 1003.0,
|
||||
"description": "N, total number of docs"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"value": 0.4887212,
|
||||
"description": "freq / (freq + k1 * (1 - b + b * dl / avgdl))",
|
||||
"details": [
|
||||
{
|
||||
"value": 1.0,
|
||||
"description": "freq, occurrences of term within document"
|
||||
},
|
||||
{
|
||||
"value": 1.2,
|
||||
"description": "k1, term saturation parameter"
|
||||
},
|
||||
{
|
||||
"value": 0.75,
|
||||
"description": "b, length normalization parameter"
|
||||
},
|
||||
{
|
||||
"value": 20.0,
|
||||
"description": "dl, length of field"
|
||||
},
|
||||
{
|
||||
"value": 24.123629,
|
||||
"description": "avgdl, average length of field"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}"#
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use super::Scorer;
|
||||
use query::Query;
|
||||
use query::explanation::does_not_match;
|
||||
use query::Weight;
|
||||
use query::{Explanation, Query};
|
||||
use DocId;
|
||||
use DocSet;
|
||||
use Result;
|
||||
@@ -32,6 +33,10 @@ impl Weight for EmptyWeight {
|
||||
fn scorer(&self, _reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
Ok(Box::new(EmptyScorer))
|
||||
}
|
||||
|
||||
fn explain(&self, _reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
Err(does_not_match(doc))
|
||||
}
|
||||
}
|
||||
|
||||
/// `EmptyScorer` is a dummy `Scorer` in which no document matches.
|
||||
|
||||
51
src/query/explanation.rs
Normal file
51
src/query/explanation.rs
Normal file
@@ -0,0 +1,51 @@
|
||||
use {DocId, TantivyError};
|
||||
|
||||
pub(crate) fn does_not_match(doc: DocId) -> TantivyError {
|
||||
TantivyError::InvalidArgument(format!("Document #({}) does not match", doc))
|
||||
}
|
||||
|
||||
/// Object describing the score of a given document.
|
||||
/// It is organized in trees.
|
||||
///
|
||||
/// `.to_pretty_json()` can be useful to print out a human readable
|
||||
/// representation of this tree when debugging a given score.
|
||||
#[derive(Clone, Serialize)]
|
||||
pub struct Explanation {
|
||||
value: f32,
|
||||
description: String,
|
||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||
details: Vec<Explanation>,
|
||||
}
|
||||
|
||||
impl Explanation {
|
||||
/// Creates a new explanation object.
|
||||
pub fn new<T: ToString>(description: T, value: f32) -> Explanation {
|
||||
Explanation {
|
||||
value,
|
||||
description: description.to_string(),
|
||||
details: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the value associated to the current node.
|
||||
pub fn value(&self) -> f32 {
|
||||
self.value
|
||||
}
|
||||
|
||||
/// Add some detail, explaining some part of the current node formula.
|
||||
///
|
||||
/// Details are treated as child of the current node.
|
||||
pub fn add_detail(&mut self, child_explanation: Explanation) {
|
||||
self.details.push(child_explanation);
|
||||
}
|
||||
|
||||
/// Shortcut for `self.details.push(Explanation::new(name, value));`
|
||||
pub fn add_const<T: ToString>(&mut self, name: T, value: f32) {
|
||||
self.details.push(Explanation::new(name, value));
|
||||
}
|
||||
|
||||
/// Returns an indented json representation of the explanation tree for debug usage.
|
||||
pub fn to_pretty_json(&self) -> String {
|
||||
serde_json::to_string_pretty(self).unwrap()
|
||||
}
|
||||
}
|
||||
@@ -52,9 +52,8 @@ lazy_static! {
|
||||
/// ));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
///
|
||||
/// index.load_searchers()?;
|
||||
/// let searcher = index.searcher();
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// {
|
||||
///
|
||||
@@ -141,8 +140,8 @@ mod test {
|
||||
));
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
{
|
||||
let term = Term::from_field_text(country_field, "japon");
|
||||
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use docset::{DocSet, SkipResult};
|
||||
use downcast::Downcast;
|
||||
use query::term_query::TermScorer;
|
||||
use query::EmptyScorer;
|
||||
use query::Scorer;
|
||||
use std::borrow::Borrow;
|
||||
use DocId;
|
||||
use Score;
|
||||
|
||||
@@ -16,42 +14,35 @@ use Score;
|
||||
/// specialized implementation if the two
|
||||
/// shortest scorers are `TermScorer`s.
|
||||
pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
|
||||
if scorers.is_empty() {
|
||||
return Box::new(EmptyScorer);
|
||||
}
|
||||
if scorers.len() == 1 {
|
||||
return scorers.pop().unwrap();
|
||||
}
|
||||
// We know that we have at least 2 elements.
|
||||
let num_docsets = scorers.len();
|
||||
scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
|
||||
let rarest_opt = scorers.pop();
|
||||
let second_rarest_opt = scorers.pop();
|
||||
let left = scorers.pop().unwrap();
|
||||
let right = scorers.pop().unwrap();
|
||||
scorers.reverse();
|
||||
match (rarest_opt, second_rarest_opt) {
|
||||
(None, None) => Box::new(EmptyScorer),
|
||||
(Some(single_docset), None) => single_docset,
|
||||
(Some(left), Some(right)) => {
|
||||
{
|
||||
let all_term_scorers = [&left, &right].iter().all(|&scorer| {
|
||||
let scorer_ref: &Scorer = <Box<Scorer> as Borrow<Scorer>>::borrow(scorer);
|
||||
Downcast::<TermScorer>::is_type(scorer_ref)
|
||||
});
|
||||
if all_term_scorers {
|
||||
let left = *Downcast::<TermScorer>::downcast(left).unwrap();
|
||||
let right = *Downcast::<TermScorer>::downcast(right).unwrap();
|
||||
return Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docsets,
|
||||
});
|
||||
}
|
||||
}
|
||||
Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docsets,
|
||||
})
|
||||
}
|
||||
_ => {
|
||||
unreachable!();
|
||||
}
|
||||
let all_term_scorers = [&left, &right]
|
||||
.iter()
|
||||
.all(|&scorer| scorer.is::<TermScorer>());
|
||||
if all_term_scorers {
|
||||
return Box::new(Intersection {
|
||||
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
|
||||
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
|
||||
others: scorers,
|
||||
num_docsets,
|
||||
});
|
||||
}
|
||||
Box::new(Intersection {
|
||||
left,
|
||||
right,
|
||||
others: scorers,
|
||||
num_docsets,
|
||||
})
|
||||
}
|
||||
|
||||
/// Creates a `DocSet` that iterator through the intersection of two `DocSet`s.
|
||||
@@ -127,7 +118,6 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
break;
|
||||
@@ -143,35 +133,36 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
}
|
||||
// test the remaining scorers;
|
||||
for (ord, docset) in self.others.iter_mut().enumerate() {
|
||||
if ord != other_candidate_ord {
|
||||
// `candidate_ord` is already at the
|
||||
// right position.
|
||||
//
|
||||
// Calling `skip_next` would advance this docset
|
||||
// and miss it.
|
||||
match docset.skip_next(candidate) {
|
||||
SkipResult::Reached => {}
|
||||
SkipResult::OverStep => {
|
||||
// this is not in the intersection,
|
||||
// let's update our candidate.
|
||||
candidate = docset.doc();
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
other_candidate_ord = ord;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = left.doc();
|
||||
other_candidate_ord = usize::max_value();
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
if ord == other_candidate_ord {
|
||||
continue;
|
||||
}
|
||||
// `candidate_ord` is already at the
|
||||
// right position.
|
||||
//
|
||||
// Calling `skip_next` would advance this docset
|
||||
// and miss it.
|
||||
match docset.skip_next(candidate) {
|
||||
SkipResult::Reached => {}
|
||||
SkipResult::OverStep => {
|
||||
// this is not in the intersection,
|
||||
// let's update our candidate.
|
||||
candidate = docset.doc();
|
||||
match left.skip_next(candidate) {
|
||||
SkipResult::Reached => {
|
||||
other_candidate_ord = ord;
|
||||
}
|
||||
SkipResult::OverStep => {
|
||||
candidate = left.doc();
|
||||
other_candidate_ord = usize::max_value();
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
continue 'outer;
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
continue 'outer;
|
||||
}
|
||||
SkipResult::End => {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ mod bm25;
|
||||
mod boolean_query;
|
||||
mod empty_query;
|
||||
mod exclude;
|
||||
mod explanation;
|
||||
mod fuzzy_query;
|
||||
mod intersection;
|
||||
mod occur;
|
||||
@@ -39,6 +40,7 @@ pub use self::bitset::BitSetDocSet;
|
||||
pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
|
||||
pub use self::exclude::Exclude;
|
||||
pub use self::explanation::Explanation;
|
||||
pub use self::fuzzy_query::FuzzyTermQuery;
|
||||
pub use self::intersection::intersect_scorers;
|
||||
pub use self::occur::Occur;
|
||||
|
||||
@@ -31,7 +31,6 @@ mod tests {
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
@@ -46,8 +45,7 @@ mod tests {
|
||||
]);
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let test_query = |texts: Vec<&str>| {
|
||||
let terms: Vec<Term> = texts
|
||||
.iter()
|
||||
@@ -90,8 +88,7 @@ mod tests {
|
||||
index_writer.add_document(doc!(text_field=>"a b c"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let phrase_query = PhraseQuery::new(vec![
|
||||
Term::from_field_text(text_field, "a"),
|
||||
Term::from_field_text(text_field, "b"),
|
||||
@@ -115,8 +112,7 @@ mod tests {
|
||||
let index = create_index(&["a b c", "a b c a b"]);
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let test_query = |texts: Vec<&str>| {
|
||||
let terms: Vec<Term> = texts
|
||||
.iter()
|
||||
@@ -148,8 +144,7 @@ mod tests {
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let test_query = |texts: Vec<&str>| {
|
||||
let terms: Vec<Term> = texts
|
||||
.iter()
|
||||
@@ -177,8 +172,7 @@ mod tests {
|
||||
index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let test_query = |texts: Vec<(usize, &str)>| {
|
||||
let terms: Vec<(usize, Term)> = texts
|
||||
.iter()
|
||||
|
||||
@@ -4,6 +4,7 @@ use error::TantivyError;
|
||||
use query::bm25::BM25Weight;
|
||||
use query::Query;
|
||||
use query::Weight;
|
||||
use schema::IndexRecordOption;
|
||||
use schema::{Field, Term};
|
||||
use std::collections::BTreeSet;
|
||||
use Result;
|
||||
@@ -83,7 +84,7 @@ impl Query for PhraseQuery {
|
||||
let has_positions = field_entry
|
||||
.field_type()
|
||||
.get_index_record_option()
|
||||
.map(|index_record_option| index_record_option.has_positions())
|
||||
.map(IndexRecordOption::has_positions)
|
||||
.unwrap_or(false);
|
||||
if !has_positions {
|
||||
let field_name = field_entry.name();
|
||||
@@ -92,21 +93,12 @@ impl Query for PhraseQuery {
|
||||
field_name
|
||||
)));
|
||||
}
|
||||
if scoring_enabled {
|
||||
let terms = self.phrase_terms();
|
||||
let bm25_weight = BM25Weight::for_terms(searcher, &terms);
|
||||
Ok(Box::new(PhraseWeight::new(
|
||||
self.phrase_terms.clone(),
|
||||
bm25_weight,
|
||||
true,
|
||||
)))
|
||||
} else {
|
||||
Ok(Box::new(PhraseWeight::new(
|
||||
self.phrase_terms.clone(),
|
||||
BM25Weight::null(),
|
||||
false,
|
||||
)))
|
||||
}
|
||||
let terms = self.phrase_terms();
|
||||
let bm25_weight = BM25Weight::for_terms(searcher, &terms);
|
||||
|
||||
let phrase_weight: PhraseWeight =
|
||||
PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
|
||||
Ok(Box::new(phrase_weight))
|
||||
}
|
||||
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
|
||||
|
||||
@@ -43,7 +43,7 @@ impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> {
|
||||
|
||||
pub struct PhraseScorer<TPostings: Postings> {
|
||||
intersection_docset: Intersection<PostingsWithOffset<TPostings>, PostingsWithOffset<TPostings>>,
|
||||
num_docsets: usize,
|
||||
num_terms: usize,
|
||||
left: Vec<u32>,
|
||||
right: Vec<u32>,
|
||||
phrase_count: u32,
|
||||
@@ -138,7 +138,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
.collect::<Vec<_>>();
|
||||
PhraseScorer {
|
||||
intersection_docset: Intersection::new(postings_with_offsets),
|
||||
num_docsets,
|
||||
num_terms: num_docsets,
|
||||
left: Vec::with_capacity(100),
|
||||
right: Vec::with_capacity(100),
|
||||
phrase_count: 0u32,
|
||||
@@ -148,9 +148,13 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn phrase_count(&self) -> u32 {
|
||||
self.phrase_count
|
||||
}
|
||||
|
||||
fn phrase_match(&mut self) -> bool {
|
||||
if self.score_needed {
|
||||
let count = self.phrase_count();
|
||||
let count = self.compute_phrase_count();
|
||||
self.phrase_count = count;
|
||||
count > 0u32
|
||||
} else {
|
||||
@@ -165,7 +169,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
.positions(&mut self.left);
|
||||
}
|
||||
let mut intersection_len = self.left.len();
|
||||
for i in 1..self.num_docsets - 1 {
|
||||
for i in 1..self.num_terms - 1 {
|
||||
{
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(i)
|
||||
@@ -178,19 +182,19 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
}
|
||||
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(self.num_docsets - 1)
|
||||
.docset_mut_specialized(self.num_terms - 1)
|
||||
.positions(&mut self.right);
|
||||
intersection_exists(&self.left[..intersection_len], &self.right[..])
|
||||
}
|
||||
|
||||
fn phrase_count(&mut self) -> u32 {
|
||||
fn compute_phrase_count(&mut self) -> u32 {
|
||||
{
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(0)
|
||||
.positions(&mut self.left);
|
||||
}
|
||||
let mut intersection_len = self.left.len();
|
||||
for i in 1..self.num_docsets - 1 {
|
||||
for i in 1..self.num_terms - 1 {
|
||||
{
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(i)
|
||||
@@ -203,7 +207,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
}
|
||||
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(self.num_docsets - 1)
|
||||
.docset_mut_specialized(self.num_terms - 1)
|
||||
.positions(&mut self.right);
|
||||
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
|
||||
}
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
use super::PhraseScorer;
|
||||
use core::SegmentReader;
|
||||
use fieldnorm::FieldNormReader;
|
||||
use postings::SegmentPostings;
|
||||
use query::bm25::BM25Weight;
|
||||
use query::EmptyScorer;
|
||||
use query::explanation::does_not_match;
|
||||
use query::Scorer;
|
||||
use query::Weight;
|
||||
use query::{EmptyScorer, Explanation};
|
||||
use schema::IndexRecordOption;
|
||||
use schema::Term;
|
||||
use Result;
|
||||
use {DocId, DocSet};
|
||||
use {Result, SkipResult};
|
||||
|
||||
pub struct PhraseWeight {
|
||||
phrase_terms: Vec<(usize, Term)>,
|
||||
@@ -27,13 +31,18 @@ impl PhraseWeight {
|
||||
score_needed,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for PhraseWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
let similarity_weight = self.similarity_weight.clone();
|
||||
fn fieldnorm_reader(&self, reader: &SegmentReader) -> FieldNormReader {
|
||||
let field = self.phrase_terms[0].1.field();
|
||||
let fieldnorm_reader = reader.get_fieldnorms_reader(field);
|
||||
reader.get_fieldnorms_reader(field)
|
||||
}
|
||||
|
||||
fn phrase_scorer(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Option<PhraseScorer<SegmentPostings>>> {
|
||||
let similarity_weight = self.similarity_weight.clone();
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader);
|
||||
if reader.has_deletes() {
|
||||
let mut term_postings_list = Vec::new();
|
||||
for &(offset, ref term) in &self.phrase_terms {
|
||||
@@ -43,10 +52,10 @@ impl Weight for PhraseWeight {
|
||||
{
|
||||
term_postings_list.push((offset, postings));
|
||||
} else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
Ok(Box::new(PhraseScorer::new(
|
||||
Ok(Some(PhraseScorer::new(
|
||||
term_postings_list,
|
||||
similarity_weight,
|
||||
fieldnorm_reader,
|
||||
@@ -61,10 +70,10 @@ impl Weight for PhraseWeight {
|
||||
{
|
||||
term_postings_list.push((offset, postings));
|
||||
} else {
|
||||
return Ok(Box::new(EmptyScorer));
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
Ok(Box::new(PhraseScorer::new(
|
||||
Ok(Some(PhraseScorer::new(
|
||||
term_postings_list,
|
||||
similarity_weight,
|
||||
fieldnorm_reader,
|
||||
@@ -73,3 +82,30 @@ impl Weight for PhraseWeight {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Weight for PhraseWeight {
|
||||
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
|
||||
if let Some(scorer) = self.phrase_scorer(reader)? {
|
||||
Ok(Box::new(scorer))
|
||||
} else {
|
||||
Ok(Box::new(EmptyScorer))
|
||||
}
|
||||
}
|
||||
|
||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> Result<Explanation> {
|
||||
let scorer_opt = self.phrase_scorer(reader)?;
|
||||
if scorer_opt.is_none() {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let mut scorer = scorer_opt.unwrap();
|
||||
if scorer.skip_next(doc) != SkipResult::Reached {
|
||||
return Err(does_not_match(doc));
|
||||
}
|
||||
let fieldnorm_reader = self.fieldnorm_reader(reader);
|
||||
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
|
||||
let phrase_count = scorer.phrase_count();
|
||||
let mut explanation = Explanation::new("Phrase Scorer", scorer.score());
|
||||
explanation.add_detail(self.similarity_weight.explain(fieldnorm_id, phrase_count));
|
||||
Ok(explanation)
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user